1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 /* Dynamic scheduling initialization and dispatch.
17  *
 * NOTE: __kmp_nth is constant inside any dispatch loop; however, it may
 *       change between parallel regions.  __kmp_max_nth is the largest
 *       value __kmp_nth may take; 1 is the smallest.
21  */
22 
23 // Need to raise Win version from XP to Vista here for support of
24 // InterlockedExchange64
25 #if defined(_WIN32_WINNT) && defined(_M_IX86)
26 #undef _WIN32_WINNT
27 #define _WIN32_WINNT 0x0502
28 #endif
29 
30 #include "kmp.h"
31 #include "kmp_error.h"
32 #include "kmp_i18n.h"
33 #include "kmp_itt.h"
34 #include "kmp_stats.h"
35 #include "kmp_str.h"
36 #if KMP_OS_WINDOWS && KMP_ARCH_X86
37 #include <float.h>
38 #endif
39 
40 #if OMPT_SUPPORT
41 #include "ompt-internal.h"
42 #include "ompt-specific.h"
43 #endif
44 
45 /* ------------------------------------------------------------------------ */
46 
47 #if KMP_STATIC_STEAL_ENABLED
48 
49 // replaces dispatch_private_info{32,64} structures and
50 // dispatch_private_info{32,64}_t types
51 template <typename T> struct dispatch_private_infoXX_template {
52   typedef typename traits_t<T>::unsigned_t UT;
53   typedef typename traits_t<T>::signed_t ST;
54   UT count; // unsigned
55   T ub;
56   /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
57   T lb;
58   ST st; // signed
59   UT tc; // unsigned
60   T static_steal_counter; // for static_steal only; maybe better to put after ub
61 
62   /* parm[1-4] are used in different ways by different scheduling algorithms */
63 
64   // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
65   //    a) parm3 is properly aligned and
66   //    b) all parm1-4 are in the same cache line.
  // Because parm1-4 are used together, performance seems to be better
  // if they are in the same cache line (not measured though).
69 
70   struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
71     T parm1;
72     T parm2;
73     T parm3;
74     T parm4;
75   };
76 
77   UT ordered_lower; // unsigned
78   UT ordered_upper; // unsigned
79 #if KMP_OS_WINDOWS
80   T last_upper;
81 #endif /* KMP_OS_WINDOWS */
82 };
83 
84 #else /* KMP_STATIC_STEAL_ENABLED */
85 
86 // replaces dispatch_private_info{32,64} structures and
87 // dispatch_private_info{32,64}_t types
88 template <typename T> struct dispatch_private_infoXX_template {
89   typedef typename traits_t<T>::unsigned_t UT;
90   typedef typename traits_t<T>::signed_t ST;
91   T lb;
92   T ub;
93   ST st; // signed
94   UT tc; // unsigned
95 
96   T parm1;
97   T parm2;
98   T parm3;
99   T parm4;
100 
101   UT count; // unsigned
102 
103   UT ordered_lower; // unsigned
104   UT ordered_upper; // unsigned
105 #if KMP_OS_WINDOWS
106   T last_upper;
107 #endif /* KMP_OS_WINDOWS */
108 };
109 
110 #endif /* KMP_STATIC_STEAL_ENABLED */
111 
112 // replaces dispatch_private_info structure and dispatch_private_info_t type
113 template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
  // duplicate the alignment here, otherwise the structure size is not correct
  // in our compiler
116   union KMP_ALIGN_CACHE private_info_tmpl {
117     dispatch_private_infoXX_template<T> p;
118     dispatch_private_info64_t p64;
119   } u;
120   enum sched_type schedule; /* scheduling algorithm */
121   kmp_uint32 ordered; /* ordered clause specified */
122   kmp_uint32 ordered_bumped;
123   // To retain the structure size after making ordered_iteration scalar
124   kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
125   dispatch_private_info *next; /* stack of buffers for nest of serial regions */
126   kmp_uint32 nomerge; /* don't merge iters if serialized */
127   kmp_uint32 type_size;
128   enum cons_type pushed_ws;
129 };
130 
131 // replaces dispatch_shared_info{32,64} structures and
132 // dispatch_shared_info{32,64}_t types
133 template <typename UT> struct dispatch_shared_infoXX_template {
134   /* chunk index under dynamic, number of idle threads under static-steal;
135      iteration index otherwise */
136   volatile UT iteration;
137   volatile UT num_done;
138   volatile UT ordered_iteration;
  // to retain the structure size after making ordered_iteration scalar
140   UT ordered_dummy[KMP_MAX_ORDERED - 3];
141 };
142 
143 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
144 template <typename UT> struct dispatch_shared_info_template {
  // we need a union here to keep the structure size
146   union shared_info_tmpl {
147     dispatch_shared_infoXX_template<UT> s;
148     dispatch_shared_info64_t s64;
149   } u;
150   volatile kmp_uint32 buffer_index;
151 #if OMP_45_ENABLED
152   volatile kmp_int32 doacross_buf_idx; // teamwise index
153   kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
154   kmp_int32 doacross_num_done; // count finished threads
155 #endif
156 #if KMP_USE_HWLOC
  // When linking with libhwloc, the ORDERED EPCC test slows down on big
158   // machines (> 48 cores). Performance analysis showed that a cache thrash
159   // was occurring and this padding helps alleviate the problem.
160   char padding[64];
161 #endif
162 };
163 
164 /* ------------------------------------------------------------------------ */
165 
166 #undef USE_TEST_LOCKS
167 
168 // test_then_add template (general template should NOT be used)
169 template <typename T> static __forceinline T test_then_add(volatile T *p, T d);
170 
171 template <>
172 __forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
173                                                  kmp_int32 d) {
174   kmp_int32 r;
175   r = KMP_TEST_THEN_ADD32(p, d);
176   return r;
177 }
178 
179 template <>
180 __forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
181                                                  kmp_int64 d) {
182   kmp_int64 r;
183   r = KMP_TEST_THEN_ADD64(p, d);
184   return r;
185 }
186 
187 // test_then_inc_acq template (general template should NOT be used)
188 template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);
189 
190 template <>
191 __forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
192   kmp_int32 r;
193   r = KMP_TEST_THEN_INC_ACQ32(p);
194   return r;
195 }
196 
197 template <>
198 __forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
199   kmp_int64 r;
200   r = KMP_TEST_THEN_INC_ACQ64(p);
201   return r;
202 }
203 
204 // test_then_inc template (general template should NOT be used)
205 template <typename T> static __forceinline T test_then_inc(volatile T *p);
206 
207 template <>
208 __forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
209   kmp_int32 r;
210   r = KMP_TEST_THEN_INC32(p);
211   return r;
212 }
213 
214 template <>
215 __forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
216   kmp_int64 r;
217   r = KMP_TEST_THEN_INC64(p);
218   return r;
219 }
220 
221 // compare_and_swap template (general template should NOT be used)
222 template <typename T>
223 static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);
224 
225 template <>
226 __forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
227                                                     kmp_int32 c, kmp_int32 s) {
228   return KMP_COMPARE_AND_STORE_REL32(p, c, s);
229 }
230 
231 template <>
232 __forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
233                                                     kmp_int64 c, kmp_int64 s) {
234   return KMP_COMPARE_AND_STORE_REL64(p, c, s);
235 }
236 
237 /* Spin wait loop that first does pause, then yield.
238     Waits until function returns non-zero when called with *spinner and check.
239     Does NOT put threads to sleep.
240 #if USE_ITT_BUILD
241     Arguments:
        obj -- the higher-level synchronization object to report to ittnotify.
        It is used to report locks consistently. For example, if a lock is
        acquired immediately, its address is reported to ittnotify via
        KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
        immediately and the lock routine calls KMP_WAIT_YIELD(), the latter
        should report the same address, not the address of the low-level
        spinner.
248 #endif // USE_ITT_BUILD
249 */
250 template <typename UT>
251 // ToDo: make inline function (move to header file for icl)
252 static UT // unsigned 4- or 8-byte type
253     __kmp_wait_yield(
254         volatile UT *spinner, UT checker,
255         kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(
256             void *obj) // Higher-level synchronization object, or NULL.
257         ) {
258   // note: we may not belong to a team at this point
259   register volatile UT *spin = spinner;
260   register UT check = checker;
261   register kmp_uint32 spins;
262   register kmp_uint32 (*f)(UT, UT) = pred;
263   register UT r;
264 
265   KMP_FSYNC_SPIN_INIT(obj, (void *)spin);
266   KMP_INIT_YIELD(spins);
267   // main wait spin loop
268   while (!f(r = *spin, check)) {
269     KMP_FSYNC_SPIN_PREPARE(obj);
270     /* GEH - remove this since it was accidentally introduced when kmp_wait was
271        split. It causes problems with infinite recursion because of exit lock */
272     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
273         __kmp_abort_thread(); */
274 
    // If we are oversubscribed, or have waited a bit (and
    // KMP_LIBRARY=throughput), then yield. Pause is in the following code.
277     KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
278     KMP_YIELD_SPIN(spins);
279   }
280   KMP_FSYNC_SPIN_ACQUIRED(obj);
281   return r;
282 }
283 
284 template <typename UT> static kmp_uint32 __kmp_eq(UT value, UT checker) {
285   return value == checker;
286 }
287 
288 template <typename UT> static kmp_uint32 __kmp_neq(UT value, UT checker) {
289   return value != checker;
290 }
291 
292 template <typename UT> static kmp_uint32 __kmp_lt(UT value, UT checker) {
293   return value < checker;
294 }
295 
296 template <typename UT> static kmp_uint32 __kmp_ge(UT value, UT checker) {
297   return value >= checker;
298 }
299 
300 template <typename UT> static kmp_uint32 __kmp_le(UT value, UT checker) {
301   return value <= checker;
302 }
303 
304 /* ------------------------------------------------------------------------ */
305 
306 static void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref,
307                                      ident_t *loc_ref) {
308   kmp_info_t *th;
309 
310   KMP_DEBUG_ASSERT(gtid_ref);
311 
312   if (__kmp_env_consistency_check) {
313     th = __kmp_threads[*gtid_ref];
314     if (th->th.th_root->r.r_active &&
315         (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
316 #if KMP_USE_DYNAMIC_LOCK
317       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
318 #else
319       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
320 #endif
321     }
322   }
323 }
324 
325 template <typename UT>
326 static void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
327   typedef typename traits_t<UT>::signed_t ST;
328   dispatch_private_info_template<UT> *pr;
329 
330   int gtid = *gtid_ref;
331   //    int  cid = *cid_ref;
332   kmp_info_t *th = __kmp_threads[gtid];
333   KMP_DEBUG_ASSERT(th->th.th_dispatch);
334 
335   KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
336   if (__kmp_env_consistency_check) {
337     pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
338         th->th.th_dispatch->th_dispatch_pr_current);
339     if (pr->pushed_ws != ct_none) {
340 #if KMP_USE_DYNAMIC_LOCK
341       __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
342 #else
343       __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
344 #endif
345     }
346   }
347 
348   if (!th->th.th_team->t.t_serialized) {
349     dispatch_shared_info_template<UT> *sh =
350         reinterpret_cast<dispatch_shared_info_template<UT> *>(
351             th->th.th_dispatch->th_dispatch_sh_current);
352     UT lower;
353 
354     if (!__kmp_env_consistency_check) {
355       pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
356           th->th.th_dispatch->th_dispatch_pr_current);
357     }
358     lower = pr->u.p.ordered_lower;
359 
360 #if !defined(KMP_GOMP_COMPAT)
361     if (__kmp_env_consistency_check) {
362       if (pr->ordered_bumped) {
363         struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
364         __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
365                                ct_ordered_in_pdo, loc_ref,
366                                &p->stack_data[p->w_top]);
367       }
368     }
369 #endif /* !defined(KMP_GOMP_COMPAT) */
370 
371     KMP_MB();
372 #ifdef KMP_DEBUG
373     {
374       const char *buff;
375       // create format specifiers before the debug output
376       buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
377                               "ordered_iter:%%%s lower:%%%s\n",
378                               traits_t<UT>::spec, traits_t<UT>::spec);
379       KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
380       __kmp_str_free(&buff);
381     }
382 #endif
383 
384     __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
385                          __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
386     KMP_MB(); /* is this necessary? */
387 #ifdef KMP_DEBUG
388     {
389       const char *buff;
390       // create format specifiers before the debug output
391       buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
392                               "ordered_iter:%%%s lower:%%%s\n",
393                               traits_t<UT>::spec, traits_t<UT>::spec);
394       KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
395       __kmp_str_free(&buff);
396     }
397 #endif
398   }
399   KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
400 }
401 
402 static void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref,
403                                      ident_t *loc_ref) {
404   kmp_info_t *th;
405 
406   if (__kmp_env_consistency_check) {
407     th = __kmp_threads[*gtid_ref];
408     if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
409       __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
410     }
411   }
412 }
413 
414 template <typename UT>
415 static void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
416   typedef typename traits_t<UT>::signed_t ST;
417   dispatch_private_info_template<UT> *pr;
418 
419   int gtid = *gtid_ref;
420   //    int  cid = *cid_ref;
421   kmp_info_t *th = __kmp_threads[gtid];
422   KMP_DEBUG_ASSERT(th->th.th_dispatch);
423 
424   KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
425   if (__kmp_env_consistency_check) {
426     pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
427         th->th.th_dispatch->th_dispatch_pr_current);
428     if (pr->pushed_ws != ct_none) {
429       __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
430     }
431   }
432 
433   if (!th->th.th_team->t.t_serialized) {
434     dispatch_shared_info_template<UT> *sh =
435         reinterpret_cast<dispatch_shared_info_template<UT> *>(
436             th->th.th_dispatch->th_dispatch_sh_current);
437 
438     if (!__kmp_env_consistency_check) {
439       pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
440           th->th.th_dispatch->th_dispatch_pr_current);
441     }
442 
443     KMP_FSYNC_RELEASING(&sh->u.s.ordered_iteration);
444 #if !defined(KMP_GOMP_COMPAT)
445     if (__kmp_env_consistency_check) {
446       if (pr->ordered_bumped != 0) {
447         struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
448         /* How to test it? - OM */
449         __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
450                                ct_ordered_in_pdo, loc_ref,
451                                &p->stack_data[p->w_top]);
452       }
453     }
454 #endif /* !defined(KMP_GOMP_COMPAT) */
455 
456     KMP_MB(); /* Flush all pending memory write invalidates.  */
457 
458     pr->ordered_bumped += 1;
459 
460     KD_TRACE(1000,
461              ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
462               gtid, pr->ordered_bumped));
463 
464     KMP_MB(); /* Flush all pending memory write invalidates.  */
465 
466     /* TODO use general release procedure? */
467     test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
468 
469     KMP_MB(); /* Flush all pending memory write invalidates.  */
470   }
471   KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
472 }
473 
// Computes and returns x to the power of y, where y must be a non-negative
// integer
475 template <typename UT>
476 static __forceinline long double __kmp_pow(long double x, UT y) {
477   long double s = 1.0L;
478 
479   KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
480   // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
481   while (y) {
482     if (y & 1)
483       s *= x;
484     x *= x;
485     y >>= 1;
486   }
487   return s;
488 }
489 
490 /* Computes and returns the number of unassigned iterations after idx chunks
491    have been assigned (the total number of unassigned iterations in chunks with
492    index greater than or equal to idx). __forceinline seems to be broken so that
493    if we __forceinline this function, the behavior is wrong
494    (one of the unit tests, sch_guided_analytical_basic.cpp, fails) */
495 template <typename T>
496 static __inline typename traits_t<T>::unsigned_t
497 __kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
498                                 typename traits_t<T>::unsigned_t idx) {
499   /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at least for
500      ICL 8.1, long double arithmetic may not really have long double precision,
501      even with /Qlong_double.  Currently, we workaround that in the caller code,
502      by manipulating the FPCW for Windows* OS on IA-32 architecture.  The lack
503      of precision is not expected to be a correctness issue, though. */
504   typedef typename traits_t<T>::unsigned_t UT;
505 
506   long double x = tc * __kmp_pow<UT>(base, idx);
507   UT r = (UT)x;
508   if (x == r)
509     return r;
510   return r + 1;
511 }
512 
513 // Parameters of the guided-iterative algorithm:
514 //   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
515 //   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// By default n = 2. For example, with n = 3 the chunk distribution will be
// flatter.
// With n = 1 the first chunk is the same as for a static schedule, i.e.
// trip / nproc.
519 static int guided_int_param = 2;
520 static double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
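
// Illustrative example (assumed values): with nproc = 4, chunk = 7 and the
// default n = 2, __kmp_dispatch_init below sets, for the guided-iterative
// schedule,
//   parm2 = 2 * 4 * (7 + 1) = 64    (switch to dynamic when fewer remain)
//   parm3 = 0.5 / 4         = 0.125 (remaining iterations multiplier)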
521 
522 // UT - unsigned flavor of T, ST - signed flavor of T,
523 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
524 template <typename T>
525 static void
526 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
527                     T ub, typename traits_t<T>::signed_t st,
528                     typename traits_t<T>::signed_t chunk, int push_ws) {
529   typedef typename traits_t<T>::unsigned_t UT;
530   typedef typename traits_t<T>::signed_t ST;
531   typedef typename traits_t<T>::floating_t DBL;
532 
533   int active;
534   T tc;
535   kmp_info_t *th;
536   kmp_team_t *team;
537   kmp_uint32 my_buffer_index;
538   dispatch_private_info_template<T> *pr;
539   dispatch_shared_info_template<UT> volatile *sh;
540 
541   KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
542                    sizeof(dispatch_private_info));
543   KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
544                    sizeof(dispatch_shared_info));
545 
546   if (!TCR_4(__kmp_init_parallel))
547     __kmp_parallel_initialize();
548 
549 #if INCLUDE_SSC_MARKS
550   SSC_MARK_DISPATCH_INIT();
551 #endif
552 #ifdef KMP_DEBUG
553   {
554     const char *buff;
555     // create format specifiers before the debug output
556     buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
557                             "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
558                             traits_t<ST>::spec, traits_t<T>::spec,
559                             traits_t<T>::spec, traits_t<ST>::spec);
560     KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
561     __kmp_str_free(&buff);
562   }
563 #endif
564   /* setup data */
565   th = __kmp_threads[gtid];
566   team = th->th.th_team;
567   active = !team->t.t_serialized;
568   th->th.th_ident = loc;
569 
570 #if USE_ITT_BUILD
571   kmp_uint64 cur_chunk = chunk;
572   int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
573                                     __kmp_forkjoin_frames_mode == 3 &&
574                                     KMP_MASTER_GTID(gtid) &&
575 #if OMP_40_ENABLED
576                                     th->th.th_teams_microtask == NULL &&
577 #endif
578                                     team->t.t_active_level == 1;
579 #endif
580   if (!active) {
581     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
582         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
583   } else {
584     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
585                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
586 
587     my_buffer_index = th->th.th_dispatch->th_disp_index++;
588 
    /* What happens when the number of threads changes? Do we need to resize
       the buffer? */
590     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
591         &th->th.th_dispatch
592              ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
593     sh = reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
594         &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
595   }
596 
597 #if (KMP_STATIC_STEAL_ENABLED)
598   if (SCHEDULE_HAS_NONMONOTONIC(schedule))
599     // AC: we now have only one implementation of stealing, so use it
600     schedule = kmp_sch_static_steal;
601   else
602 #endif
603     schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
604 
605   /* Pick up the nomerge/ordered bits from the scheduling type */
606   if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
607     pr->nomerge = TRUE;
608     schedule =
609         (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
610   } else {
611     pr->nomerge = FALSE;
612   }
613   pr->type_size = traits_t<T>::type_size; // remember the size of variables
614   if (kmp_ord_lower & schedule) {
615     pr->ordered = TRUE;
616     schedule =
617         (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
618   } else {
619     pr->ordered = FALSE;
620   }
621 
622   if (schedule == kmp_sch_static) {
623     schedule = __kmp_static;
624   } else {
625     if (schedule == kmp_sch_runtime) {
626       // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
627       // not specified)
628       schedule = team->t.t_sched.r_sched_type;
629       // Detail the schedule if needed (global controls are differentiated
630       // appropriately)
631       if (schedule == kmp_sch_guided_chunked) {
632         schedule = __kmp_guided;
633       } else if (schedule == kmp_sch_static) {
634         schedule = __kmp_static;
635       }
636       // Use the chunk size specified by OMP_SCHEDULE (or default if not
637       // specified)
638       chunk = team->t.t_sched.chunk;
639 #if USE_ITT_BUILD
640       cur_chunk = chunk;
641 #endif
642 #ifdef KMP_DEBUG
643       {
644         const char *buff;
645         // create format specifiers before the debug output
646         buff = __kmp_str_format(
647             "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
648             traits_t<ST>::spec);
649         KD_TRACE(10, (buff, gtid, schedule, chunk));
650         __kmp_str_free(&buff);
651       }
652 #endif
653     } else {
654       if (schedule == kmp_sch_guided_chunked) {
655         schedule = __kmp_guided;
656       }
657       if (chunk <= 0) {
658         chunk = KMP_DEFAULT_CHUNK;
659       }
660     }
661 
662     if (schedule == kmp_sch_auto) {
      // mapping and differentiation are done in __kmp_do_serial_initialize()
664       schedule = __kmp_auto;
665 #ifdef KMP_DEBUG
666       {
667         const char *buff;
668         // create format specifiers before the debug output
669         buff = __kmp_str_format("__kmp_dispatch_init: kmp_sch_auto: T#%%d new: "
670                                 "schedule:%%d chunk:%%%s\n",
671                                 traits_t<ST>::spec);
672         KD_TRACE(10, (buff, gtid, schedule, chunk));
673         __kmp_str_free(&buff);
674       }
675 #endif
676     }
677 
678     /* guided analytical not safe for too many threads */
679     if (schedule == kmp_sch_guided_analytical_chunked &&
680         th->th.th_team_nproc > 1 << 20) {
681       schedule = kmp_sch_guided_iterative_chunked;
682       KMP_WARNING(DispatchManyThreads);
683     }
684     if (schedule == kmp_sch_runtime_simd) {
685       // compiler provides simd_width in the chunk parameter
686       schedule = team->t.t_sched.r_sched_type;
687       // Detail the schedule if needed (global controls are differentiated
688       // appropriately)
689       if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
690           schedule == __kmp_static) {
691         schedule = kmp_sch_static_balanced_chunked;
692       } else {
693         if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
694           schedule = kmp_sch_guided_simd;
695         }
696         chunk = team->t.t_sched.chunk * chunk;
697       }
698 #if USE_ITT_BUILD
699       cur_chunk = chunk;
700 #endif
701 #ifdef KMP_DEBUG
702       {
703         const char *buff;
704         // create format specifiers before the debug output
705         buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d"
706                                 " chunk:%%%s\n",
707                                 traits_t<ST>::spec);
708         KD_TRACE(10, (buff, gtid, schedule, chunk));
709         __kmp_str_free(&buff);
710       }
711 #endif
712     }
713     pr->u.p.parm1 = chunk;
714   }
715   KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
716               "unknown scheduling type");
717 
718   pr->u.p.count = 0;
719 
720   if (__kmp_env_consistency_check) {
721     if (st == 0) {
722       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
723                             (pr->ordered ? ct_pdo_ordered : ct_pdo), loc);
724     }
725   }
726   // compute trip count
727   if (st == 1) { // most common case
728     if (ub >= lb) {
729       tc = ub - lb + 1;
730     } else { // ub < lb
731       tc = 0; // zero-trip
732     }
733   } else if (st < 0) {
734     if (lb >= ub) {
735       // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
736       // where the division needs to be unsigned regardless of the result type
737       tc = (UT)(lb - ub) / (-st) + 1;
738     } else { // lb < ub
739       tc = 0; // zero-trip
740     }
741   } else { // st > 0
742     if (ub >= lb) {
743       // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
744       // where the division needs to be unsigned regardless of the result type
745       tc = (UT)(ub - lb) / st + 1;
746     } else { // ub < lb
747       tc = 0; // zero-trip
748     }
749   }
750 
751   // Any half-decent optimizer will remove this test when the blocks are empty
752   // since the macros expand to nothing when statistics are disabled.
753   if (schedule == __kmp_static) {
754     KMP_COUNT_BLOCK(OMP_FOR_static);
755     KMP_COUNT_VALUE(FOR_static_iterations, tc);
756   } else {
757     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
758     KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
759   }
760 
761   pr->u.p.lb = lb;
762   pr->u.p.ub = ub;
763   pr->u.p.st = st;
764   pr->u.p.tc = tc;
765 
766 #if KMP_OS_WINDOWS
767   pr->u.p.last_upper = ub + st;
768 #endif /* KMP_OS_WINDOWS */
769 
  /* NOTE: only active parallel regions have active ordered sections */
771 
772   if (active) {
773     if (pr->ordered == 0) {
774       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
775       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
776     } else {
777       pr->ordered_bumped = 0;
778 
779       pr->u.p.ordered_lower = 1;
780       pr->u.p.ordered_upper = 0;
781 
782       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
783       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
784     }
785   }
786 
787   if (__kmp_env_consistency_check) {
788     enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
789     if (push_ws) {
790       __kmp_push_workshare(gtid, ws, loc);
791       pr->pushed_ws = ws;
792     } else {
793       __kmp_check_workshare(gtid, ws, loc);
794       pr->pushed_ws = ct_none;
795     }
796   }
797 
798   switch (schedule) {
799 #if (KMP_STATIC_STEAL_ENABLED)
800   case kmp_sch_static_steal: {
801     T nproc = th->th.th_team_nproc;
802     T ntc, init;
803 
804     KD_TRACE(100,
805              ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid));
806 
807     ntc = (tc % chunk ? 1 : 0) + tc / chunk;
808     if (nproc > 1 && ntc >= nproc) {
809       KMP_COUNT_BLOCK(OMP_FOR_static_steal);
810       T id = __kmp_tid_from_gtid(gtid);
811       T small_chunk, extras;
812 
813       small_chunk = ntc / nproc;
814       extras = ntc % nproc;
815 
816       init = id * small_chunk + (id < extras ? id : extras);
817       pr->u.p.count = init;
818       pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
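
      // Illustrative example of the initial chunk partition (assumed values):
      // with tc = 37 and chunk = 4, ntc = 10 chunks; for nproc = 4 this gives
      // small_chunk = 2 and extras = 2, so the per-thread [count, ub) ranges
      // of chunk indices are [0,3), [3,6), [6,8) and [8,10).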
819 
820       pr->u.p.parm2 = lb;
821       // pr->pfields.parm3 = 0; // it's not used in static_steal
822       pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
823       pr->u.p.st = st;
824       if (traits_t<T>::type_size > 4) {
825         // AC: TODO: check if 16-byte CAS available and use it to
826         // improve performance (probably wait for explicit request
827         // before spending time on this).
828         // For now use dynamically allocated per-thread lock,
829         // free memory in __kmp_dispatch_next when status==0.
830         KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
831         th->th.th_dispatch->th_steal_lock =
832             (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
833         __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
834       }
835       break;
836     } else {
837       KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
838                      "kmp_sch_static_balanced\n",
839                      gtid));
840       schedule = kmp_sch_static_balanced;
841       /* too few iterations: fall-through to kmp_sch_static_balanced */
842     } // if
843     /* FALL-THROUGH to static balanced */
844   } // case
845 #endif
846   case kmp_sch_static_balanced: {
847     T nproc = th->th.th_team_nproc;
848     T init, limit;
849 
850     KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
851                    gtid));
852 
853     if (nproc > 1) {
854       T id = __kmp_tid_from_gtid(gtid);
855 
856       if (tc < nproc) {
857         if (id < tc) {
858           init = id;
859           limit = id;
860           pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
861         } else {
862           pr->u.p.count = 1; /* means no more chunks to execute */
863           pr->u.p.parm1 = FALSE;
864           break;
865         }
866       } else {
867         T small_chunk = tc / nproc;
868         T extras = tc % nproc;
869         init = id * small_chunk + (id < extras ? id : extras);
870         limit = init + small_chunk - (id < extras ? 0 : 1);
871         pr->u.p.parm1 = (id == nproc - 1);
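
        // Illustrative example (assumed values): tc = 10, nproc = 4 gives
        // small_chunk = 2 and extras = 2, so the threads get iteration ranges
        // [0,2], [3,5], [6,7], [8,9]; only id == 3 sets parm1 (the *plastiter
        // flag).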
872       }
873     } else {
874       if (tc > 0) {
875         init = 0;
876         limit = tc - 1;
877         pr->u.p.parm1 = TRUE;
878       } else { // zero trip count
879         pr->u.p.count = 1; /* means no more chunks to execute */
880         pr->u.p.parm1 = FALSE;
881         break;
882       }
883     }
884 #if USE_ITT_BUILD
885     // Calculate chunk for metadata report
886     if (itt_need_metadata_reporting)
887       cur_chunk = limit - init + 1;
888 #endif
889     if (st == 1) {
890       pr->u.p.lb = lb + init;
891       pr->u.p.ub = lb + limit;
892     } else {
893       // calculated upper bound, "ub" is user-defined upper bound
894       T ub_tmp = lb + limit * st;
895       pr->u.p.lb = lb + init * st;
896       // adjust upper bound to "ub" if needed, so that MS lastprivate will match
897       // it exactly
898       if (st > 0) {
899         pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
900       } else {
901         pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
902       }
903     }
904     if (pr->ordered) {
905       pr->u.p.ordered_lower = init;
906       pr->u.p.ordered_upper = limit;
907     }
908     break;
909   } // case
910   case kmp_sch_static_balanced_chunked: {
911     // similar to balanced, but chunk adjusted to multiple of simd width
912     T nth = th->th.th_team_nproc;
913     KD_TRACE(100, ("__kmp_dispatch_init: T#%d runtime(simd:static)"
914                    " -> falling-through to static_greedy\n",
915                    gtid));
916     schedule = kmp_sch_static_greedy;
917     if (nth > 1)
918       pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
919     else
920       pr->u.p.parm1 = tc;
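    // Illustrative example (assumed values): tc = 100, nth = 4, chunk = 8
    // (the simd width) gives parm1 = ((100 + 3) / 4 + 7) & ~7 = 32, i.e. each
    // thread's block is rounded up to a multiple of the simd width.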
921     break;
922   } // case
923   case kmp_sch_guided_iterative_chunked:
924   case kmp_sch_guided_simd: {
925     T nproc = th->th.th_team_nproc;
926     KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked"
927                    " case\n",
928                    gtid));
929 
930     if (nproc > 1) {
931       if ((2L * chunk + 1) * nproc >= tc) {
932         /* chunk size too large, switch to dynamic */
933         schedule = kmp_sch_dynamic_chunked;
934       } else {
935         // when remaining iters become less than parm2 - switch to dynamic
936         pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
937         *(double *)&pr->u.p.parm3 =
938             guided_flt_param / nproc; // may occupy parm3 and parm4
939       }
940     } else {
941       KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
942                      "kmp_sch_static_greedy\n",
943                      gtid));
944       schedule = kmp_sch_static_greedy;
945       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
946       KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",
947                      gtid));
948       pr->u.p.parm1 = tc;
949     } // if
950   } // case
951   break;
952   case kmp_sch_guided_analytical_chunked: {
953     T nproc = th->th.th_team_nproc;
954     KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked"
955                    " case\n",
956                    gtid));
957     if (nproc > 1) {
958       if ((2L * chunk + 1) * nproc >= tc) {
959         /* chunk size too large, switch to dynamic */
960         schedule = kmp_sch_dynamic_chunked;
961       } else {
962         /* commonly used term: (2 nproc - 1)/(2 nproc) */
963         DBL x;
964 
965 #if KMP_OS_WINDOWS && KMP_ARCH_X86
966         /* Linux* OS already has 64-bit computation by default for long double,
967            and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
968            Windows* OS on IA-32 architecture, we need to set precision to 64-bit
969            instead of the default 53-bit. Even though long double doesn't work
970            on Windows* OS on Intel(R) 64, the resulting lack of precision is not
971            expected to impact the correctness of the algorithm, but this has not
972            been mathematically proven. */
973         // save original FPCW and set precision to 64-bit, as
974         // Windows* OS on IA-32 architecture defaults to 53-bit
975         unsigned int oldFpcw = _control87(0, 0);
976         _control87(_PC_64, _MCW_PC); // 0,0x30000
977 #endif
978         /* value used for comparison in solver for cross-over point */
979         long double target = ((long double)chunk * 2 + 1) * nproc / tc;
980 
981         /* crossover point--chunk indexes equal to or greater than
982            this point switch to dynamic-style scheduling */
983         UT cross;
984 
985         /* commonly used term: (2 nproc - 1)/(2 nproc) */
986         x = (long double)1.0 - (long double)0.5 / nproc;
987 
988 #ifdef KMP_DEBUG
989         { // test natural alignment
990           struct _test_a {
991             char a;
992             union {
993               char b;
994               DBL d;
995             };
996           } t;
997           ptrdiff_t natural_alignment =
998               (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
999           //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
1000           // long)natural_alignment );
1001           KMP_DEBUG_ASSERT(
1002               (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
1003         }
1004 #endif // KMP_DEBUG
1005 
1006         /* save the term in thread private dispatch structure */
1007         *(DBL *)&pr->u.p.parm3 = x;
1008 
1009         /* solve for the crossover point to the nearest integer i for which C_i
1010            <= chunk */
1011         {
1012           UT left, right, mid;
1013           long double p;
1014 
1015           /* estimate initial upper and lower bound */
1016 
1017           /* doesn't matter what value right is as long as it is positive, but
1018              it affects performance of the solver */
1019           right = 229;
1020           p = __kmp_pow<UT>(x, right);
1021           if (p > target) {
1022             do {
1023               p *= p;
1024               right <<= 1;
1025             } while (p > target && right < (1 << 27));
1026             /* lower bound is previous (failed) estimate of upper bound */
1027             left = right >> 1;
1028           } else {
1029             left = 0;
1030           }
1031 
1032           /* bisection root-finding method */
1033           while (left + 1 < right) {
1034             mid = (left + right) / 2;
1035             if (__kmp_pow<UT>(x, mid) > target) {
1036               left = mid;
1037             } else {
1038               right = mid;
1039             }
1040           } // while
1041           cross = right;
1042         }
1043         /* assert sanity of computed crossover point */
1044         KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
1045                    __kmp_pow<UT>(x, cross) <= target);
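
        /* Illustrative example (assumed values): nproc = 4, chunk = 1,
           tc = 1000 gives x = 0.875 and target = 3 * 4 / 1000 = 0.012; the
           solver yields cross = 34, since 0.875^33 ~ 0.0122 > 0.012 and
           0.875^34 ~ 0.0107 <= 0.012. */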
1046 
1047         /* save the crossover point in thread private dispatch structure */
1048         pr->u.p.parm2 = cross;
1049 
1050 // C75803
1051 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
1052 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
1053 #else
1054 #define GUIDED_ANALYTICAL_WORKAROUND (x)
1055 #endif
1056         /* dynamic-style scheduling offset */
1057         pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
1058                                  tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
1059                         cross * chunk;
1060 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1061         // restore FPCW
1062         _control87(oldFpcw, _MCW_PC);
1063 #endif
1064       } // if
1065     } else {
1066       KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
1067                      "kmp_sch_static_greedy\n",
1068                      gtid));
1069       schedule = kmp_sch_static_greedy;
1070       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1071       pr->u.p.parm1 = tc;
1072     } // if
1073   } // case
1074   break;
1075   case kmp_sch_static_greedy:
1076     KD_TRACE(100,
1077              ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n", gtid));
1078     pr->u.p.parm1 = (th->th.th_team_nproc > 1)
1079                         ? (tc + th->th.th_team_nproc - 1) / th->th.th_team_nproc
1080                         : tc;
1081     break;
1082   case kmp_sch_static_chunked:
1083   case kmp_sch_dynamic_chunked:
1084     if (pr->u.p.parm1 <= 0) {
1085       pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
1086     }
1087     KD_TRACE(100, ("__kmp_dispatch_init: T#%d "
1088                    "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
1089                    gtid));
1090     break;
1091   case kmp_sch_trapezoidal: {
1092     /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1093 
1094     T parm1, parm2, parm3, parm4;
1095     KD_TRACE(100,
1096              ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid));
1097 
1098     parm1 = chunk;
1099 
1100     /* F : size of the first cycle */
1101     parm2 = (tc / (2 * th->th.th_team_nproc));
1102 
1103     if (parm2 < 1) {
1104       parm2 = 1;
1105     }
1106 
1107     /* L : size of the last cycle.  Make sure the last cycle is not larger
1108        than the first cycle. */
1109     if (parm1 < 1) {
1110       parm1 = 1;
1111     } else if (parm1 > parm2) {
1112       parm1 = parm2;
1113     }
1114 
1115     /* N : number of cycles */
1116     parm3 = (parm2 + parm1);
1117     parm3 = (2 * tc + parm3 - 1) / parm3;
1118 
1119     if (parm3 < 2) {
1120       parm3 = 2;
1121     }
1122 
1123     /* sigma : decreasing incr of the trapezoid */
1124     parm4 = (parm3 - 1);
1125     parm4 = (parm2 - parm1) / parm4;
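
    // Illustrative example (assumed values): tc = 1000, nproc = 4, chunk = 1
    // gives F = parm2 = 125, L = parm1 = 1, N = parm3 = 16 and
    // sigma = parm4 = 8, so chunk sizes decrease roughly as 125, 117, 109, ...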
1126 
1127     // pointless check, because parm4 >= 0 always
1128     // if ( parm4 < 0 ) {
1129     //    parm4 = 0;
1130     //}
1131 
1132     pr->u.p.parm1 = parm1;
1133     pr->u.p.parm2 = parm2;
1134     pr->u.p.parm3 = parm3;
1135     pr->u.p.parm4 = parm4;
1136   } // case
1137   break;
1138 
1139   default: {
1140     __kmp_msg(kmp_ms_fatal, // Severity
1141               KMP_MSG(UnknownSchedTypeDetected), // Primary message
1142               KMP_HNT(GetNewerLibrary), // Hint
1143               __kmp_msg_null // Variadic argument list terminator
1144               );
1145   } break;
1146   } // switch
1147   pr->schedule = schedule;
1148   if (active) {
    /* Wait until the shared buffer's index equals my_buffer_index, i.e. this
     * buffer is free to use */
1151 
1152     KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
1153                    "sh->buffer_index:%d\n",
1154                    gtid, my_buffer_index, sh->buffer_index));
1155     __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index,
1156                                  __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    // Note: KMP_WAIT_YIELD() cannot be used here: buffer index and
    // my_buffer_index are *always* 32-bit integers.
1159     KMP_MB(); /* is this necessary? */
1160     KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
1161                    "sh->buffer_index:%d\n",
1162                    gtid, my_buffer_index, sh->buffer_index));
1163 
1164     th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
1165     th->th.th_dispatch->th_dispatch_sh_current = (dispatch_shared_info_t *)sh;
1166 #if USE_ITT_BUILD
1167     if (pr->ordered) {
1168       __kmp_itt_ordered_init(gtid);
1169     }; // if
1170     // Report loop metadata
1171     if (itt_need_metadata_reporting) {
1172       // Only report metadata by master of active team at level 1
1173       kmp_uint64 schedtype = 0;
1174       switch (schedule) {
1175       case kmp_sch_static_chunked:
1176       case kmp_sch_static_balanced: // Chunk is calculated in the switch above
1177         break;
1178       case kmp_sch_static_greedy:
1179         cur_chunk = pr->u.p.parm1;
1180         break;
1181       case kmp_sch_dynamic_chunked:
1182         schedtype = 1;
1183         break;
1184       case kmp_sch_guided_iterative_chunked:
1185       case kmp_sch_guided_analytical_chunked:
1186       case kmp_sch_guided_simd:
1187         schedtype = 2;
1188         break;
1189       default:
1190         // Should we put this case under "static"?
1191         // case kmp_sch_static_steal:
1192         schedtype = 3;
1193         break;
1194       }
1195       __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1196     }
1197 #endif /* USE_ITT_BUILD */
1198   }; // if
1199 
1200 #ifdef KMP_DEBUG
1201   {
1202     const char *buff;
1203     // create format specifiers before the debug output
1204     buff = __kmp_str_format(
1205         "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
1206         "lb:%%%s ub:%%%s"
1207         " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
1208         " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1209         traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
1210         traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1211         traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
1212         traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
1213     KD_TRACE(10, (buff, gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1214                   pr->u.p.st, pr->u.p.tc, pr->u.p.count, pr->u.p.ordered_lower,
1215                   pr->u.p.ordered_upper, pr->u.p.parm1, pr->u.p.parm2,
1216                   pr->u.p.parm3, pr->u.p.parm4));
1217     __kmp_str_free(&buff);
1218   }
1219 #endif
1220 #if (KMP_STATIC_STEAL_ENABLED)
  // It cannot be guaranteed that after execution of a loop with some other
  // schedule kind all the parm3 variables will contain the same value. Even if
  // all parm3 values were the same, a bad case could still occur, e.g. using 0
  // and 1 rather than a program life-time increment. So a dedicated variable
  // is required: 'static_steal_counter' is used.
1226   if (schedule == kmp_sch_static_steal) {
1227     // Other threads will inspect this variable when searching for a victim.
    // This is a flag showing that other threads may steal from this thread
    // from now on.
1230     volatile T *p = &pr->u.p.static_steal_counter;
1231     *p = *p + 1;
1232   }
1233 #endif // ( KMP_STATIC_STEAL_ENABLED )
1234 
1235 #if OMPT_SUPPORT && OMPT_TRACE
1236   if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
1237     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1238     ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
1239     ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
1240         team_info->parallel_id, task_info->task_id, team_info->microtask);
1241   }
1242 #endif
1243 }
1244 
1245 /* For ordered loops, either __kmp_dispatch_finish() should be called after
1246  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1247  * every chunk of iterations.  If the ordered section(s) were not executed
1248  * for this iteration (or every iteration in this chunk), we need to set the
1249  * ordered iteration counters so that the next thread can proceed. */
1250 template <typename UT>
1251 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
1252   typedef typename traits_t<UT>::signed_t ST;
1253   kmp_info_t *th = __kmp_threads[gtid];
1254 
1255   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1256   if (!th->th.th_team->t.t_serialized) {
1257 
1258     dispatch_private_info_template<UT> *pr =
1259         reinterpret_cast<dispatch_private_info_template<UT> *>(
1260             th->th.th_dispatch->th_dispatch_pr_current);
1261     dispatch_shared_info_template<UT> volatile *sh =
1262         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1263             th->th.th_dispatch->th_dispatch_sh_current);
1264     KMP_DEBUG_ASSERT(pr);
1265     KMP_DEBUG_ASSERT(sh);
1266     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1267                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1268 
1269     if (pr->ordered_bumped) {
1270       KD_TRACE(
1271           1000,
1272           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1273            gtid));
1274       pr->ordered_bumped = 0;
1275     } else {
1276       UT lower = pr->u.p.ordered_lower;
1277 
1278 #ifdef KMP_DEBUG
1279       {
1280         const char *buff;
1281         // create format specifiers before the debug output
1282         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1283                                 "ordered_iteration:%%%s lower:%%%s\n",
1284                                 traits_t<UT>::spec, traits_t<UT>::spec);
1285         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1286         __kmp_str_free(&buff);
1287       }
1288 #endif
1289 
1290       __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1291                            __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1292       KMP_MB(); /* is this necessary? */
1293 #ifdef KMP_DEBUG
1294       {
1295         const char *buff;
1296         // create format specifiers before the debug output
1297         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1298                                 "ordered_iteration:%%%s lower:%%%s\n",
1299                                 traits_t<UT>::spec, traits_t<UT>::spec);
1300         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1301         __kmp_str_free(&buff);
1302       }
1303 #endif
1304 
1305       test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1306     } // if
1307   } // if
1308   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1309 }
1310 
1311 #ifdef KMP_GOMP_COMPAT
1312 
1313 template <typename UT>
1314 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1315   typedef typename traits_t<UT>::signed_t ST;
1316   kmp_info_t *th = __kmp_threads[gtid];
1317 
1318   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1319   if (!th->th.th_team->t.t_serialized) {
1320     //        int cid;
1321     dispatch_private_info_template<UT> *pr =
1322         reinterpret_cast<dispatch_private_info_template<UT> *>(
1323             th->th.th_dispatch->th_dispatch_pr_current);
1324     dispatch_shared_info_template<UT> volatile *sh =
1325         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1326             th->th.th_dispatch->th_dispatch_sh_current);
1327     KMP_DEBUG_ASSERT(pr);
1328     KMP_DEBUG_ASSERT(sh);
1329     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1330                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1331 
1332     //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1333     UT lower = pr->u.p.ordered_lower;
1334     UT upper = pr->u.p.ordered_upper;
1335     UT inc = upper - lower + 1;
1336 
1337     if (pr->ordered_bumped == inc) {
1338       KD_TRACE(
1339           1000,
1340           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1341            gtid));
1342       pr->ordered_bumped = 0;
1343     } else {
1344       inc -= pr->ordered_bumped;
1345 
1346 #ifdef KMP_DEBUG
1347       {
1348         const char *buff;
1349         // create format specifiers before the debug output
1350         buff = __kmp_str_format(
1351             "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1352             "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1353             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1354         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1355         __kmp_str_free(&buff);
1356       }
1357 #endif
1358 
1359       __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1360                            __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1361 
1362       KMP_MB(); /* is this necessary? */
1363       KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1364                       "ordered_bumped to zero\n",
1365                       gtid));
1366       pr->ordered_bumped = 0;
1367 //!!!!! TODO check if the inc should be unsigned, or signed???
1368 #ifdef KMP_DEBUG
1369       {
1370         const char *buff;
1371         // create format specifiers before the debug output
1372         buff = __kmp_str_format(
1373             "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1374             "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1375             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1376             traits_t<UT>::spec);
1377         KD_TRACE(1000,
1378                  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1379         __kmp_str_free(&buff);
1380       }
1381 #endif
1382 
1383       test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1384     }
1385     //        }
1386   }
1387   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1388 }
1389 
1390 #endif /* KMP_GOMP_COMPAT */
1391 
1392 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1393    work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1394    is not called. */
1395 #if OMPT_SUPPORT && OMPT_TRACE
1396 #define OMPT_LOOP_END                                                          \
1397   if (status == 0) {                                                           \
1398     if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_end)) {   \
1399       ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
1400       ompt_task_info_t *task_info = __ompt_get_taskinfo(0);                    \
1401       ompt_callbacks.ompt_callback(ompt_event_loop_end)(                       \
1402           team_info->parallel_id, task_info->task_id);                         \
1403     }                                                                          \
1404   }
1405 #else
1406 #define OMPT_LOOP_END // no-op
1407 #endif
1408 
1409 template <typename T>
1410 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1411                                T *p_lb, T *p_ub,
1412                                typename traits_t<T>::signed_t *p_st) {
1413 
1414   typedef typename traits_t<T>::unsigned_t UT;
1415   typedef typename traits_t<T>::signed_t ST;
1416   typedef typename traits_t<T>::floating_t DBL;
1417 
  // This is potentially slightly misleading: schedule(runtime) will appear here
  // even if the actual runtime schedule is static. (Which points out a
  // disadvantage of schedule(runtime): even when static scheduling is used it
  // costs more than a compile-time choice to use static scheduling would.)
1422   KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling);
1423 
1424   int status;
1425   dispatch_private_info_template<T> *pr;
1426   kmp_info_t *th = __kmp_threads[gtid];
1427   kmp_team_t *team = th->th.th_team;
1428 
1429   KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1430 #ifdef KMP_DEBUG
1431   {
1432     const char *buff;
1433     // create format specifiers before the debug output
1434     buff = __kmp_str_format("__kmp_dispatch_next: T#%%d called p_lb:%%%s "
1435                             "p_ub:%%%s p_st:%%%s p_last: %%p\n",
1436                             traits_t<T>::spec, traits_t<T>::spec,
1437                             traits_t<ST>::spec);
1438     KD_TRACE(1000, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last));
1439     __kmp_str_free(&buff);
1440   }
1441 #endif
1442 
1443   if (team->t.t_serialized) {
    /* NOTE: serialize this dispatch because we are not at the active level */
1445     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1446         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1447     KMP_DEBUG_ASSERT(pr);
1448 
1449     if ((status = (pr->u.p.tc != 0)) == 0) {
1450       *p_lb = 0;
1451       *p_ub = 0;
1452       //            if ( p_last != NULL )
1453       //                *p_last = 0;
1454       if (p_st != NULL)
1455         *p_st = 0;
1456       if (__kmp_env_consistency_check) {
1457         if (pr->pushed_ws != ct_none) {
1458           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1459         }
1460       }
1461     } else if (pr->nomerge) {
1462       kmp_int32 last;
1463       T start;
1464       UT limit, trip, init;
1465       ST incr;
1466       T chunk = pr->u.p.parm1;
1467 
1468       KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1469                      gtid));
1470 
1471       init = chunk * pr->u.p.count++;
1472       trip = pr->u.p.tc - 1;
1473 
1474       if ((status = (init <= trip)) == 0) {
1475         *p_lb = 0;
1476         *p_ub = 0;
1477         //                if ( p_last != NULL )
1478         //                    *p_last = 0;
1479         if (p_st != NULL)
1480           *p_st = 0;
1481         if (__kmp_env_consistency_check) {
1482           if (pr->pushed_ws != ct_none) {
1483             pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1484           }
1485         }
1486       } else {
1487         start = pr->u.p.lb;
1488         limit = chunk + init - 1;
1489         incr = pr->u.p.st;
1490 
1491         if ((last = (limit >= trip)) != 0) {
1492           limit = trip;
1493 #if KMP_OS_WINDOWS
1494           pr->u.p.last_upper = pr->u.p.ub;
1495 #endif /* KMP_OS_WINDOWS */
1496         }
1497         if (p_last != NULL)
1498           *p_last = last;
1499         if (p_st != NULL)
1500           *p_st = incr;
1501         if (incr == 1) {
1502           *p_lb = start + init;
1503           *p_ub = start + limit;
1504         } else {
1505           *p_lb = start + init * incr;
1506           *p_ub = start + limit * incr;
1507         }
1508 
1509         if (pr->ordered) {
1510           pr->u.p.ordered_lower = init;
1511           pr->u.p.ordered_upper = limit;
1512 #ifdef KMP_DEBUG
1513           {
1514             const char *buff;
1515             // create format specifiers before the debug output
1516             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1517                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
1518                                     traits_t<UT>::spec, traits_t<UT>::spec);
1519             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1520                             pr->u.p.ordered_upper));
1521             __kmp_str_free(&buff);
1522           }
1523 #endif
1524         } // if
1525       } // if
1526     } else {
1527       pr->u.p.tc = 0;
1528       *p_lb = pr->u.p.lb;
1529       *p_ub = pr->u.p.ub;
1530 #if KMP_OS_WINDOWS
1531       pr->u.p.last_upper = *p_ub;
1532 #endif /* KMP_OS_WINDOWS */
1533       if (p_last != NULL)
1534         *p_last = TRUE;
1535       if (p_st != NULL)
1536         *p_st = pr->u.p.st;
1537     } // if
1538 #ifdef KMP_DEBUG
1539     {
1540       const char *buff;
1541       // create format specifiers before the debug output
1542       buff = __kmp_str_format(
1543           "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
1544           "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
1545           traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1546       KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
1547       __kmp_str_free(&buff);
1548     }
1549 #endif
1550 #if INCLUDE_SSC_MARKS
1551     SSC_MARK_DISPATCH_NEXT();
1552 #endif
1553     OMPT_LOOP_END;
1554     return status;
1555   } else {
1556     kmp_int32 last = 0;
1557     dispatch_shared_info_template<UT> *sh;
1558     T start;
1559     ST incr;
1560     UT limit, trip, init;
1561 
1562     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1563                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1564 
1565     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1566         th->th.th_dispatch->th_dispatch_pr_current);
1567     KMP_DEBUG_ASSERT(pr);
1568     sh = reinterpret_cast<dispatch_shared_info_template<UT> *>(
1569         th->th.th_dispatch->th_dispatch_sh_current);
1570     KMP_DEBUG_ASSERT(sh);
1571 
1572     if (pr->u.p.tc == 0) {
1573       // zero trip count
1574       status = 0;
1575     } else {
1576       switch (pr->schedule) {
1577 #if (KMP_STATIC_STEAL_ENABLED)
1578       case kmp_sch_static_steal: {
1579         T chunk = pr->u.p.parm1;
1580         int nproc = th->th.th_team_nproc;
1581 
1582         KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n",
1583                        gtid));
1584 
1585         trip = pr->u.p.tc - 1;
1586 
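        // For static_steal, pr->u.p.count is the index of the next chunk this
        // thread will execute and pr->u.p.ub is one past the last chunk it
        // currently owns; thieves shrink a victim's ub. For 8-byte types the
        // pair is guarded by the per-thread steal lock, for 4-byte types it is
        // packed into one 64-bit word and updated with CAS (see below).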
1587         if (traits_t<T>::type_size > 4) {
1588           // use lock for 8-byte and CAS for 4-byte induction
1589           // variable. TODO (optional): check and use 16-byte CAS
1590           kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
1591           KMP_DEBUG_ASSERT(lck != NULL);
1592           if (pr->u.p.count < (UT)pr->u.p.ub) {
1593             __kmp_acquire_lock(lck, gtid);
1594             // try to get own chunk of iterations
1595             init = (pr->u.p.count)++;
1596             status = (init < (UT)pr->u.p.ub);
1597             __kmp_release_lock(lck, gtid);
1598           } else {
1599             status = 0; // no own chunks
1600           }
1601           if (!status) { // try to steal
1602             kmp_info_t **other_threads = team->t.t_threads;
1603             int while_limit = nproc; // nproc attempts to find a victim
1604             int while_index = 0;
            // TODO: the victim-search algorithm should be cleaned up and
            // measured
1607             while ((!status) && (while_limit != ++while_index)) {
1608               T remaining;
1609               T victimIdx = pr->u.p.parm4;
1610               T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1611               dispatch_private_info_template<T> *victim =
1612                   reinterpret_cast<dispatch_private_info_template<T> *>(
1613                       other_threads[victimIdx]
1614                           ->th.th_dispatch->th_dispatch_pr_current);
1615               while ((victim == NULL || victim == pr ||
1616                       (*(volatile T *)&victim->u.p.static_steal_counter !=
1617                        *(volatile T *)&pr->u.p.static_steal_counter)) &&
1618                      oldVictimIdx != victimIdx) {
1619                 victimIdx = (victimIdx + 1) % nproc;
1620                 victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1621                     other_threads[victimIdx]
1622                         ->th.th_dispatch->th_dispatch_pr_current);
1623               };
1624               if (!victim ||
1625                   (*(volatile T *)&victim->u.p.static_steal_counter !=
1626                    *(volatile T *)&pr->u.p.static_steal_counter)) {
1627                 continue; // try once more (nproc attempts in total)
1628                 // no victim is ready yet to participate in stealing
1629                 // because all victims are still in kmp_init_dispatch
1630               }
1631               if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1632                 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1633                 continue; // not enough chunks to steal, goto next victim
1634               }
1635 
1636               lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1637               KMP_ASSERT(lck != NULL);
1638               __kmp_acquire_lock(lck, gtid);
1639               limit = victim->u.p.ub; // keep initial ub
1640               if (victim->u.p.count >= limit ||
1641                   (remaining = limit - victim->u.p.count) < 2) {
1642                 __kmp_release_lock(lck, gtid);
1643                 pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1644                 continue; // not enough chunks to steal
1645               }
              // stealing succeeded; reduce victim's ub by 1/4 of the undone
              // chunks, or by 1
1648               if (remaining > 3) {
1649                 KMP_COUNT_VALUE(FOR_static_steal_stolen, remaining >> 2);
1650                 init = (victim->u.p.ub -=
1651                         (remaining >> 2)); // steal 1/4 of remaining
1652               } else {
1653                 KMP_COUNT_VALUE(FOR_static_steal_stolen, 1);
1654                 init =
1655                     (victim->u.p.ub -= 1); // steal 1 chunk of 2 or 3 remaining
1656               }
1657               __kmp_release_lock(lck, gtid);
1658 
1659               KMP_DEBUG_ASSERT(init + 1 <= limit);
1660               pr->u.p.parm4 = victimIdx; // remember victim to steal from
1661               status = 1;
1662               while_index = 0;
              // now update own count and ub with the stolen range, excluding
              // the init chunk just claimed by this thread
1664               __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1665               pr->u.p.count = init + 1;
1666               pr->u.p.ub = limit;
1667               __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1668             } // while (search for victim)
1669           } // if (try to find victim and steal)
1670         } else {
1671           // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1672           typedef union {
1673             struct {
1674               UT count;
1675               T ub;
1676             } p;
1677             kmp_int64 b;
1678           } union_i4;
1679           // All operations on 'count' or 'ub' must be combined atomically
1680           // together.
1681           {
1682             union_i4 vold, vnew;
1683             vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1684             vnew = vold;
1685             vnew.p.count++;
1686             while (!KMP_COMPARE_AND_STORE_ACQ64(
1687                 (volatile kmp_int64 *)&pr->u.p.count,
1688                 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1689                 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1690               KMP_CPU_PAUSE();
1691               vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1692               vnew = vold;
1693               vnew.p.count++;
1694             }
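            // On success, vold still holds the pre-increment (count, ub) pair,
            // so the old count read below is the chunk index just claimed.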
1695             vnew = vold;
1696             init = vnew.p.count;
1697             status = (init < (UT)vnew.p.ub);
1698           }
1699 
1700           if (!status) {
1701             kmp_info_t **other_threads = team->t.t_threads;
1702             int while_limit = nproc; // nproc attempts to find a victim
1703             int while_index = 0;
1704 
            // TODO: the victim-search algorithm should be cleaned up and
            // measured
1707             while ((!status) && (while_limit != ++while_index)) {
1708               union_i4 vold, vnew;
1709               kmp_int32 remaining;
1710               T victimIdx = pr->u.p.parm4;
1711               T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1712               dispatch_private_info_template<T> *victim =
1713                   reinterpret_cast<dispatch_private_info_template<T> *>(
1714                       other_threads[victimIdx]
1715                           ->th.th_dispatch->th_dispatch_pr_current);
1716               while ((victim == NULL || victim == pr ||
1717                       (*(volatile T *)&victim->u.p.static_steal_counter !=
1718                        *(volatile T *)&pr->u.p.static_steal_counter)) &&
1719                      oldVictimIdx != victimIdx) {
1720                 victimIdx = (victimIdx + 1) % nproc;
1721                 victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1722                     other_threads[victimIdx]
1723                         ->th.th_dispatch->th_dispatch_pr_current);
1724               };
1725               if (!victim ||
1726                   (*(volatile T *)&victim->u.p.static_steal_counter !=
1727                    *(volatile T *)&pr->u.p.static_steal_counter)) {
1728                 continue; // try once more (nproc attempts in total)
1729                 // no victim is ready yet to participate in stealing
1730                 // because all victims are still in kmp_init_dispatch
1731               }
1732               pr->u.p.parm4 = victimIdx; // new victim found
1733               while (1) { // CAS loop if victim has enough chunks to steal
1734                 vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1735                 vnew = vold;
1736 
1737                 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1738                 if (vnew.p.count >= (UT)vnew.p.ub ||
1739                     (remaining = vnew.p.ub - vnew.p.count) < 2) {
1740                   pr->u.p.parm4 =
1741                       (victimIdx + 1) % nproc; // shift start victim id
1742                   break; // not enough chunks to steal, goto next victim
1743                 }
1744                 if (remaining > 3) {
1745                   vnew.p.ub -= (remaining >> 2); // try to steal 1/4 remaining
1746                 } else {
1747                   vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1748                 }
1749                 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1750                 // TODO: Should this be acquire or release?
1751                 if (KMP_COMPARE_AND_STORE_ACQ64(
1752                         (volatile kmp_int64 *)&victim->u.p.count,
1753                         *VOLATILE_CAST(kmp_int64 *) & vold.b,
1754                         *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1755                   // stealing succeeded
1756                   KMP_COUNT_VALUE(FOR_static_steal_stolen,
1757                                   vold.p.ub - vnew.p.ub);
1758                   status = 1;
1759                   while_index = 0;
1760                   // now update own count and ub
1761                   init = vnew.p.ub;
1762                   vold.p.count = init + 1;
1763 #if KMP_ARCH_X86
1764                   KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count),
1765                                    vold.b);
1766 #else
1767                   *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1768 #endif
1769                   break;
1770                 } // if (check CAS result)
1771                 KMP_CPU_PAUSE(); // CAS failed, repeat attempt
1772               } // while (try to steal from particular victim)
1773             } // while (search for victim)
1774           } // if (try to find victim and steal)
1775         } // if (4-byte induction variable)
1776         if (!status) {
1777           *p_lb = 0;
1778           *p_ub = 0;
1779           if (p_st != NULL)
1780             *p_st = 0;
1781         } else {
1782           start = pr->u.p.parm2;
1783           init *= chunk;
1784           limit = chunk + init - 1;
1785           incr = pr->u.p.st;
1786           KMP_COUNT_VALUE(FOR_static_steal_chunks, 1);
1787 
1788           KMP_DEBUG_ASSERT(init <= trip);
1789           if ((last = (limit >= trip)) != 0)
1790             limit = trip;
1791           if (p_st != NULL)
1792             *p_st = incr;
1793 
1794           if (incr == 1) {
1795             *p_lb = start + init;
1796             *p_ub = start + limit;
1797           } else {
1798             *p_lb = start + init * incr;
1799             *p_ub = start + limit * incr;
1800           }
1801 
1802           if (pr->ordered) {
1803             pr->u.p.ordered_lower = init;
1804             pr->u.p.ordered_upper = limit;
1805 #ifdef KMP_DEBUG
1806             {
1807               const char *buff;
1808               // create format specifiers before the debug output
1809               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1810                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
1811                                       traits_t<UT>::spec, traits_t<UT>::spec);
1812               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1813                               pr->u.p.ordered_upper));
1814               __kmp_str_free(&buff);
1815             }
1816 #endif
1817           } // if
1818         } // if
1819         break;
1820       } // case
1821 #endif // ( KMP_STATIC_STEAL_ENABLED )
1822       case kmp_sch_static_balanced: {
1823         KD_TRACE(
1824             100,
1825             ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid));
1826         if ((status = !pr->u.p.count) !=
1827             0) { /* check if thread has any iteration to do */
1828           pr->u.p.count = 1;
1829           *p_lb = pr->u.p.lb;
1830           *p_ub = pr->u.p.ub;
1831           last = pr->u.p.parm1;
1832           if (p_st != NULL)
1833             *p_st = pr->u.p.st;
1834         } else { /* no iterations to do */
1835           pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1836         }
1837         if (pr->ordered) {
1838 #ifdef KMP_DEBUG
1839           {
1840             const char *buff;
1841             // create format specifiers before the debug output
1842             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1843                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
1844                                     traits_t<UT>::spec, traits_t<UT>::spec);
1845             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1846                             pr->u.p.ordered_upper));
1847             __kmp_str_free(&buff);
1848           }
1849 #endif
1850         } // if
1851       } // case
1852       break;
1853       case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1854                                      merged here */
1855       case kmp_sch_static_chunked: {
1856         T parm1;
1857 
1858         KD_TRACE(100, ("__kmp_dispatch_next: T#%d "
1859                        "kmp_sch_static_[affinity|chunked] case\n",
1860                        gtid));
1861         parm1 = pr->u.p.parm1;
1862 
1863         trip = pr->u.p.tc - 1;
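        // Chunks are assigned round-robin: this thread's next chunk has global
        // index (count + tid), and count advances by nproc per call, so thread
        // tid executes chunks tid, tid + nproc, tid + 2*nproc, ...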
1864         init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1865 
1866         if ((status = (init <= trip)) != 0) {
1867           start = pr->u.p.lb;
1868           incr = pr->u.p.st;
1869           limit = parm1 + init - 1;
1870 
1871           if ((last = (limit >= trip)) != 0)
1872             limit = trip;
1873 
1874           if (p_st != NULL)
1875             *p_st = incr;
1876 
1877           pr->u.p.count += th->th.th_team_nproc;
1878 
1879           if (incr == 1) {
1880             *p_lb = start + init;
1881             *p_ub = start + limit;
1882           } else {
1883             *p_lb = start + init * incr;
1884             *p_ub = start + limit * incr;
1885           }
1886 
1887           if (pr->ordered) {
1888             pr->u.p.ordered_lower = init;
1889             pr->u.p.ordered_upper = limit;
1890 #ifdef KMP_DEBUG
1891             {
1892               const char *buff;
1893               // create format specifiers before the debug output
1894               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1895                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
1896                                       traits_t<UT>::spec, traits_t<UT>::spec);
1897               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1898                               pr->u.p.ordered_upper));
1899               __kmp_str_free(&buff);
1900             }
1901 #endif
1902           } // if
1903         } // if
1904       } // case
1905       break;
1906 
1907       case kmp_sch_dynamic_chunked: {
1908         T chunk = pr->u.p.parm1;
1909 
1910         KD_TRACE(
1911             100,
1912             ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid));
1913 
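        // Atomically claim the next chunk number from the shared iteration
        // counter; the claimed chunk covers iterations [init, init + chunk - 1]
        // of the normalized [0, tc - 1] iteration space.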
1914         init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1915         trip = pr->u.p.tc - 1;
1916 
1917         if ((status = (init <= trip)) == 0) {
1918           *p_lb = 0;
1919           *p_ub = 0;
1920           if (p_st != NULL)
1921             *p_st = 0;
1922         } else {
1923           start = pr->u.p.lb;
1924           limit = chunk + init - 1;
1925           incr = pr->u.p.st;
1926 
1927           if ((last = (limit >= trip)) != 0)
1928             limit = trip;
1929 
1930           if (p_st != NULL)
1931             *p_st = incr;
1932 
1933           if (incr == 1) {
1934             *p_lb = start + init;
1935             *p_ub = start + limit;
1936           } else {
1937             *p_lb = start + init * incr;
1938             *p_ub = start + limit * incr;
1939           }
1940 
1941           if (pr->ordered) {
1942             pr->u.p.ordered_lower = init;
1943             pr->u.p.ordered_upper = limit;
1944 #ifdef KMP_DEBUG
1945             {
1946               const char *buff;
1947               // create format specifiers before the debug output
1948               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1949                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
1950                                       traits_t<UT>::spec, traits_t<UT>::spec);
1951               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1952                               pr->u.p.ordered_upper));
1953               __kmp_str_free(&buff);
1954             }
1955 #endif
1956           } // if
1957         } // if
1958       } // case
1959       break;
1960 
1961       case kmp_sch_guided_iterative_chunked: {
1962         T chunkspec = pr->u.p.parm1;
1963         KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked "
1964                        "iterative case\n",
1965                        gtid));
1966         trip = pr->u.p.tc;
1967         // Start atomic part of calculations
1968         while (1) {
1969           ST remaining; // signed, because can be < 0
1970           init = sh->u.s.iteration; // shared value
1971           remaining = trip - init;
1972           if (remaining <= 0) { // AC: need to compare with 0 first
1973             // nothing to do, don't try atomic op
1974             status = 0;
1975             break;
1976           }
1977           if ((T)remaining <
1978               pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
            // use dynamic-style schedule
            // atomically increment iterations, get old value
1981             init = test_then_add<ST>((ST *)&sh->u.s.iteration, (ST)chunkspec);
1982             remaining = trip - init;
1983             if (remaining <= 0) {
1984               status = 0; // all iterations got by other threads
1985             } else { // got some iterations to work on
1986               status = 1;
1987               if ((T)remaining > chunkspec) {
1988                 limit = init + chunkspec - 1;
1989               } else {
1990                 last = 1; // the last chunk
1991                 limit = init + remaining - 1;
1992               } // if
1993             } // if
1994             break;
1995           } // if
1996           limit = init + (UT)(remaining *
1997                               *(double *)&pr->u.p.parm3); // divide by K*nproc
1998           if (compare_and_swap<ST>((ST *)&sh->u.s.iteration, (ST)init,
1999                                    (ST)limit)) {
2000             // CAS was successful, chunk obtained
2001             status = 1;
2002             --limit;
2003             break;
2004           } // if
2005         } // while
2006         if (status != 0) {
2007           start = pr->u.p.lb;
2008           incr = pr->u.p.st;
2009           if (p_st != NULL)
2010             *p_st = incr;
2011           *p_lb = start + init * incr;
2012           *p_ub = start + limit * incr;
2013           if (pr->ordered) {
2014             pr->u.p.ordered_lower = init;
2015             pr->u.p.ordered_upper = limit;
2016 #ifdef KMP_DEBUG
2017             {
2018               const char *buff;
2019               // create format specifiers before the debug output
2020               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2021                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
2022                                       traits_t<UT>::spec, traits_t<UT>::spec);
2023               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2024                               pr->u.p.ordered_upper));
2025               __kmp_str_free(&buff);
2026             }
2027 #endif
2028           } // if
2029         } else {
2030           *p_lb = 0;
2031           *p_ub = 0;
2032           if (p_st != NULL)
2033             *p_st = 0;
2034         } // if
2035       } // case
2036       break;
2037 
2038       case kmp_sch_guided_simd: {
        // same as the iterative variant, but the current chunk is adjusted to
        // be a multiple of the given chunk size
2041         T chunk = pr->u.p.parm1;
2042         KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_simd case\n",
2043                        gtid));
2044         trip = pr->u.p.tc;
2045         // Start atomic part of calculations
2046         while (1) {
2047           ST remaining; // signed, because can be < 0
2048           init = sh->u.s.iteration; // shared value
2049           remaining = trip - init;
2050           if (remaining <= 0) { // AC: need to compare with 0 first
2051             status = 0; // nothing to do, don't try atomic op
2052             break;
2053           }
2054           KMP_DEBUG_ASSERT(init % chunk == 0);
2055           // compare with K*nproc*(chunk+1), K=2 by default
2056           if ((T)remaining < pr->u.p.parm2) {
            // use dynamic-style schedule
            // atomically increment iterations, get old value
2059             init = test_then_add<ST>((ST *)&sh->u.s.iteration, (ST)chunk);
2060             remaining = trip - init;
2061             if (remaining <= 0) {
2062               status = 0; // all iterations got by other threads
2063             } else {
2064               // got some iterations to work on
2065               status = 1;
2066               if ((T)remaining > chunk) {
2067                 limit = init + chunk - 1;
2068               } else {
2069                 last = 1; // the last chunk
2070                 limit = init + remaining - 1;
2071               } // if
2072             } // if
2073             break;
2074           } // if
2075           // divide by K*nproc
2076           UT span = remaining * (*(double *)&pr->u.p.parm3);
2077           UT rem = span % chunk;
2078           if (rem) // adjust so that span%chunk == 0
2079             span += chunk - rem;
2080           limit = init + span;
2081           if (compare_and_swap<ST>((ST *)&sh->u.s.iteration, (ST)init,
2082                                    (ST)limit)) {
2083             // CAS was successful, chunk obtained
2084             status = 1;
2085             --limit;
2086             break;
2087           } // if
2088         } // while
2089         if (status != 0) {
2090           start = pr->u.p.lb;
2091           incr = pr->u.p.st;
2092           if (p_st != NULL)
2093             *p_st = incr;
2094           *p_lb = start + init * incr;
2095           *p_ub = start + limit * incr;
2096           if (pr->ordered) {
2097             pr->u.p.ordered_lower = init;
2098             pr->u.p.ordered_upper = limit;
2099 #ifdef KMP_DEBUG
2100             {
2101               const char *buff;
2102               // create format specifiers before the debug output
2103               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2104                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
2105                                       traits_t<UT>::spec, traits_t<UT>::spec);
2106               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2107                               pr->u.p.ordered_upper));
2108               __kmp_str_free(&buff);
2109             }
2110 #endif
2111           } // if
2112         } else {
2113           *p_lb = 0;
2114           *p_ub = 0;
2115           if (p_st != NULL)
2116             *p_st = 0;
2117         } // if
2118       } // case
2119       break;
2120 
2121       case kmp_sch_guided_analytical_chunked: {
2122         T chunkspec = pr->u.p.parm1;
2123         UT chunkIdx;
2124 #if KMP_OS_WINDOWS && KMP_ARCH_X86
2125         /* for storing original FPCW value for Windows* OS on
2126            IA-32 architecture 8-byte version */
2127         unsigned int oldFpcw;
2128         unsigned int fpcwSet = 0;
2129 #endif
2130         KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked "
2131                        "analytical case\n",
2132                        gtid));
2133 
2134         trip = pr->u.p.tc;
2135 
2136         KMP_DEBUG_ASSERT(th->th.th_team_nproc > 1);
2137         KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)th->th.th_team_nproc <
2138                          trip);
2139 
2140         while (1) { /* this while loop is a safeguard against unexpected zero
2141                        chunk sizes */
2142           chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
2143           if (chunkIdx >= (UT)pr->u.p.parm2) {
2144             --trip;
2145             /* use dynamic-style scheduling */
2146             init = chunkIdx * chunkspec + pr->u.p.count;
2147             /* need to verify init > 0 in case of overflow in the above
2148              * calculation */
2149             if ((status = (init > 0 && init <= trip)) != 0) {
2150               limit = init + chunkspec - 1;
2151 
2152               if ((last = (limit >= trip)) != 0)
2153                 limit = trip;
2154             }
2155             break;
2156           } else {
2157 /* use exponential-style scheduling */
/* The following check works around the lack of long double precision on
   Windows* OS, which can result in init != 0 for chunkIdx == 0. */
2162 #if KMP_OS_WINDOWS && KMP_ARCH_X86
2163             /* If we haven't already done so, save original FPCW and set
2164                precision to 64-bit, as Windows* OS on IA-32 architecture
2165                defaults to 53-bit */
2166             if (!fpcwSet) {
2167               oldFpcw = _control87(0, 0);
2168               _control87(_PC_64, _MCW_PC);
2169               fpcwSet = 0x30000;
2170             }
2171 #endif
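            // __kmp_dispatch_guided_remaining(trip, base, k) returns the number
            // of iterations still unassigned after k guided chunks, so chunk
            // chunkIdx spans [trip - remaining(chunkIdx),
            // trip - remaining(chunkIdx + 1)); limit is made inclusive below.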
2172             if (chunkIdx) {
2173               init = __kmp_dispatch_guided_remaining<T>(
2174                   trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
2175               KMP_DEBUG_ASSERT(init);
2176               init = trip - init;
2177             } else
2178               init = 0;
2179             limit = trip - __kmp_dispatch_guided_remaining<T>(
2180                                trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
2181             KMP_ASSERT(init <= limit);
2182             if (init < limit) {
2183               KMP_DEBUG_ASSERT(limit <= trip);
2184               --limit;
2185               status = 1;
2186               break;
2187             } // if
2188           } // if
2189         } // while (1)
2190 #if KMP_OS_WINDOWS && KMP_ARCH_X86
2191         /* restore FPCW if necessary
2192            AC: check fpcwSet flag first because oldFpcw can be uninitialized
2193            here */
2194         if (fpcwSet && (oldFpcw & fpcwSet))
2195           _control87(oldFpcw, _MCW_PC);
2196 #endif
2197         if (status != 0) {
2198           start = pr->u.p.lb;
2199           incr = pr->u.p.st;
2200           if (p_st != NULL)
2201             *p_st = incr;
2202           *p_lb = start + init * incr;
2203           *p_ub = start + limit * incr;
2204           if (pr->ordered) {
2205             pr->u.p.ordered_lower = init;
2206             pr->u.p.ordered_upper = limit;
2207 #ifdef KMP_DEBUG
2208             {
2209               const char *buff;
2210               // create format specifiers before the debug output
2211               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2212                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
2213                                       traits_t<UT>::spec, traits_t<UT>::spec);
2214               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2215                               pr->u.p.ordered_upper));
2216               __kmp_str_free(&buff);
2217             }
2218 #endif
2219           }
2220         } else {
2221           *p_lb = 0;
2222           *p_ub = 0;
2223           if (p_st != NULL)
2224             *p_st = 0;
2225         }
2226       } // case
2227       break;
2228 
2229       case kmp_sch_trapezoidal: {
2230         UT index;
2231         T parm2 = pr->u.p.parm2;
2232         T parm3 = pr->u.p.parm3;
2233         T parm4 = pr->u.p.parm4;
2234         KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2235                        gtid));
2236 
2237         index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
2238 
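        // Chunk sizes form a decreasing arithmetic series
        // (parm2, parm2 - parm4, parm2 - 2*parm4, ...), so the first iteration
        // of chunk #index is the partial sum
        //   index*parm2 - parm4*index*(index-1)/2,
        // folded into the single expression below.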
2239         init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
2240         trip = pr->u.p.tc - 1;
2241 
2242         if ((status = ((T)index < parm3 && init <= trip)) == 0) {
2243           *p_lb = 0;
2244           *p_ub = 0;
2245           if (p_st != NULL)
2246             *p_st = 0;
2247         } else {
2248           start = pr->u.p.lb;
2249           limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
2250           incr = pr->u.p.st;
2251 
2252           if ((last = (limit >= trip)) != 0)
2253             limit = trip;
2254 
2255           if (p_st != NULL)
2256             *p_st = incr;
2257 
2258           if (incr == 1) {
2259             *p_lb = start + init;
2260             *p_ub = start + limit;
2261           } else {
2262             *p_lb = start + init * incr;
2263             *p_ub = start + limit * incr;
2264           }
2265 
2266           if (pr->ordered) {
2267             pr->u.p.ordered_lower = init;
2268             pr->u.p.ordered_upper = limit;
2269 #ifdef KMP_DEBUG
2270             {
2271               const char *buff;
2272               // create format specifiers before the debug output
2273               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2274                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
2275                                       traits_t<UT>::spec, traits_t<UT>::spec);
2276               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2277                               pr->u.p.ordered_upper));
2278               __kmp_str_free(&buff);
2279             }
2280 #endif
2281           } // if
2282         } // if
2283       } // case
2284       break;
2285       default: {
2286         status = 0; // to avoid complaints on uninitialized variable use
2287         __kmp_msg(kmp_ms_fatal, // Severity
2288                   KMP_MSG(UnknownSchedTypeDetected), // Primary message
2289                   KMP_HNT(GetNewerLibrary), // Hint
2290                   __kmp_msg_null // Variadic argument list terminator
2291                   );
2292       } break;
2293       } // switch
2294     } // if tc == 0;
2295 
2296     if (status == 0) {
2297       UT num_done;
2298 
2299       num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2300 #ifdef KMP_DEBUG
2301       {
2302         const char *buff;
2303         // create format specifiers before the debug output
2304         buff = __kmp_str_format(
2305             "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2306             traits_t<UT>::spec);
2307         KD_TRACE(100, (buff, gtid, sh->u.s.num_done));
2308         __kmp_str_free(&buff);
2309       }
2310 #endif
2311 
2312       if ((ST)num_done == th->th.th_team_nproc - 1) {
2313 #if (KMP_STATIC_STEAL_ENABLED)
2314         if (pr->schedule == kmp_sch_static_steal &&
2315             traits_t<T>::type_size > 4) {
2316           int i;
2317           kmp_info_t **other_threads = team->t.t_threads;
2318           // loop complete, safe to destroy locks used for stealing
2319           for (i = 0; i < th->th.th_team_nproc; ++i) {
2320             kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2321             KMP_ASSERT(lck != NULL);
2322             __kmp_destroy_lock(lck);
2323             __kmp_free(lck);
2324             other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2325           }
2326         }
2327 #endif
2328         /* NOTE: release this buffer to be reused */
2329 
2330         KMP_MB(); /* Flush all pending memory write invalidates.  */
2331 
2332         sh->u.s.num_done = 0;
2333         sh->u.s.iteration = 0;
2334 
2335         /* TODO replace with general release procedure? */
2336         if (pr->ordered) {
2337           sh->u.s.ordered_iteration = 0;
2338         }
2339 
2340         KMP_MB(); /* Flush all pending memory write invalidates.  */
2341 
2342         sh->buffer_index += __kmp_dispatch_num_buffers;
2343         KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2344                        gtid, sh->buffer_index));
2345 
2346         KMP_MB(); /* Flush all pending memory write invalidates.  */
2347 
2348       } // if
2349       if (__kmp_env_consistency_check) {
2350         if (pr->pushed_ws != ct_none) {
2351           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2352         }
2353       }
2354 
2355       th->th.th_dispatch->th_deo_fcn = NULL;
2356       th->th.th_dispatch->th_dxo_fcn = NULL;
2357       th->th.th_dispatch->th_dispatch_sh_current = NULL;
2358       th->th.th_dispatch->th_dispatch_pr_current = NULL;
2359     } // if (status == 0)
2360 #if KMP_OS_WINDOWS
2361     else if (last) {
2362       pr->u.p.last_upper = pr->u.p.ub;
2363     }
2364 #endif /* KMP_OS_WINDOWS */
2365     if (p_last != NULL && status != 0)
2366       *p_last = last;
2367   } // if
2368 
2369 #ifdef KMP_DEBUG
2370   {
2371     const char *buff;
2372     // create format specifiers before the debug output
2373     buff = __kmp_str_format(
2374         "__kmp_dispatch_next: T#%%d normal case: "
2375         "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p  returning:%%d\n",
2376         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2377     KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status));
2378     __kmp_str_free(&buff);
2379   }
2380 #endif
2381 #if INCLUDE_SSC_MARKS
2382   SSC_MARK_DISPATCH_NEXT();
2383 #endif
2384   OMPT_LOOP_END;
2385   return status;
2386 }
2387 
2388 template <typename T>
2389 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2390                                   kmp_int32 *plastiter, T *plower, T *pupper,
2391                                   typename traits_t<T>::signed_t incr) {
2392   typedef typename traits_t<T>::unsigned_t UT;
2393   typedef typename traits_t<T>::signed_t ST;
2394   register kmp_uint32 team_id;
2395   register kmp_uint32 nteams;
2396   register UT trip_count;
2397   register kmp_team_t *team;
2398   kmp_info_t *th;
2399 
2400   KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2401   KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2402 #ifdef KMP_DEBUG
2403   {
2404     const char *buff;
2405     // create format specifiers before the debug output
2406     buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2407                             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2408                             traits_t<T>::spec, traits_t<T>::spec,
2409                             traits_t<ST>::spec, traits_t<T>::spec);
2410     KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2411     __kmp_str_free(&buff);
2412   }
2413 #endif
2414 
2415   if (__kmp_env_consistency_check) {
2416     if (incr == 0) {
2417       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2418                             loc);
2419     }
2420     if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2421       // The loop is illegal.
      // Some zero-trip loops are kept by the compiler, e.g.:
2423       //   for(i=10;i<0;++i) // lower >= upper - run-time check
2424       //   for(i=0;i>10;--i) // lower <= upper - run-time check
2425       //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2426       //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2427       // Compiler does not check the following illegal loops:
2428       //   for(i=0;i<10;i+=incr) // where incr<0
2429       //   for(i=10;i>0;i-=incr) // where incr<0
2430       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2431     }
2432   }
2433   th = __kmp_threads[gtid];
2434   team = th->th.th_team;
2435 #if OMP_40_ENABLED
2436   KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2437   nteams = th->th.th_teams_size.nteams;
2438 #endif
2439   team_id = team->t.t_master_tid;
2440   KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2441 
2442   // compute global trip count
2443   if (incr == 1) {
2444     trip_count = *pupper - *plower + 1;
2445   } else if (incr == -1) {
2446     trip_count = *plower - *pupper + 1;
2447   } else if (incr > 0) {
2448     // upper-lower can exceed the limit of signed type
2449     trip_count = (UT)(*pupper - *plower) / incr + 1;
2450   } else {
2451     trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2452   }
2453 
2454   if (trip_count <= nteams) {
2455     KMP_DEBUG_ASSERT(
2456         __kmp_static == kmp_sch_static_greedy ||
2457         __kmp_static ==
2458             kmp_sch_static_balanced); // Unknown static scheduling type.
    // only some teams get a single iteration, the others get nothing
2460     if (team_id < trip_count) {
2461       *pupper = *plower = *plower + team_id * incr;
2462     } else {
2463       *plower = *pupper + incr; // zero-trip loop
2464     }
2465     if (plastiter != NULL)
2466       *plastiter = (team_id == trip_count - 1);
2467   } else {
2468     if (__kmp_static == kmp_sch_static_balanced) {
2469       register UT chunk = trip_count / nteams;
2470       register UT extras = trip_count % nteams;
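      // Teams with id < extras get (chunk + 1) iterations, the rest get chunk.
      // E.g., with trip_count = 10 and nteams = 4: chunk = 2, extras = 2, and
      // the teams receive 3, 3, 2 and 2 iterations respectively.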
2471       *plower +=
2472           incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2473       *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2474       if (plastiter != NULL)
2475         *plastiter = (team_id == nteams - 1);
2476     } else {
2477       register T chunk_inc_count =
2478           (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2479       register T upper = *pupper;
2480       KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2481       // Unknown static scheduling type.
2482       *plower += team_id * chunk_inc_count;
2483       *pupper = *plower + chunk_inc_count - incr;
2484       // Check/correct bounds if needed
2485       if (incr > 0) {
2486         if (*pupper < *plower)
2487           *pupper = traits_t<T>::max_value;
2488         if (plastiter != NULL)
2489           *plastiter = *plower <= upper && *pupper > upper - incr;
2490         if (*pupper > upper)
2491           *pupper = upper; // tracker C73258
2492       } else {
2493         if (*pupper > *plower)
2494           *pupper = traits_t<T>::min_value;
2495         if (plastiter != NULL)
2496           *plastiter = *plower >= upper && *pupper < upper - incr;
2497         if (*pupper < upper)
2498           *pupper = upper; // tracker C73258
2499       }
2500     }
2501   }
2502 }
2503 
2504 //-----------------------------------------------------------------------------
2505 // Dispatch routines
2506 //    Transfer call to template< type T >
2507 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2508 //                         T lb, T ub, ST st, ST chunk )
2509 extern "C" {
2510 
2511 /*!
2512 @ingroup WORK_SHARING
2513 @{
2514 @param loc Source location
2515 @param gtid Global thread id
2516 @param schedule Schedule type
2517 @param lb  Lower bound
2518 @param ub  Upper bound
2519 @param st  Step (or increment if you prefer)
2520 @param chunk The chunk size to block with
2521 
2522 This function prepares the runtime to start a dynamically scheduled for loop,
2523 saving the loop arguments.
2524 These functions are all identical apart from the types of the arguments.
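
The sketch below shows the call sequence a compiler typically generates for a
dynamically scheduled loop. It is illustrative only: variable names are made
up, and ordered constructs additionally involve the __kmpc_dispatch_fini_*
entry points.
@code
    kmp_int32 lb, ub, st, last;
    __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, 0, n - 1, 1,
                           chunk);
    while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
      // [lb, ub] is an inclusive range of iterations owned by this thread
      for (kmp_int32 i = lb; i <= ub; i += st)
        body(i);
    }
@endcode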
2525 */
2526 
2527 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2528                             enum sched_type schedule, kmp_int32 lb,
2529                             kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2530   KMP_DEBUG_ASSERT(__kmp_init_serial);
2531   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2532 }
2533 /*!
2534 See @ref __kmpc_dispatch_init_4
2535 */
2536 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2537                              enum sched_type schedule, kmp_uint32 lb,
2538                              kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2539   KMP_DEBUG_ASSERT(__kmp_init_serial);
2540   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2541 }
2542 
2543 /*!
2544 See @ref __kmpc_dispatch_init_4
2545 */
2546 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2547                             enum sched_type schedule, kmp_int64 lb,
2548                             kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2549   KMP_DEBUG_ASSERT(__kmp_init_serial);
2550   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2551 }
2552 
2553 /*!
2554 See @ref __kmpc_dispatch_init_4
2555 */
2556 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2557                              enum sched_type schedule, kmp_uint64 lb,
2558                              kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2559   KMP_DEBUG_ASSERT(__kmp_init_serial);
2560   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2561 }
2562 
2563 /*!
2564 See @ref __kmpc_dispatch_init_4
2565 
These functions differ from the __kmpc_dispatch_init set in that they are
called for the composite 'distribute parallel for' construct. Thus, before
dispatching the regular iterations, the per-team iteration space must be
computed.
2569 
2570 These functions are all identical apart from the types of the arguments.
2571 */
2572 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2573                                  enum sched_type schedule, kmp_int32 *p_last,
2574                                  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2575                                  kmp_int32 chunk) {
2576   KMP_DEBUG_ASSERT(__kmp_init_serial);
2577   __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2578   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2579 }
2580 
2581 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2582                                   enum sched_type schedule, kmp_int32 *p_last,
2583                                   kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2584                                   kmp_int32 chunk) {
2585   KMP_DEBUG_ASSERT(__kmp_init_serial);
2586   __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2587   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2588 }
2589 
2590 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2591                                  enum sched_type schedule, kmp_int32 *p_last,
2592                                  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2593                                  kmp_int64 chunk) {
2594   KMP_DEBUG_ASSERT(__kmp_init_serial);
2595   __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2596   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2597 }
2598 
2599 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2600                                   enum sched_type schedule, kmp_int32 *p_last,
2601                                   kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2602                                   kmp_int64 chunk) {
2603   KMP_DEBUG_ASSERT(__kmp_init_serial);
2604   __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2605   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2606 }
2607 
2608 /*!
2609 @param loc Source code location
2610 @param gtid Global thread id
2611 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2612 otherwise
2613 @param p_lb   Pointer to the lower bound for the next chunk of work
2614 @param p_ub   Pointer to the upper bound for the next chunk of work
2615 @param p_st   Pointer to the stride for the next chunk of work
2616 @return one if there is work to be done, zero otherwise
2617 
2618 Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then lb, ub and stride need not be modified.
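See @ref __kmpc_dispatch_init_4 for a sketch of the typical calling sequence.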
2620 */
2621 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2622                            kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2623   return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st);
2624 }
2625 
2626 /*!
2627 See @ref __kmpc_dispatch_next_4
2628 */
2629 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2630                             kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2631                             kmp_int32 *p_st) {
2632   return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st);
2633 }
2634 
2635 /*!
2636 See @ref __kmpc_dispatch_next_4
2637 */
2638 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2639                            kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2640   return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st);
2641 }
2642 
2643 /*!
2644 See @ref __kmpc_dispatch_next_4
2645 */
2646 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2647                             kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2648                             kmp_int64 *p_st) {
2649   return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st);
2650 }
2651 
2652 /*!
2653 @param loc Source code location
2654 @param gtid Global thread id
2655 
2656 Mark the end of a dynamic loop.
2657 */
2658 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2659   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2660 }
2661 
2662 /*!
2663 See @ref __kmpc_dispatch_fini_4
2664 */
2665 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2666   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2667 }
2668 
2669 /*!
2670 See @ref __kmpc_dispatch_fini_4
2671 */
2672 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2673   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2674 }
2675 
2676 /*!
2677 See @ref __kmpc_dispatch_fini_4
2678 */
2679 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2680   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2681 }
2682 /*! @} */
2683 
2684 //-----------------------------------------------------------------------------
2685 // Non-template routines from kmp_dispatch.cpp used in other sources
2686 
2687 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2688   return value == checker;
2689 }
2690 
2691 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2692   return value != checker;
2693 }
2694 
2695 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2696   return value < checker;
2697 }
2698 
2699 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2700   return value >= checker;
2701 }
2702 
2703 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2704   return value <= checker;
2705 }
2706 
2707 kmp_uint32
2708 __kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2709                    kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2710                    void *obj // Higher-level synchronization object, or NULL.
2711                    ) {
2712   // note: we may not belong to a team at this point
2713   register volatile kmp_uint32 *spin = spinner;
2714   register kmp_uint32 check = checker;
2715   register kmp_uint32 spins;
2716   register kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2717   register kmp_uint32 r;
2718 
2719   KMP_FSYNC_SPIN_INIT(obj, (void *)spin);
2720   KMP_INIT_YIELD(spins);
2721   // main wait spin loop
2722   while (!f(r = TCR_4(*spin), check)) {
2723     KMP_FSYNC_SPIN_PREPARE(obj);
2724     /* GEH - remove this since it was accidentally introduced when kmp_wait was
2725        split. It causes problems with infinite recursion because of exit lock */
2726     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2727         __kmp_abort_thread(); */
2728 
2729     /* if we have waited a bit, or are oversubscribed, yield */
2730     /* pause is in the following code */
2731     KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2732     KMP_YIELD_SPIN(spins);
2733   }
2734   KMP_FSYNC_SPIN_ACQUIRED(obj);
2735   return r;
2736 }
2737 
2738 void __kmp_wait_yield_4_ptr(
2739     void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32),
2740     void *obj // Higher-level synchronization object, or NULL.
2741     ) {
2742   // note: we may not belong to a team at this point
2743   register void *spin = spinner;
2744   register kmp_uint32 check = checker;
2745   register kmp_uint32 spins;
2746   register kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2747 
2748   KMP_FSYNC_SPIN_INIT(obj, spin);
2749   KMP_INIT_YIELD(spins);
2750   // main wait spin loop
2751   while (!f(spin, check)) {
2752     KMP_FSYNC_SPIN_PREPARE(obj);
2753     /* if we have waited a bit, or are oversubscribed, yield */
2754     /* pause is in the following code */
2755     KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2756     KMP_YIELD_SPIN(spins);
2757   }
2758   KMP_FSYNC_SPIN_ACQUIRED(obj);
2759 }
2760 
2761 } // extern "C"
2762 
2763 #ifdef KMP_GOMP_COMPAT
2764 
2765 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2766                                enum sched_type schedule, kmp_int32 lb,
2767                                kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2768                                int push_ws) {
2769   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2770                                  push_ws);
2771 }
2772 
2773 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2774                                 enum sched_type schedule, kmp_uint32 lb,
2775                                 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2776                                 int push_ws) {
2777   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2778                                   push_ws);
2779 }
2780 
2781 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2782                                enum sched_type schedule, kmp_int64 lb,
2783                                kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2784                                int push_ws) {
2785   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2786                                  push_ws);
2787 }
2788 
2789 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2790                                 enum sched_type schedule, kmp_uint64 lb,
2791                                 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2792                                 int push_ws) {
2793   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2794                                   push_ws);
2795 }
2796 
2797 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2798   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2799 }
2800 
2801 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2802   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2803 }
2804 
2805 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2806   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2807 }
2808 
2809 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2810   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2811 }
2812 
2813 #endif /* KMP_GOMP_COMPAT */
2814 
2815 /* ------------------------------------------------------------------------ */
2816