1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 /* Dynamic scheduling initialization and dispatch.
17  *
18  * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
19  *       it may change values between parallel regions.  __kmp_max_nth
20  *       is the largest value __kmp_nth may take, 1 is the smallest.
21  */
22 
23 // Need to raise Win version from XP to Vista here for support of
24 // InterlockedExchange64
25 #if defined(_WIN32_WINNT) && defined(_M_IX86)
26 #undef _WIN32_WINNT
27 #define _WIN32_WINNT 0x0502
28 #endif
29 
30 #include "kmp.h"
31 #include "kmp_error.h"
32 #include "kmp_i18n.h"
33 #include "kmp_itt.h"
34 #include "kmp_stats.h"
35 #include "kmp_str.h"
36 #if KMP_OS_WINDOWS && KMP_ARCH_X86
37 #include <float.h>
38 #endif
39 
40 #if OMPT_SUPPORT
41 #include "ompt-internal.h"
42 #include "ompt-specific.h"
43 #endif
44 
45 /* ------------------------------------------------------------------------ */
46 
47 #if KMP_STATIC_STEAL_ENABLED
48 
// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
// (static-steal variant; field order differs from the non-steal variant below)
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT; // unsigned flavor of T
  typedef typename traits_t<T>::signed_t ST; // signed flavor of T
  UT count; // unsigned; chunk/iteration counter (exact meaning varies per schedule)
  T ub; // upper bound
  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
  T lb; // lower bound
  ST st; // signed; stride
  UT tc; // unsigned; trip count
  T static_steal_counter; // for static_steal only; maybe better to put after ub

  /* parm[1-4] are used in different ways by different scheduling algorithms */

  // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
  //    a) parm3 is properly aligned and
  //    b) all parm1-4 are in the same cache line.
  // Because parm1-4 are used together, performance seems to be better
  // if they are in the same line (not measured though).

  struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
    T parm1;
    T parm2;
    T parm3;
    T parm4;
  };

  UT ordered_lower; // unsigned; lower bound of current ordered window
  UT ordered_upper; // unsigned; upper bound of current ordered window
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};
83 
84 #else /* KMP_STATIC_STEAL_ENABLED */
85 
// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
// (variant used when static stealing is compiled out)
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT; // unsigned flavor of T
  typedef typename traits_t<T>::signed_t ST; // signed flavor of T
  T lb; // lower bound
  T ub; // upper bound
  ST st; // signed; stride
  UT tc; // unsigned; trip count

  /* parm[1-4] are used in different ways by different scheduling algorithms */
  T parm1;
  T parm2;
  T parm3;
  T parm4;

  UT count; // unsigned; chunk/iteration counter (exact meaning varies per schedule)

  UT ordered_lower; // unsigned; lower bound of current ordered window
  UT ordered_upper; // unsigned; upper bound of current ordered window
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};
109 
110 #endif /* KMP_STATIC_STEAL_ENABLED */
111 
// replaces dispatch_private_info structure and dispatch_private_info_t type
template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
  // duplicate alignment here, otherwise size of structure is not correct in our
  // compiler
  union KMP_ALIGN_CACHE private_info_tmpl {
    dispatch_private_infoXX_template<T> p; // typed view of the per-loop fields
    dispatch_private_info64_t p64; // 64-bit view; keeps the union size fixed
  } u;
  enum sched_type schedule; /* scheduling algorithm */
  kmp_uint32 ordered; /* ordered clause specified */
  kmp_uint32 ordered_bumped; /* bumped on each ordered-section exit (__kmp_dispatch_dxo) */
  // To retain the structure size after making ordered_iteration scalar
  kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
  dispatch_private_info *next; /* stack of buffers for nest of serial regions */
  kmp_uint32 nomerge; /* don't merge iters if serialized */
  kmp_uint32 type_size; /* size of the loop induction type T (traits_t<T>::type_size) */
  enum cons_type pushed_ws; /* workshare construct pushed for consistency checks */
};
130 
// replaces dispatch_shared_info{32,64} structures and
// dispatch_shared_info{32,64}_t types
template <typename UT> struct dispatch_shared_infoXX_template {
  /* chunk index under dynamic, number of idle threads under static-steal;
     iteration index otherwise */
  volatile UT iteration;
  volatile UT num_done; // presumably a count of threads done with the loop — verify against callers
  volatile UT ordered_iteration; // released ordered iteration; waited on in __kmp_dispatch_deo
  // to retain the structure size making ordered_iteration scalar
  UT ordered_dummy[KMP_MAX_ORDERED - 3];
};
142 
// replaces dispatch_shared_info structure and dispatch_shared_info_t type
template <typename UT> struct dispatch_shared_info_template {
  // we need union here to keep the structure size
  union shared_info_tmpl {
    dispatch_shared_infoXX_template<UT> s; // typed view of the shared fields
    dispatch_shared_info64_t s64; // 64-bit view; keeps the union size fixed
  } u;
  volatile kmp_uint32 buffer_index; // selects the dispatch buffer (mod __kmp_dispatch_num_buffers)
#if OMP_45_ENABLED
  volatile kmp_int32 doacross_buf_idx; // teamwise index
  kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
  kmp_int32 doacross_num_done; // count finished threads
#endif
#if KMP_USE_HWLOC
  // When linking with libhwloc, the ORDERED EPCC test slows down on big
  // machines (> 48 cores). Performance analysis showed that a cache thrash
  // was occurring and this padding helps alleviate the problem.
  char padding[64];
#endif
};
163 
164 /* ------------------------------------------------------------------------ */
165 
166 #undef USE_TEST_LOCKS
167 
168 // test_then_add template (general template should NOT be used)
169 template <typename T> static __forceinline T test_then_add(volatile T *p, T d);
170 
171 template <>
172 __forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
173                                                  kmp_int32 d) {
174   kmp_int32 r;
175   r = KMP_TEST_THEN_ADD32(CCAST(kmp_int32 *, p), d);
176   return r;
177 }
178 
179 template <>
180 __forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
181                                                  kmp_int64 d) {
182   kmp_int64 r;
183   r = KMP_TEST_THEN_ADD64(CCAST(kmp_int64 *, p), d);
184   return r;
185 }
186 
187 // test_then_inc_acq template (general template should NOT be used)
188 template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);
189 
190 template <>
191 __forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
192   kmp_int32 r;
193   r = KMP_TEST_THEN_INC_ACQ32(CCAST(kmp_int32 *, p));
194   return r;
195 }
196 
197 template <>
198 __forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
199   kmp_int64 r;
200   r = KMP_TEST_THEN_INC_ACQ64(CCAST(kmp_int64 *, p));
201   return r;
202 }
203 
204 // test_then_inc template (general template should NOT be used)
205 template <typename T> static __forceinline T test_then_inc(volatile T *p);
206 
207 template <>
208 __forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
209   kmp_int32 r;
210   r = KMP_TEST_THEN_INC32(CCAST(kmp_int32 *, p));
211   return r;
212 }
213 
214 template <>
215 __forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
216   kmp_int64 r;
217   r = KMP_TEST_THEN_INC64(CCAST(kmp_int64 *, p));
218   return r;
219 }
220 
// compare_and_swap template (general template should NOT be used):
// store s into *p iff *p == c; the result of the underlying
// KMP_COMPARE_AND_STORE_REL* macro (release semantics) is returned.
template <typename T>
static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);

// 32-bit specialization
template <>
__forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
                                                    kmp_int32 c, kmp_int32 s) {
  return KMP_COMPARE_AND_STORE_REL32(p, c, s);
}

// 64-bit specialization
template <>
__forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
                                                    kmp_int64 c, kmp_int64 s) {
  return KMP_COMPARE_AND_STORE_REL64(p, c, s);
}
236 
/* Spin wait loop that first does pause, then yield.
    Waits until function returns non-zero when called with *spinner and check.
    Does NOT put threads to sleep.
#if USE_ITT_BUILD
    Arguments:
        obj -- is higher-level synchronization object to report to ittnotify.
        It is used to report locks consistently. For example, if lock is
        acquired immediately, its address is reported to ittnotify via
        KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
        immediately and the lock routine calls to KMP_WAIT_YIELD(), the latter
        should report the same address, not an address of low-level spinner.
#endif // USE_ITT_BUILD
*/
template <typename UT>
// ToDo: make inline function (move to header file for icl)
static UT // unsigned 4- or 8-byte type
    __kmp_wait_yield(
        volatile UT *spinner, UT checker,
        kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(
            void *obj) // Higher-level synchronization object, or NULL.
        ) {
  // note: we may not belong to a team at this point
  volatile UT *spin = spinner;
  UT check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(UT, UT) = pred;
  UT r; // last value read from the spinner; returned to the caller

  KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
  KMP_INIT_YIELD(spins);
  // main wait spin loop: re-read *spin until the predicate is satisfied
  while (!f(r = *spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* GEH - remove this since it was accidentally introduced when kmp_wait was
       split. It causes problems with infinite recursion because of exit lock */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
        __kmp_abort_thread(); */

    // if we are oversubscribed, or have waited a bit (and
    // KMP_LIBRARY=throughput), then yield. pause is in the following code
    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
    KMP_YIELD_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}
283 
284 template <typename UT> static kmp_uint32 __kmp_eq(UT value, UT checker) {
285   return value == checker;
286 }
287 
288 template <typename UT> static kmp_uint32 __kmp_neq(UT value, UT checker) {
289   return value != checker;
290 }
291 
292 template <typename UT> static kmp_uint32 __kmp_lt(UT value, UT checker) {
293   return value < checker;
294 }
295 
296 template <typename UT> static kmp_uint32 __kmp_ge(UT value, UT checker) {
297   return value >= checker;
298 }
299 
300 template <typename UT> static kmp_uint32 __kmp_le(UT value, UT checker) {
301   return value <= checker;
302 }
303 
304 /* ------------------------------------------------------------------------ */
305 
306 static void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref,
307                                      ident_t *loc_ref) {
308   kmp_info_t *th;
309 
310   KMP_DEBUG_ASSERT(gtid_ref);
311 
312   if (__kmp_env_consistency_check) {
313     th = __kmp_threads[*gtid_ref];
314     if (th->th.th_root->r.r_active &&
315         (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
316 #if KMP_USE_DYNAMIC_LOCK
317       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
318 #else
319       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
320 #endif
321     }
322   }
323 }
324 
// Ordered-entry ("deo") handler for ordered loops: blocks the calling thread
// until the shared ordered_iteration reaches this thread's ordered_lower,
// i.e. until it is this thread's turn to enter the ordered section.
// Installed as th_deo_fcn by __kmp_dispatch_init when pr->ordered is set.
template <typename UT>
static void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  typedef typename traits_t<UT>::signed_t ST;
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  //    int  cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }

  // Only an active (non-serialized) team has a shared buffer to wait on.
  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    UT lower;

    // pr was already fetched above when consistency checking is on.
    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }
    lower = pr->u.p.ordered_lower;

#if !defined(KMP_GOMP_COMPAT)
    // Entering a second ordered section from the same chunk is a nesting
    // error (ordered_bumped is nonzero after an ordered-section exit).
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB();
#ifdef KMP_DEBUG
    {
      const char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif

    // Spin until ordered_iteration >= our lower bound: our turn to proceed.
    __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
                         __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
    KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
    {
      const char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
  }
  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
}
401 
402 static void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref,
403                                      ident_t *loc_ref) {
404   kmp_info_t *th;
405 
406   if (__kmp_env_consistency_check) {
407     th = __kmp_threads[*gtid_ref];
408     if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
409       __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
410     }
411   }
412 }
413 
// Ordered-exit ("dxo") handler for ordered loops: marks this thread's ordered
// section complete and atomically advances the shared ordered_iteration so the
// next thread waiting in __kmp_dispatch_deo can proceed.
// Installed as th_dxo_fcn by __kmp_dispatch_init when pr->ordered is set.
template <typename UT>
static void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  typedef typename traits_t<UT>::signed_t ST;
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  //    int  cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
      __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
    }
  }

  // Only an active (non-serialized) team has a shared buffer to release.
  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);

    // pr was already fetched above when consistency checking is on.
    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }

    KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped != 0) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        /* How to test it? - OM */
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB(); /* Flush all pending memory write invalidates.  */

    // Record that this chunk has passed its ordered section (checked in deo).
    pr->ordered_bumped += 1;

    KD_TRACE(1000,
             ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
              gtid, pr->ordered_bumped));

    KMP_MB(); /* Flush all pending memory write invalidates.  */

    /* TODO use general release procedure? */
    test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);

    KMP_MB(); /* Flush all pending memory write invalidates.  */
  }
  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
}
473 
474 // Computes and returns x to the power of y, where y must a non-negative integer
475 template <typename UT>
476 static __forceinline long double __kmp_pow(long double x, UT y) {
477   long double s = 1.0L;
478 
479   KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
480   // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
481   while (y) {
482     if (y & 1)
483       s *= x;
484     x *= x;
485     y >>= 1;
486   }
487   return s;
488 }
489 
/* Computes and returns the number of unassigned iterations after idx chunks
   have been assigned (the total number of unassigned iterations in chunks with
   index greater than or equal to idx). __forceinline seems to be broken so that
   if we __forceinline this function, the behavior is wrong
   (one of the unit tests, sch_guided_analytical_basic.cpp, fails) */
template <typename T>
static __inline typename traits_t<T>::unsigned_t
__kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
                                typename traits_t<T>::unsigned_t idx) {
  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at least for
     ICL 8.1, long double arithmetic may not really have long double precision,
     even with /Qlong_double.  Currently, we workaround that in the caller code,
     by manipulating the FPCW for Windows* OS on IA-32 architecture.  The lack
     of precision is not expected to be a correctness issue, though. */
  typedef typename traits_t<T>::unsigned_t UT;

  // remaining = ceil(tc * base^idx), computed in long double
  long double x = tc * __kmp_pow<UT>(base, idx);
  UT r = (UT)x;
  if (x == r) // x is an exact integer: no rounding up needed
    return r;
  return r + 1; // otherwise round up to the next integer
}
512 
// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
//   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// by default n = 2. For example with n = 3 the chunks distribution will be
// more flat.
// With n = 1 first chunk is the same as for static schedule, e.g. trip / nproc.
static int guided_int_param = 2;
static double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
521 
522 // UT - unsigned flavor of T, ST - signed flavor of T,
523 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
524 template <typename T>
525 static void
526 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
527                     T ub, typename traits_t<T>::signed_t st,
528                     typename traits_t<T>::signed_t chunk, int push_ws) {
529   typedef typename traits_t<T>::unsigned_t UT;
530   typedef typename traits_t<T>::signed_t ST;
531   typedef typename traits_t<T>::floating_t DBL;
532 
533   int active;
534   T tc;
535   kmp_info_t *th;
536   kmp_team_t *team;
537   kmp_uint32 my_buffer_index;
538   dispatch_private_info_template<T> *pr;
539   dispatch_shared_info_template<UT> volatile *sh;
540 
541   KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
542                    sizeof(dispatch_private_info));
543   KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
544                    sizeof(dispatch_shared_info));
545 
546   if (!TCR_4(__kmp_init_parallel))
547     __kmp_parallel_initialize();
548 
549 #if INCLUDE_SSC_MARKS
550   SSC_MARK_DISPATCH_INIT();
551 #endif
552 #ifdef KMP_DEBUG
553   {
554     const char *buff;
555     // create format specifiers before the debug output
556     buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
557                             "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
558                             traits_t<ST>::spec, traits_t<T>::spec,
559                             traits_t<T>::spec, traits_t<ST>::spec);
560     KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
561     __kmp_str_free(&buff);
562   }
563 #endif
564   /* setup data */
565   th = __kmp_threads[gtid];
566   team = th->th.th_team;
567   active = !team->t.t_serialized;
568   th->th.th_ident = loc;
569 
570 #if USE_ITT_BUILD
571   kmp_uint64 cur_chunk = chunk;
572   int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
573                                     __kmp_forkjoin_frames_mode == 3 &&
574                                     KMP_MASTER_GTID(gtid) &&
575 #if OMP_40_ENABLED
576                                     th->th.th_teams_microtask == NULL &&
577 #endif
578                                     team->t.t_active_level == 1;
579 #endif
580   if (!active) {
581     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
582         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
583   } else {
584     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
585                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
586 
587     my_buffer_index = th->th.th_dispatch->th_disp_index++;
588 
589     /* What happens when number of threads changes, need to resize buffer? */
590     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
591         &th->th.th_dispatch
592              ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
593     sh = reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
594         &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
595   }
596 
597 #if (KMP_STATIC_STEAL_ENABLED)
598   if (SCHEDULE_HAS_NONMONOTONIC(schedule))
599     // AC: we now have only one implementation of stealing, so use it
600     schedule = kmp_sch_static_steal;
601   else
602 #endif
603     schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
604 
605   /* Pick up the nomerge/ordered bits from the scheduling type */
606   if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
607     pr->nomerge = TRUE;
608     schedule =
609         (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
610   } else {
611     pr->nomerge = FALSE;
612   }
613   pr->type_size = traits_t<T>::type_size; // remember the size of variables
614   if (kmp_ord_lower & schedule) {
615     pr->ordered = TRUE;
616     schedule =
617         (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
618   } else {
619     pr->ordered = FALSE;
620   }
621 
622   if (schedule == kmp_sch_static) {
623     schedule = __kmp_static;
624   } else {
625     if (schedule == kmp_sch_runtime) {
626       // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
627       // not specified)
628       schedule = team->t.t_sched.r_sched_type;
629       // Detail the schedule if needed (global controls are differentiated
630       // appropriately)
631       if (schedule == kmp_sch_guided_chunked) {
632         schedule = __kmp_guided;
633       } else if (schedule == kmp_sch_static) {
634         schedule = __kmp_static;
635       }
636       // Use the chunk size specified by OMP_SCHEDULE (or default if not
637       // specified)
638       chunk = team->t.t_sched.chunk;
639 #if USE_ITT_BUILD
640       cur_chunk = chunk;
641 #endif
642 #ifdef KMP_DEBUG
643       {
644         const char *buff;
645         // create format specifiers before the debug output
646         buff = __kmp_str_format(
647             "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
648             traits_t<ST>::spec);
649         KD_TRACE(10, (buff, gtid, schedule, chunk));
650         __kmp_str_free(&buff);
651       }
652 #endif
653     } else {
654       if (schedule == kmp_sch_guided_chunked) {
655         schedule = __kmp_guided;
656       }
657       if (chunk <= 0) {
658         chunk = KMP_DEFAULT_CHUNK;
659       }
660     }
661 
662     if (schedule == kmp_sch_auto) {
663       // mapping and differentiation: in the __kmp_do_serial_initialize()
664       schedule = __kmp_auto;
665 #ifdef KMP_DEBUG
666       {
667         const char *buff;
668         // create format specifiers before the debug output
669         buff = __kmp_str_format("__kmp_dispatch_init: kmp_sch_auto: T#%%d new: "
670                                 "schedule:%%d chunk:%%%s\n",
671                                 traits_t<ST>::spec);
672         KD_TRACE(10, (buff, gtid, schedule, chunk));
673         __kmp_str_free(&buff);
674       }
675 #endif
676     }
677 
678     /* guided analytical not safe for too many threads */
679     if (schedule == kmp_sch_guided_analytical_chunked &&
680         th->th.th_team_nproc > 1 << 20) {
681       schedule = kmp_sch_guided_iterative_chunked;
682       KMP_WARNING(DispatchManyThreads);
683     }
684     if (schedule == kmp_sch_runtime_simd) {
685       // compiler provides simd_width in the chunk parameter
686       schedule = team->t.t_sched.r_sched_type;
687       // Detail the schedule if needed (global controls are differentiated
688       // appropriately)
689       if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
690           schedule == __kmp_static) {
691         schedule = kmp_sch_static_balanced_chunked;
692       } else {
693         if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
694           schedule = kmp_sch_guided_simd;
695         }
696         chunk = team->t.t_sched.chunk * chunk;
697       }
698 #if USE_ITT_BUILD
699       cur_chunk = chunk;
700 #endif
701 #ifdef KMP_DEBUG
702       {
703         const char *buff;
704         // create format specifiers before the debug output
705         buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d"
706                                 " chunk:%%%s\n",
707                                 traits_t<ST>::spec);
708         KD_TRACE(10, (buff, gtid, schedule, chunk));
709         __kmp_str_free(&buff);
710       }
711 #endif
712     }
713     pr->u.p.parm1 = chunk;
714   }
715   KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
716               "unknown scheduling type");
717 
718   pr->u.p.count = 0;
719 
720   if (__kmp_env_consistency_check) {
721     if (st == 0) {
722       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
723                             (pr->ordered ? ct_pdo_ordered : ct_pdo), loc);
724     }
725   }
726   // compute trip count
727   if (st == 1) { // most common case
728     if (ub >= lb) {
729       tc = ub - lb + 1;
730     } else { // ub < lb
731       tc = 0; // zero-trip
732     }
733   } else if (st < 0) {
734     if (lb >= ub) {
735       // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
736       // where the division needs to be unsigned regardless of the result type
737       tc = (UT)(lb - ub) / (-st) + 1;
738     } else { // lb < ub
739       tc = 0; // zero-trip
740     }
741   } else { // st > 0
742     if (ub >= lb) {
743       // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
744       // where the division needs to be unsigned regardless of the result type
745       tc = (UT)(ub - lb) / st + 1;
746     } else { // ub < lb
747       tc = 0; // zero-trip
748     }
749   }
750 
751   // Any half-decent optimizer will remove this test when the blocks are empty
752   // since the macros expand to nothing when statistics are disabled.
753   if (schedule == __kmp_static) {
754     KMP_COUNT_BLOCK(OMP_FOR_static);
755     KMP_COUNT_VALUE(FOR_static_iterations, tc);
756   } else {
757     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
758     KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
759   }
760 
761   pr->u.p.lb = lb;
762   pr->u.p.ub = ub;
763   pr->u.p.st = st;
764   pr->u.p.tc = tc;
765 
766 #if KMP_OS_WINDOWS
767   pr->u.p.last_upper = ub + st;
768 #endif /* KMP_OS_WINDOWS */
769 
770   /* NOTE: only the active parallel region(s) has active ordered sections */
771 
772   if (active) {
773     if (pr->ordered == 0) {
774       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
775       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
776     } else {
777       pr->ordered_bumped = 0;
778 
779       pr->u.p.ordered_lower = 1;
780       pr->u.p.ordered_upper = 0;
781 
782       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
783       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
784     }
785   }
786 
787   if (__kmp_env_consistency_check) {
788     enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
789     if (push_ws) {
790       __kmp_push_workshare(gtid, ws, loc);
791       pr->pushed_ws = ws;
792     } else {
793       __kmp_check_workshare(gtid, ws, loc);
794       pr->pushed_ws = ct_none;
795     }
796   }
797 
798   switch (schedule) {
799 #if (KMP_STATIC_STEAL_ENABLED)
800   case kmp_sch_static_steal: {
801     T nproc = th->th.th_team_nproc;
802     T ntc, init;
803 
804     KD_TRACE(100,
805              ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid));
806 
807     ntc = (tc % chunk ? 1 : 0) + tc / chunk;
808     if (nproc > 1 && ntc >= nproc) {
809       KMP_COUNT_BLOCK(OMP_FOR_static_steal);
810       T id = __kmp_tid_from_gtid(gtid);
811       T small_chunk, extras;
812 
813       small_chunk = ntc / nproc;
814       extras = ntc % nproc;
815 
816       init = id * small_chunk + (id < extras ? id : extras);
817       pr->u.p.count = init;
818       pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
819 
820       pr->u.p.parm2 = lb;
821       // pr->pfields.parm3 = 0; // it's not used in static_steal
822       pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
823       pr->u.p.st = st;
824       if (traits_t<T>::type_size > 4) {
825         // AC: TODO: check if 16-byte CAS available and use it to
826         // improve performance (probably wait for explicit request
827         // before spending time on this).
828         // For now use dynamically allocated per-thread lock,
829         // free memory in __kmp_dispatch_next when status==0.
830         KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
831         th->th.th_dispatch->th_steal_lock =
832             (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
833         __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
834       }
835       break;
836     } else {
837       KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
838                      "kmp_sch_static_balanced\n",
839                      gtid));
840       schedule = kmp_sch_static_balanced;
841       /* too few iterations: fall-through to kmp_sch_static_balanced */
842     } // if
843     /* FALL-THROUGH to static balanced */
844   } // case
845 #endif
846   case kmp_sch_static_balanced: {
847     T nproc = th->th.th_team_nproc;
848     T init, limit;
849 
850     KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
851                    gtid));
852 
853     if (nproc > 1) {
854       T id = __kmp_tid_from_gtid(gtid);
855 
856       if (tc < nproc) {
857         if (id < tc) {
858           init = id;
859           limit = id;
860           pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
861         } else {
862           pr->u.p.count = 1; /* means no more chunks to execute */
863           pr->u.p.parm1 = FALSE;
864           break;
865         }
866       } else {
867         T small_chunk = tc / nproc;
868         T extras = tc % nproc;
869         init = id * small_chunk + (id < extras ? id : extras);
870         limit = init + small_chunk - (id < extras ? 0 : 1);
871         pr->u.p.parm1 = (id == nproc - 1);
872       }
873     } else {
874       if (tc > 0) {
875         init = 0;
876         limit = tc - 1;
877         pr->u.p.parm1 = TRUE;
878       } else { // zero trip count
879         pr->u.p.count = 1; /* means no more chunks to execute */
880         pr->u.p.parm1 = FALSE;
881         break;
882       }
883     }
884 #if USE_ITT_BUILD
885     // Calculate chunk for metadata report
886     if (itt_need_metadata_reporting)
887       cur_chunk = limit - init + 1;
888 #endif
889     if (st == 1) {
890       pr->u.p.lb = lb + init;
891       pr->u.p.ub = lb + limit;
892     } else {
893       // calculated upper bound, "ub" is user-defined upper bound
894       T ub_tmp = lb + limit * st;
895       pr->u.p.lb = lb + init * st;
896       // adjust upper bound to "ub" if needed, so that MS lastprivate will match
897       // it exactly
898       if (st > 0) {
899         pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
900       } else {
901         pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
902       }
903     }
904     if (pr->ordered) {
905       pr->u.p.ordered_lower = init;
906       pr->u.p.ordered_upper = limit;
907     }
908     break;
909   } // case
910   case kmp_sch_static_balanced_chunked: {
911     // similar to balanced, but chunk adjusted to multiple of simd width
912     T nth = th->th.th_team_nproc;
913     KD_TRACE(100, ("__kmp_dispatch_init: T#%d runtime(simd:static)"
914                    " -> falling-through to static_greedy\n",
915                    gtid));
916     schedule = kmp_sch_static_greedy;
917     if (nth > 1)
918       pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
919     else
920       pr->u.p.parm1 = tc;
921     break;
922   } // case
923   case kmp_sch_guided_iterative_chunked:
924   case kmp_sch_guided_simd: {
925     T nproc = th->th.th_team_nproc;
926     KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked"
927                    " case\n",
928                    gtid));
929 
930     if (nproc > 1) {
931       if ((2L * chunk + 1) * nproc >= tc) {
932         /* chunk size too large, switch to dynamic */
933         schedule = kmp_sch_dynamic_chunked;
934       } else {
935         // when remaining iters become less than parm2 - switch to dynamic
936         pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
937         *(double *)&pr->u.p.parm3 =
938             guided_flt_param / nproc; // may occupy parm3 and parm4
939       }
940     } else {
941       KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
942                      "kmp_sch_static_greedy\n",
943                      gtid));
944       schedule = kmp_sch_static_greedy;
945       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
946       KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",
947                      gtid));
948       pr->u.p.parm1 = tc;
949     } // if
950   } // case
951   break;
952   case kmp_sch_guided_analytical_chunked: {
953     T nproc = th->th.th_team_nproc;
954     KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked"
955                    " case\n",
956                    gtid));
957     if (nproc > 1) {
958       if ((2L * chunk + 1) * nproc >= tc) {
959         /* chunk size too large, switch to dynamic */
960         schedule = kmp_sch_dynamic_chunked;
961       } else {
962         /* commonly used term: (2 nproc - 1)/(2 nproc) */
963         DBL x;
964 
965 #if KMP_OS_WINDOWS && KMP_ARCH_X86
966         /* Linux* OS already has 64-bit computation by default for long double,
967            and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
968            Windows* OS on IA-32 architecture, we need to set precision to 64-bit
969            instead of the default 53-bit. Even though long double doesn't work
970            on Windows* OS on Intel(R) 64, the resulting lack of precision is not
971            expected to impact the correctness of the algorithm, but this has not
972            been mathematically proven. */
973         // save original FPCW and set precision to 64-bit, as
974         // Windows* OS on IA-32 architecture defaults to 53-bit
975         unsigned int oldFpcw = _control87(0, 0);
976         _control87(_PC_64, _MCW_PC); // 0,0x30000
977 #endif
978         /* value used for comparison in solver for cross-over point */
979         long double target = ((long double)chunk * 2 + 1) * nproc / tc;
980 
981         /* crossover point--chunk indexes equal to or greater than
982            this point switch to dynamic-style scheduling */
983         UT cross;
984 
985         /* commonly used term: (2 nproc - 1)/(2 nproc) */
986         x = (long double)1.0 - (long double)0.5 / nproc;
987 
988 #ifdef KMP_DEBUG
989         { // test natural alignment
990           struct _test_a {
991             char a;
992             union {
993               char b;
994               DBL d;
995             };
996           } t;
997           ptrdiff_t natural_alignment =
998               (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
999           //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
1000           // long)natural_alignment );
1001           KMP_DEBUG_ASSERT(
1002               (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
1003         }
1004 #endif // KMP_DEBUG
1005 
1006         /* save the term in thread private dispatch structure */
1007         *(DBL *)&pr->u.p.parm3 = x;
1008 
1009         /* solve for the crossover point to the nearest integer i for which C_i
1010            <= chunk */
1011         {
1012           UT left, right, mid;
1013           long double p;
1014 
1015           /* estimate initial upper and lower bound */
1016 
1017           /* doesn't matter what value right is as long as it is positive, but
1018              it affects performance of the solver */
1019           right = 229;
1020           p = __kmp_pow<UT>(x, right);
1021           if (p > target) {
1022             do {
1023               p *= p;
1024               right <<= 1;
1025             } while (p > target && right < (1 << 27));
1026             /* lower bound is previous (failed) estimate of upper bound */
1027             left = right >> 1;
1028           } else {
1029             left = 0;
1030           }
1031 
1032           /* bisection root-finding method */
1033           while (left + 1 < right) {
1034             mid = (left + right) / 2;
1035             if (__kmp_pow<UT>(x, mid) > target) {
1036               left = mid;
1037             } else {
1038               right = mid;
1039             }
1040           } // while
1041           cross = right;
1042         }
1043         /* assert sanity of computed crossover point */
1044         KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
1045                    __kmp_pow<UT>(x, cross) <= target);
1046 
1047         /* save the crossover point in thread private dispatch structure */
1048         pr->u.p.parm2 = cross;
1049 
1050 // C75803
1051 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
1052 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
1053 #else
1054 #define GUIDED_ANALYTICAL_WORKAROUND (x)
1055 #endif
1056         /* dynamic-style scheduling offset */
1057         pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
1058                                  tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
1059                         cross * chunk;
1060 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1061         // restore FPCW
1062         _control87(oldFpcw, _MCW_PC);
1063 #endif
1064       } // if
1065     } else {
1066       KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
1067                      "kmp_sch_static_greedy\n",
1068                      gtid));
1069       schedule = kmp_sch_static_greedy;
1070       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1071       pr->u.p.parm1 = tc;
1072     } // if
1073   } // case
1074   break;
1075   case kmp_sch_static_greedy:
1076     KD_TRACE(100,
1077              ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n", gtid));
1078     pr->u.p.parm1 = (th->th.th_team_nproc > 1)
1079                         ? (tc + th->th.th_team_nproc - 1) / th->th.th_team_nproc
1080                         : tc;
1081     break;
1082   case kmp_sch_static_chunked:
1083   case kmp_sch_dynamic_chunked:
1084     if (pr->u.p.parm1 <= 0) {
1085       pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
1086     }
1087     KD_TRACE(100, ("__kmp_dispatch_init: T#%d "
1088                    "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
1089                    gtid));
1090     break;
1091   case kmp_sch_trapezoidal: {
1092     /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1093 
1094     T parm1, parm2, parm3, parm4;
1095     KD_TRACE(100,
1096              ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid));
1097 
1098     parm1 = chunk;
1099 
1100     /* F : size of the first cycle */
1101     parm2 = (tc / (2 * th->th.th_team_nproc));
1102 
1103     if (parm2 < 1) {
1104       parm2 = 1;
1105     }
1106 
1107     /* L : size of the last cycle.  Make sure the last cycle is not larger
1108        than the first cycle. */
1109     if (parm1 < 1) {
1110       parm1 = 1;
1111     } else if (parm1 > parm2) {
1112       parm1 = parm2;
1113     }
1114 
1115     /* N : number of cycles */
1116     parm3 = (parm2 + parm1);
1117     parm3 = (2 * tc + parm3 - 1) / parm3;
1118 
1119     if (parm3 < 2) {
1120       parm3 = 2;
1121     }
1122 
1123     /* sigma : decreasing incr of the trapezoid */
1124     parm4 = (parm3 - 1);
1125     parm4 = (parm2 - parm1) / parm4;
1126 
1127     // pointless check, because parm4 >= 0 always
1128     // if ( parm4 < 0 ) {
1129     //    parm4 = 0;
1130     //}
1131 
1132     pr->u.p.parm1 = parm1;
1133     pr->u.p.parm2 = parm2;
1134     pr->u.p.parm3 = parm3;
1135     pr->u.p.parm4 = parm4;
1136   } // case
1137   break;
1138 
1139   default: {
1140     __kmp_msg(kmp_ms_fatal, // Severity
1141               KMP_MSG(UnknownSchedTypeDetected), // Primary message
1142               KMP_HNT(GetNewerLibrary), // Hint
1143               __kmp_msg_null // Variadic argument list terminator
1144               );
1145   } break;
1146   } // switch
1147   pr->schedule = schedule;
1148   if (active) {
1149     /* The name of this buffer should be my_buffer_index when it's free to use
1150      * it */
1151 
1152     KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
1153                    "sh->buffer_index:%d\n",
1154                    gtid, my_buffer_index, sh->buffer_index));
1155     __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index,
1156                                  __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
1157     // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and
1158     // my_buffer_index are *always* 32-bit integers.
1159     KMP_MB(); /* is this necessary? */
1160     KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
1161                    "sh->buffer_index:%d\n",
1162                    gtid, my_buffer_index, sh->buffer_index));
1163 
1164     th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
1165     th->th.th_dispatch->th_dispatch_sh_current =
1166         RCAST(dispatch_shared_info_t *,
1167               CCAST(dispatch_shared_info_template<UT> *, sh));
1168 #if USE_ITT_BUILD
1169     if (pr->ordered) {
1170       __kmp_itt_ordered_init(gtid);
1171     }; // if
1172     // Report loop metadata
1173     if (itt_need_metadata_reporting) {
1174       // Only report metadata by master of active team at level 1
1175       kmp_uint64 schedtype = 0;
1176       switch (schedule) {
1177       case kmp_sch_static_chunked:
1178       case kmp_sch_static_balanced: // Chunk is calculated in the switch above
1179         break;
1180       case kmp_sch_static_greedy:
1181         cur_chunk = pr->u.p.parm1;
1182         break;
1183       case kmp_sch_dynamic_chunked:
1184         schedtype = 1;
1185         break;
1186       case kmp_sch_guided_iterative_chunked:
1187       case kmp_sch_guided_analytical_chunked:
1188       case kmp_sch_guided_simd:
1189         schedtype = 2;
1190         break;
1191       default:
1192         // Should we put this case under "static"?
1193         // case kmp_sch_static_steal:
1194         schedtype = 3;
1195         break;
1196       }
1197       __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1198     }
1199 #endif /* USE_ITT_BUILD */
1200   }; // if
1201 
1202 #ifdef KMP_DEBUG
1203   {
1204     const char *buff;
1205     // create format specifiers before the debug output
1206     buff = __kmp_str_format(
1207         "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
1208         "lb:%%%s ub:%%%s"
1209         " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
1210         " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1211         traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
1212         traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1213         traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
1214         traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
1215     KD_TRACE(10, (buff, gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1216                   pr->u.p.st, pr->u.p.tc, pr->u.p.count, pr->u.p.ordered_lower,
1217                   pr->u.p.ordered_upper, pr->u.p.parm1, pr->u.p.parm2,
1218                   pr->u.p.parm3, pr->u.p.parm4));
1219     __kmp_str_free(&buff);
1220   }
1221 #endif
1222 #if (KMP_STATIC_STEAL_ENABLED)
1223   // It cannot be guaranteed that after execution of a loop with some other
1224   // schedule kind all the parm3 variables will contain the same value. Even if
1225   // all parm3 will be the same, it still exists a bad case like using 0 and 1
1226   // rather than program life-time increment. So the dedicated variable is
1227   // required. The 'static_steal_counter' is used.
1228   if (schedule == kmp_sch_static_steal) {
1229     // Other threads will inspect this variable when searching for a victim.
1230     // This is a flag showing that other threads may steal from this thread
1231     // since then.
1232     volatile T *p = &pr->u.p.static_steal_counter;
1233     *p = *p + 1;
1234   }
1235 #endif // ( KMP_STATIC_STEAL_ENABLED )
1236 
1237 #if OMPT_SUPPORT && OMPT_TRACE
1238   if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
1239     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1240     ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
1241     ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
1242         team_info->parallel_id, task_info->task_id, team_info->microtask);
1243   }
1244 #endif
1245 }
1246 
1247 /* For ordered loops, either __kmp_dispatch_finish() should be called after
1248  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1249  * every chunk of iterations.  If the ordered section(s) were not executed
1250  * for this iteration (or every iteration in this chunk), we need to set the
1251  * ordered iteration counters so that the next thread can proceed. */
template <typename UT>
static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {

    // This thread's current private and shared dispatch buffers for the loop.
    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    if (pr->ordered_bumped) {
      // The ordered section already advanced the shared iteration counter for
      // this iteration; just clear the flag so subsequent iterations behave
      // normally.
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      // The ordered section was not executed for this iteration, so this
      // thread must advance the shared counter itself once its turn comes,
      // otherwise the next thread in sequence would wait forever.
      UT lower = pr->u.p.ordered_lower;

#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      // Block until the shared ordered iteration count reaches this thread's
      // lower bound, i.e. all preceding iterations have taken their turn.
      __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
                           __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
      KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      // Atomically advance the shared counter to release the next iteration's
      // owner.
      test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
    } // if
  } // if
  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
}
1312 
1313 #ifdef KMP_GOMP_COMPAT
1314 
template <typename UT>
static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {
    //        int cid;
    // This thread's current private and shared dispatch buffers for the loop.
    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
    UT lower = pr->u.p.ordered_lower;
    UT upper = pr->u.p.ordered_upper;
    UT inc = upper - lower + 1; // number of iterations in the current chunk

    if (pr->ordered_bumped == inc) {
      // Every iteration in this chunk already executed its ordered section
      // and advanced the shared counter; just clear the bump count.
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      // Some iterations in the chunk skipped their ordered section; advance
      // the shared counter by the remaining amount once it is our turn.
      inc -= pr->ordered_bumped;

#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d before wait: "
            "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      // Block until the shared ordered iteration count reaches this chunk's
      // lower bound, i.e. all preceding iterations have taken their turn.
      __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
                           __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));

      KMP_MB(); /* is this necessary? */
      KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
                      "ordered_bumped to zero\n",
                      gtid));
      pr->ordered_bumped = 0;
//!!!!! TODO check if the inc should be unsigned, or signed???
#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d after wait: "
            "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
            traits_t<UT>::spec);
        KD_TRACE(1000,
                 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      // Atomically advance the shared counter past this whole chunk so the
      // next thread in sequence can proceed.
      test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
    }
    //        }
  }
  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
}
1391 
1392 #endif /* KMP_GOMP_COMPAT */
1393 
/* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
   work), then tell OMPT that the loop is over by firing the
   ompt_event_loop_end callback. This is done here because in some cases
   kmp_dispatch_fini() is not called, so this exit path is the only place the
   end-of-loop event can be reported. Expands to a no-op when OMPT tracing is
   compiled out. */
#if OMPT_SUPPORT && OMPT_TRACE
#define OMPT_LOOP_END                                                          \
  if (status == 0) {                                                           \
    if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_end)) {   \
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
      ompt_task_info_t *task_info = __ompt_get_taskinfo(0);                    \
      ompt_callbacks.ompt_callback(ompt_event_loop_end)(                       \
          team_info->parallel_id, task_info->task_id);                         \
    }                                                                          \
  }
#else
#define OMPT_LOOP_END // no-op
#endif
1410 
1411 template <typename T>
1412 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1413                                T *p_lb, T *p_ub,
1414                                typename traits_t<T>::signed_t *p_st) {
1415 
1416   typedef typename traits_t<T>::unsigned_t UT;
1417   typedef typename traits_t<T>::signed_t ST;
1418   typedef typename traits_t<T>::floating_t DBL;
1419 
1420   // This is potentially slightly misleading, schedule(runtime) will appear here
  // even if the actual runtime schedule is static. (Which points out a
  // disadvantage of schedule(runtime): even when static scheduling is used it
1423   // costs more than a compile time choice to use static scheduling would.)
1424   KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling);
1425 
1426   int status;
1427   dispatch_private_info_template<T> *pr;
1428   kmp_info_t *th = __kmp_threads[gtid];
1429   kmp_team_t *team = th->th.th_team;
1430 
1431   KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1432 #ifdef KMP_DEBUG
1433   {
1434     const char *buff;
1435     // create format specifiers before the debug output
1436     buff = __kmp_str_format("__kmp_dispatch_next: T#%%d called p_lb:%%%s "
1437                             "p_ub:%%%s p_st:%%%s p_last: %%p\n",
1438                             traits_t<T>::spec, traits_t<T>::spec,
1439                             traits_t<ST>::spec);
1440     KD_TRACE(1000, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last));
1441     __kmp_str_free(&buff);
1442   }
1443 #endif
1444 
1445   if (team->t.t_serialized) {
    /* NOTE: serialize this dispatch because we are not at the active level */
1447     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1448         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1449     KMP_DEBUG_ASSERT(pr);
1450 
1451     if ((status = (pr->u.p.tc != 0)) == 0) {
1452       *p_lb = 0;
1453       *p_ub = 0;
1454       //            if ( p_last != NULL )
1455       //                *p_last = 0;
1456       if (p_st != NULL)
1457         *p_st = 0;
1458       if (__kmp_env_consistency_check) {
1459         if (pr->pushed_ws != ct_none) {
1460           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1461         }
1462       }
1463     } else if (pr->nomerge) {
1464       kmp_int32 last;
1465       T start;
1466       UT limit, trip, init;
1467       ST incr;
1468       T chunk = pr->u.p.parm1;
1469 
1470       KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1471                      gtid));
1472 
1473       init = chunk * pr->u.p.count++;
1474       trip = pr->u.p.tc - 1;
1475 
1476       if ((status = (init <= trip)) == 0) {
1477         *p_lb = 0;
1478         *p_ub = 0;
1479         //                if ( p_last != NULL )
1480         //                    *p_last = 0;
1481         if (p_st != NULL)
1482           *p_st = 0;
1483         if (__kmp_env_consistency_check) {
1484           if (pr->pushed_ws != ct_none) {
1485             pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1486           }
1487         }
1488       } else {
1489         start = pr->u.p.lb;
1490         limit = chunk + init - 1;
1491         incr = pr->u.p.st;
1492 
1493         if ((last = (limit >= trip)) != 0) {
1494           limit = trip;
1495 #if KMP_OS_WINDOWS
1496           pr->u.p.last_upper = pr->u.p.ub;
1497 #endif /* KMP_OS_WINDOWS */
1498         }
1499         if (p_last != NULL)
1500           *p_last = last;
1501         if (p_st != NULL)
1502           *p_st = incr;
1503         if (incr == 1) {
1504           *p_lb = start + init;
1505           *p_ub = start + limit;
1506         } else {
1507           *p_lb = start + init * incr;
1508           *p_ub = start + limit * incr;
1509         }
1510 
1511         if (pr->ordered) {
1512           pr->u.p.ordered_lower = init;
1513           pr->u.p.ordered_upper = limit;
1514 #ifdef KMP_DEBUG
1515           {
1516             const char *buff;
1517             // create format specifiers before the debug output
1518             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1519                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
1520                                     traits_t<UT>::spec, traits_t<UT>::spec);
1521             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1522                             pr->u.p.ordered_upper));
1523             __kmp_str_free(&buff);
1524           }
1525 #endif
1526         } // if
1527       } // if
1528     } else {
1529       pr->u.p.tc = 0;
1530       *p_lb = pr->u.p.lb;
1531       *p_ub = pr->u.p.ub;
1532 #if KMP_OS_WINDOWS
1533       pr->u.p.last_upper = *p_ub;
1534 #endif /* KMP_OS_WINDOWS */
1535       if (p_last != NULL)
1536         *p_last = TRUE;
1537       if (p_st != NULL)
1538         *p_st = pr->u.p.st;
1539     } // if
1540 #ifdef KMP_DEBUG
1541     {
1542       const char *buff;
1543       // create format specifiers before the debug output
1544       buff = __kmp_str_format(
1545           "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
1546           "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
1547           traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1548       KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
1549       __kmp_str_free(&buff);
1550     }
1551 #endif
1552 #if INCLUDE_SSC_MARKS
1553     SSC_MARK_DISPATCH_NEXT();
1554 #endif
1555     OMPT_LOOP_END;
1556     return status;
1557   } else {
1558     kmp_int32 last = 0;
1559     dispatch_shared_info_template<UT> *sh;
1560     T start;
1561     ST incr;
1562     UT limit, trip, init;
1563 
1564     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1565                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1566 
1567     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1568         th->th.th_dispatch->th_dispatch_pr_current);
1569     KMP_DEBUG_ASSERT(pr);
1570     sh = reinterpret_cast<dispatch_shared_info_template<UT> *>(
1571         th->th.th_dispatch->th_dispatch_sh_current);
1572     KMP_DEBUG_ASSERT(sh);
1573 
1574     if (pr->u.p.tc == 0) {
1575       // zero trip count
1576       status = 0;
1577     } else {
1578       switch (pr->schedule) {
1579 #if (KMP_STATIC_STEAL_ENABLED)
1580       case kmp_sch_static_steal: {
1581         T chunk = pr->u.p.parm1;
1582         int nproc = th->th.th_team_nproc;
1583 
1584         KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n",
1585                        gtid));
1586 
1587         trip = pr->u.p.tc - 1;
1588 
1589         if (traits_t<T>::type_size > 4) {
1590           // use lock for 8-byte and CAS for 4-byte induction
1591           // variable. TODO (optional): check and use 16-byte CAS
1592           kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
1593           KMP_DEBUG_ASSERT(lck != NULL);
1594           if (pr->u.p.count < (UT)pr->u.p.ub) {
1595             __kmp_acquire_lock(lck, gtid);
1596             // try to get own chunk of iterations
1597             init = (pr->u.p.count)++;
1598             status = (init < (UT)pr->u.p.ub);
1599             __kmp_release_lock(lck, gtid);
1600           } else {
1601             status = 0; // no own chunks
1602           }
1603           if (!status) { // try to steal
1604             kmp_info_t **other_threads = team->t.t_threads;
1605             int while_limit = nproc; // nproc attempts to find a victim
1606             int while_index = 0;
1607             // TODO: algorithm of searching for a victim
1608             // should be cleaned up and measured
1609             while ((!status) && (while_limit != ++while_index)) {
1610               T remaining;
1611               T victimIdx = pr->u.p.parm4;
1612               T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1613               dispatch_private_info_template<T> *victim =
1614                   reinterpret_cast<dispatch_private_info_template<T> *>(
1615                       other_threads[victimIdx]
1616                           ->th.th_dispatch->th_dispatch_pr_current);
1617               while ((victim == NULL || victim == pr ||
1618                       (*(volatile T *)&victim->u.p.static_steal_counter !=
1619                        *(volatile T *)&pr->u.p.static_steal_counter)) &&
1620                      oldVictimIdx != victimIdx) {
1621                 victimIdx = (victimIdx + 1) % nproc;
1622                 victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1623                     other_threads[victimIdx]
1624                         ->th.th_dispatch->th_dispatch_pr_current);
1625               };
1626               if (!victim ||
1627                   (*(volatile T *)&victim->u.p.static_steal_counter !=
1628                    *(volatile T *)&pr->u.p.static_steal_counter)) {
1629                 continue; // try once more (nproc attempts in total)
1630                 // no victim is ready yet to participate in stealing
1631                 // because all victims are still in kmp_init_dispatch
1632               }
1633               if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1634                 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1635                 continue; // not enough chunks to steal, goto next victim
1636               }
1637 
1638               lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1639               KMP_ASSERT(lck != NULL);
1640               __kmp_acquire_lock(lck, gtid);
1641               limit = victim->u.p.ub; // keep initial ub
1642               if (victim->u.p.count >= limit ||
1643                   (remaining = limit - victim->u.p.count) < 2) {
1644                 __kmp_release_lock(lck, gtid);
1645                 pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1646                 continue; // not enough chunks to steal
1647               }
              // stealing succeeded, reduce victim's ub by 1/4 of undone chunks
              // or by 1
1650               if (remaining > 3) {
1651                 KMP_COUNT_VALUE(FOR_static_steal_stolen, remaining >> 2);
1652                 init = (victim->u.p.ub -=
1653                         (remaining >> 2)); // steal 1/4 of remaining
1654               } else {
1655                 KMP_COUNT_VALUE(FOR_static_steal_stolen, 1);
1656                 init =
1657                     (victim->u.p.ub -= 1); // steal 1 chunk of 2 or 3 remaining
1658               }
1659               __kmp_release_lock(lck, gtid);
1660 
1661               KMP_DEBUG_ASSERT(init + 1 <= limit);
1662               pr->u.p.parm4 = victimIdx; // remember victim to steal from
1663               status = 1;
1664               while_index = 0;
1665               // now update own count and ub with stolen range but init chunk
1666               __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1667               pr->u.p.count = init + 1;
1668               pr->u.p.ub = limit;
1669               __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1670             } // while (search for victim)
1671           } // if (try to find victim and steal)
1672         } else {
1673           // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1674           typedef union {
1675             struct {
1676               UT count;
1677               T ub;
1678             } p;
1679             kmp_int64 b;
1680           } union_i4;
1681           // All operations on 'count' or 'ub' must be combined atomically
1682           // together.
1683           {
1684             union_i4 vold, vnew;
1685             vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1686             vnew = vold;
1687             vnew.p.count++;
1688             while (!KMP_COMPARE_AND_STORE_ACQ64(
1689                 (volatile kmp_int64 *)&pr->u.p.count,
1690                 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1691                 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1692               KMP_CPU_PAUSE();
1693               vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1694               vnew = vold;
1695               vnew.p.count++;
1696             }
1697             vnew = vold;
1698             init = vnew.p.count;
1699             status = (init < (UT)vnew.p.ub);
1700           }
1701 
1702           if (!status) {
1703             kmp_info_t **other_threads = team->t.t_threads;
1704             int while_limit = nproc; // nproc attempts to find a victim
1705             int while_index = 0;
1706 
1707             // TODO: algorithm of searching for a victim
1708             // should be cleaned up and measured
1709             while ((!status) && (while_limit != ++while_index)) {
1710               union_i4 vold, vnew;
1711               kmp_int32 remaining;
1712               T victimIdx = pr->u.p.parm4;
1713               T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1714               dispatch_private_info_template<T> *victim =
1715                   reinterpret_cast<dispatch_private_info_template<T> *>(
1716                       other_threads[victimIdx]
1717                           ->th.th_dispatch->th_dispatch_pr_current);
1718               while ((victim == NULL || victim == pr ||
1719                       (*(volatile T *)&victim->u.p.static_steal_counter !=
1720                        *(volatile T *)&pr->u.p.static_steal_counter)) &&
1721                      oldVictimIdx != victimIdx) {
1722                 victimIdx = (victimIdx + 1) % nproc;
1723                 victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1724                     other_threads[victimIdx]
1725                         ->th.th_dispatch->th_dispatch_pr_current);
1726               };
1727               if (!victim ||
1728                   (*(volatile T *)&victim->u.p.static_steal_counter !=
1729                    *(volatile T *)&pr->u.p.static_steal_counter)) {
1730                 continue; // try once more (nproc attempts in total)
1731                 // no victim is ready yet to participate in stealing
1732                 // because all victims are still in kmp_init_dispatch
1733               }
1734               pr->u.p.parm4 = victimIdx; // new victim found
1735               while (1) { // CAS loop if victim has enough chunks to steal
1736                 vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1737                 vnew = vold;
1738 
1739                 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1740                 if (vnew.p.count >= (UT)vnew.p.ub ||
1741                     (remaining = vnew.p.ub - vnew.p.count) < 2) {
1742                   pr->u.p.parm4 =
1743                       (victimIdx + 1) % nproc; // shift start victim id
1744                   break; // not enough chunks to steal, goto next victim
1745                 }
1746                 if (remaining > 3) {
1747                   vnew.p.ub -= (remaining >> 2); // try to steal 1/4 remaining
1748                 } else {
1749                   vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1750                 }
1751                 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1752                 // TODO: Should this be acquire or release?
1753                 if (KMP_COMPARE_AND_STORE_ACQ64(
1754                         (volatile kmp_int64 *)&victim->u.p.count,
1755                         *VOLATILE_CAST(kmp_int64 *) & vold.b,
1756                         *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1757                   // stealing succeeded
1758                   KMP_COUNT_VALUE(FOR_static_steal_stolen,
1759                                   vold.p.ub - vnew.p.ub);
1760                   status = 1;
1761                   while_index = 0;
1762                   // now update own count and ub
1763                   init = vnew.p.ub;
1764                   vold.p.count = init + 1;
1765 #if KMP_ARCH_X86
1766                   KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count),
1767                                    vold.b);
1768 #else
1769                   *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1770 #endif
1771                   break;
1772                 } // if (check CAS result)
1773                 KMP_CPU_PAUSE(); // CAS failed, repeat attempt
1774               } // while (try to steal from particular victim)
1775             } // while (search for victim)
1776           } // if (try to find victim and steal)
1777         } // if (4-byte induction variable)
1778         if (!status) {
1779           *p_lb = 0;
1780           *p_ub = 0;
1781           if (p_st != NULL)
1782             *p_st = 0;
1783         } else {
1784           start = pr->u.p.parm2;
1785           init *= chunk;
1786           limit = chunk + init - 1;
1787           incr = pr->u.p.st;
1788           KMP_COUNT_VALUE(FOR_static_steal_chunks, 1);
1789 
1790           KMP_DEBUG_ASSERT(init <= trip);
1791           if ((last = (limit >= trip)) != 0)
1792             limit = trip;
1793           if (p_st != NULL)
1794             *p_st = incr;
1795 
1796           if (incr == 1) {
1797             *p_lb = start + init;
1798             *p_ub = start + limit;
1799           } else {
1800             *p_lb = start + init * incr;
1801             *p_ub = start + limit * incr;
1802           }
1803 
1804           if (pr->ordered) {
1805             pr->u.p.ordered_lower = init;
1806             pr->u.p.ordered_upper = limit;
1807 #ifdef KMP_DEBUG
1808             {
1809               const char *buff;
1810               // create format specifiers before the debug output
1811               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1812                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
1813                                       traits_t<UT>::spec, traits_t<UT>::spec);
1814               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1815                               pr->u.p.ordered_upper));
1816               __kmp_str_free(&buff);
1817             }
1818 #endif
1819           } // if
1820         } // if
1821         break;
1822       } // case
1823 #endif // ( KMP_STATIC_STEAL_ENABLED )
1824       case kmp_sch_static_balanced: {
1825         KD_TRACE(
1826             100,
1827             ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid));
1828         if ((status = !pr->u.p.count) !=
1829             0) { /* check if thread has any iteration to do */
1830           pr->u.p.count = 1;
1831           *p_lb = pr->u.p.lb;
1832           *p_ub = pr->u.p.ub;
1833           last = pr->u.p.parm1;
1834           if (p_st != NULL)
1835             *p_st = pr->u.p.st;
1836         } else { /* no iterations to do */
1837           pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1838         }
1839         if (pr->ordered) {
1840 #ifdef KMP_DEBUG
1841           {
1842             const char *buff;
1843             // create format specifiers before the debug output
1844             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1845                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
1846                                     traits_t<UT>::spec, traits_t<UT>::spec);
1847             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1848                             pr->u.p.ordered_upper));
1849             __kmp_str_free(&buff);
1850           }
1851 #endif
1852         } // if
1853       } // case
1854       break;
1855       case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1856                                      merged here */
1857       case kmp_sch_static_chunked: {
1858         T parm1;
1859 
1860         KD_TRACE(100, ("__kmp_dispatch_next: T#%d "
1861                        "kmp_sch_static_[affinity|chunked] case\n",
1862                        gtid));
1863         parm1 = pr->u.p.parm1;
1864 
1865         trip = pr->u.p.tc - 1;
1866         init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1867 
1868         if ((status = (init <= trip)) != 0) {
1869           start = pr->u.p.lb;
1870           incr = pr->u.p.st;
1871           limit = parm1 + init - 1;
1872 
1873           if ((last = (limit >= trip)) != 0)
1874             limit = trip;
1875 
1876           if (p_st != NULL)
1877             *p_st = incr;
1878 
1879           pr->u.p.count += th->th.th_team_nproc;
1880 
1881           if (incr == 1) {
1882             *p_lb = start + init;
1883             *p_ub = start + limit;
1884           } else {
1885             *p_lb = start + init * incr;
1886             *p_ub = start + limit * incr;
1887           }
1888 
1889           if (pr->ordered) {
1890             pr->u.p.ordered_lower = init;
1891             pr->u.p.ordered_upper = limit;
1892 #ifdef KMP_DEBUG
1893             {
1894               const char *buff;
1895               // create format specifiers before the debug output
1896               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1897                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
1898                                       traits_t<UT>::spec, traits_t<UT>::spec);
1899               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1900                               pr->u.p.ordered_upper));
1901               __kmp_str_free(&buff);
1902             }
1903 #endif
1904           } // if
1905         } // if
1906       } // case
1907       break;
1908 
1909       case kmp_sch_dynamic_chunked: {
1910         T chunk = pr->u.p.parm1;
1911 
1912         KD_TRACE(
1913             100,
1914             ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid));
1915 
1916         init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1917         trip = pr->u.p.tc - 1;
1918 
1919         if ((status = (init <= trip)) == 0) {
1920           *p_lb = 0;
1921           *p_ub = 0;
1922           if (p_st != NULL)
1923             *p_st = 0;
1924         } else {
1925           start = pr->u.p.lb;
1926           limit = chunk + init - 1;
1927           incr = pr->u.p.st;
1928 
1929           if ((last = (limit >= trip)) != 0)
1930             limit = trip;
1931 
1932           if (p_st != NULL)
1933             *p_st = incr;
1934 
1935           if (incr == 1) {
1936             *p_lb = start + init;
1937             *p_ub = start + limit;
1938           } else {
1939             *p_lb = start + init * incr;
1940             *p_ub = start + limit * incr;
1941           }
1942 
1943           if (pr->ordered) {
1944             pr->u.p.ordered_lower = init;
1945             pr->u.p.ordered_upper = limit;
1946 #ifdef KMP_DEBUG
1947             {
1948               const char *buff;
1949               // create format specifiers before the debug output
1950               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1951                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
1952                                       traits_t<UT>::spec, traits_t<UT>::spec);
1953               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1954                               pr->u.p.ordered_upper));
1955               __kmp_str_free(&buff);
1956             }
1957 #endif
1958           } // if
1959         } // if
1960       } // case
1961       break;
1962 
1963       case kmp_sch_guided_iterative_chunked: {
1964         T chunkspec = pr->u.p.parm1;
1965         KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked "
1966                        "iterative case\n",
1967                        gtid));
1968         trip = pr->u.p.tc;
1969         // Start atomic part of calculations
1970         while (1) {
1971           ST remaining; // signed, because can be < 0
1972           init = sh->u.s.iteration; // shared value
1973           remaining = trip - init;
1974           if (remaining <= 0) { // AC: need to compare with 0 first
1975             // nothing to do, don't try atomic op
1976             status = 0;
1977             break;
1978           }
1979           if ((T)remaining <
1980               pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
            // use dynamic-style schedule
            // atomically increment iterations, get old value
1983             init = test_then_add<ST>(
1984                 RCAST(ST *, CCAST(UT *, &sh->u.s.iteration)), (ST)chunkspec);
1985             remaining = trip - init;
1986             if (remaining <= 0) {
1987               status = 0; // all iterations got by other threads
1988             } else { // got some iterations to work on
1989               status = 1;
1990               if ((T)remaining > chunkspec) {
1991                 limit = init + chunkspec - 1;
1992               } else {
1993                 last = 1; // the last chunk
1994                 limit = init + remaining - 1;
1995               } // if
1996             } // if
1997             break;
1998           } // if
1999           limit = init + (UT)(remaining *
2000                               *(double *)&pr->u.p.parm3); // divide by K*nproc
2001           if (compare_and_swap<ST>(RCAST(ST *, CCAST(UT *, &sh->u.s.iteration)),
2002                                    (ST)init, (ST)limit)) {
2003             // CAS was successful, chunk obtained
2004             status = 1;
2005             --limit;
2006             break;
2007           } // if
2008         } // while
2009         if (status != 0) {
2010           start = pr->u.p.lb;
2011           incr = pr->u.p.st;
2012           if (p_st != NULL)
2013             *p_st = incr;
2014           *p_lb = start + init * incr;
2015           *p_ub = start + limit * incr;
2016           if (pr->ordered) {
2017             pr->u.p.ordered_lower = init;
2018             pr->u.p.ordered_upper = limit;
2019 #ifdef KMP_DEBUG
2020             {
2021               const char *buff;
2022               // create format specifiers before the debug output
2023               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2024                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
2025                                       traits_t<UT>::spec, traits_t<UT>::spec);
2026               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2027                               pr->u.p.ordered_upper));
2028               __kmp_str_free(&buff);
2029             }
2030 #endif
2031           } // if
2032         } else {
2033           *p_lb = 0;
2034           *p_ub = 0;
2035           if (p_st != NULL)
2036             *p_st = 0;
2037         } // if
2038       } // case
2039       break;
2040 
2041       case kmp_sch_guided_simd: {
2042         // same as iterative but curr-chunk adjusted to be multiple of given
2043         // chunk
2044         T chunk = pr->u.p.parm1;
2045         KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_simd case\n",
2046                        gtid));
2047         trip = pr->u.p.tc;
2048         // Start atomic part of calculations
2049         while (1) {
2050           ST remaining; // signed, because can be < 0
2051           init = sh->u.s.iteration; // shared value
2052           remaining = trip - init;
2053           if (remaining <= 0) { // AC: need to compare with 0 first
2054             status = 0; // nothing to do, don't try atomic op
2055             break;
2056           }
2057           KMP_DEBUG_ASSERT(init % chunk == 0);
2058           // compare with K*nproc*(chunk+1), K=2 by default
2059           if ((T)remaining < pr->u.p.parm2) {
            // use dynamic-style schedule
            // atomically increment iterations, get old value
2062             init = test_then_add<ST>(
2063                 RCAST(ST *, CCAST(UT *, &sh->u.s.iteration)), (ST)chunk);
2064             remaining = trip - init;
2065             if (remaining <= 0) {
2066               status = 0; // all iterations got by other threads
2067             } else {
2068               // got some iterations to work on
2069               status = 1;
2070               if ((T)remaining > chunk) {
2071                 limit = init + chunk - 1;
2072               } else {
2073                 last = 1; // the last chunk
2074                 limit = init + remaining - 1;
2075               } // if
2076             } // if
2077             break;
2078           } // if
2079           // divide by K*nproc
2080           UT span = remaining * (*(double *)&pr->u.p.parm3);
2081           UT rem = span % chunk;
2082           if (rem) // adjust so that span%chunk == 0
2083             span += chunk - rem;
2084           limit = init + span;
2085           if (compare_and_swap<ST>(RCAST(ST *, CCAST(UT *, &sh->u.s.iteration)),
2086                                    (ST)init, (ST)limit)) {
2087             // CAS was successful, chunk obtained
2088             status = 1;
2089             --limit;
2090             break;
2091           } // if
2092         } // while
2093         if (status != 0) {
2094           start = pr->u.p.lb;
2095           incr = pr->u.p.st;
2096           if (p_st != NULL)
2097             *p_st = incr;
2098           *p_lb = start + init * incr;
2099           *p_ub = start + limit * incr;
2100           if (pr->ordered) {
2101             pr->u.p.ordered_lower = init;
2102             pr->u.p.ordered_upper = limit;
2103 #ifdef KMP_DEBUG
2104             {
2105               const char *buff;
2106               // create format specifiers before the debug output
2107               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2108                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
2109                                       traits_t<UT>::spec, traits_t<UT>::spec);
2110               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2111                               pr->u.p.ordered_upper));
2112               __kmp_str_free(&buff);
2113             }
2114 #endif
2115           } // if
2116         } else {
2117           *p_lb = 0;
2118           *p_ub = 0;
2119           if (p_st != NULL)
2120             *p_st = 0;
2121         } // if
2122       } // case
2123       break;
2124 
2125       case kmp_sch_guided_analytical_chunked: {
2126         T chunkspec = pr->u.p.parm1;
2127         UT chunkIdx;
2128 #if KMP_OS_WINDOWS && KMP_ARCH_X86
2129         /* for storing original FPCW value for Windows* OS on
2130            IA-32 architecture 8-byte version */
2131         unsigned int oldFpcw;
2132         unsigned int fpcwSet = 0;
2133 #endif
2134         KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked "
2135                        "analytical case\n",
2136                        gtid));
2137 
2138         trip = pr->u.p.tc;
2139 
2140         KMP_DEBUG_ASSERT(th->th.th_team_nproc > 1);
2141         KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)th->th.th_team_nproc <
2142                          trip);
2143 
2144         while (1) { /* this while loop is a safeguard against unexpected zero
2145                        chunk sizes */
2146           chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
2147           if (chunkIdx >= (UT)pr->u.p.parm2) {
2148             --trip;
2149             /* use dynamic-style scheduling */
2150             init = chunkIdx * chunkspec + pr->u.p.count;
2151             /* need to verify init > 0 in case of overflow in the above
2152              * calculation */
2153             if ((status = (init > 0 && init <= trip)) != 0) {
2154               limit = init + chunkspec - 1;
2155 
2156               if ((last = (limit >= trip)) != 0)
2157                 limit = trip;
2158             }
2159             break;
2160           } else {
2161 /* use exponential-style scheduling */
2162 /* The following check is to workaround the lack of long double precision on
2163    Windows* OS.
2164    This check works around the possible effect that init != 0 for chunkIdx == 0.
2165  */
2166 #if KMP_OS_WINDOWS && KMP_ARCH_X86
2167             /* If we haven't already done so, save original FPCW and set
2168                precision to 64-bit, as Windows* OS on IA-32 architecture
2169                defaults to 53-bit */
2170             if (!fpcwSet) {
2171               oldFpcw = _control87(0, 0);
2172               _control87(_PC_64, _MCW_PC);
2173               fpcwSet = 0x30000;
2174             }
2175 #endif
2176             if (chunkIdx) {
2177               init = __kmp_dispatch_guided_remaining<T>(
2178                   trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
2179               KMP_DEBUG_ASSERT(init);
2180               init = trip - init;
2181             } else
2182               init = 0;
2183             limit = trip - __kmp_dispatch_guided_remaining<T>(
2184                                trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
2185             KMP_ASSERT(init <= limit);
2186             if (init < limit) {
2187               KMP_DEBUG_ASSERT(limit <= trip);
2188               --limit;
2189               status = 1;
2190               break;
2191             } // if
2192           } // if
2193         } // while (1)
2194 #if KMP_OS_WINDOWS && KMP_ARCH_X86
2195         /* restore FPCW if necessary
2196            AC: check fpcwSet flag first because oldFpcw can be uninitialized
2197            here */
2198         if (fpcwSet && (oldFpcw & fpcwSet))
2199           _control87(oldFpcw, _MCW_PC);
2200 #endif
2201         if (status != 0) {
2202           start = pr->u.p.lb;
2203           incr = pr->u.p.st;
2204           if (p_st != NULL)
2205             *p_st = incr;
2206           *p_lb = start + init * incr;
2207           *p_ub = start + limit * incr;
2208           if (pr->ordered) {
2209             pr->u.p.ordered_lower = init;
2210             pr->u.p.ordered_upper = limit;
2211 #ifdef KMP_DEBUG
2212             {
2213               const char *buff;
2214               // create format specifiers before the debug output
2215               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2216                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
2217                                       traits_t<UT>::spec, traits_t<UT>::spec);
2218               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2219                               pr->u.p.ordered_upper));
2220               __kmp_str_free(&buff);
2221             }
2222 #endif
2223           }
2224         } else {
2225           *p_lb = 0;
2226           *p_ub = 0;
2227           if (p_st != NULL)
2228             *p_st = 0;
2229         }
2230       } // case
2231       break;
2232 
2233       case kmp_sch_trapezoidal: {
2234         UT index;
2235         T parm2 = pr->u.p.parm2;
2236         T parm3 = pr->u.p.parm3;
2237         T parm4 = pr->u.p.parm4;
2238         KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2239                        gtid));
2240 
2241         index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
2242 
2243         init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
2244         trip = pr->u.p.tc - 1;
2245 
2246         if ((status = ((T)index < parm3 && init <= trip)) == 0) {
2247           *p_lb = 0;
2248           *p_ub = 0;
2249           if (p_st != NULL)
2250             *p_st = 0;
2251         } else {
2252           start = pr->u.p.lb;
2253           limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
2254           incr = pr->u.p.st;
2255 
2256           if ((last = (limit >= trip)) != 0)
2257             limit = trip;
2258 
2259           if (p_st != NULL)
2260             *p_st = incr;
2261 
2262           if (incr == 1) {
2263             *p_lb = start + init;
2264             *p_ub = start + limit;
2265           } else {
2266             *p_lb = start + init * incr;
2267             *p_ub = start + limit * incr;
2268           }
2269 
2270           if (pr->ordered) {
2271             pr->u.p.ordered_lower = init;
2272             pr->u.p.ordered_upper = limit;
2273 #ifdef KMP_DEBUG
2274             {
2275               const char *buff;
2276               // create format specifiers before the debug output
2277               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2278                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
2279                                       traits_t<UT>::spec, traits_t<UT>::spec);
2280               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2281                               pr->u.p.ordered_upper));
2282               __kmp_str_free(&buff);
2283             }
2284 #endif
2285           } // if
2286         } // if
2287       } // case
2288       break;
2289       default: {
2290         status = 0; // to avoid complaints on uninitialized variable use
2291         __kmp_msg(kmp_ms_fatal, // Severity
2292                   KMP_MSG(UnknownSchedTypeDetected), // Primary message
2293                   KMP_HNT(GetNewerLibrary), // Hint
2294                   __kmp_msg_null // Variadic argument list terminator
2295                   );
2296       } break;
2297       } // switch
2298     } // if tc == 0;
2299 
2300     if (status == 0) {
2301       UT num_done;
2302 
2303       num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2304 #ifdef KMP_DEBUG
2305       {
2306         const char *buff;
2307         // create format specifiers before the debug output
2308         buff = __kmp_str_format(
2309             "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2310             traits_t<UT>::spec);
2311         KD_TRACE(100, (buff, gtid, sh->u.s.num_done));
2312         __kmp_str_free(&buff);
2313       }
2314 #endif
2315 
2316       if ((ST)num_done == th->th.th_team_nproc - 1) {
2317 #if (KMP_STATIC_STEAL_ENABLED)
2318         if (pr->schedule == kmp_sch_static_steal &&
2319             traits_t<T>::type_size > 4) {
2320           int i;
2321           kmp_info_t **other_threads = team->t.t_threads;
2322           // loop complete, safe to destroy locks used for stealing
2323           for (i = 0; i < th->th.th_team_nproc; ++i) {
2324             kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2325             KMP_ASSERT(lck != NULL);
2326             __kmp_destroy_lock(lck);
2327             __kmp_free(lck);
2328             other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2329           }
2330         }
2331 #endif
2332         /* NOTE: release this buffer to be reused */
2333 
2334         KMP_MB(); /* Flush all pending memory write invalidates.  */
2335 
2336         sh->u.s.num_done = 0;
2337         sh->u.s.iteration = 0;
2338 
2339         /* TODO replace with general release procedure? */
2340         if (pr->ordered) {
2341           sh->u.s.ordered_iteration = 0;
2342         }
2343 
2344         KMP_MB(); /* Flush all pending memory write invalidates.  */
2345 
2346         sh->buffer_index += __kmp_dispatch_num_buffers;
2347         KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2348                        gtid, sh->buffer_index));
2349 
2350         KMP_MB(); /* Flush all pending memory write invalidates.  */
2351 
2352       } // if
2353       if (__kmp_env_consistency_check) {
2354         if (pr->pushed_ws != ct_none) {
2355           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2356         }
2357       }
2358 
2359       th->th.th_dispatch->th_deo_fcn = NULL;
2360       th->th.th_dispatch->th_dxo_fcn = NULL;
2361       th->th.th_dispatch->th_dispatch_sh_current = NULL;
2362       th->th.th_dispatch->th_dispatch_pr_current = NULL;
2363     } // if (status == 0)
2364 #if KMP_OS_WINDOWS
2365     else if (last) {
2366       pr->u.p.last_upper = pr->u.p.ub;
2367     }
2368 #endif /* KMP_OS_WINDOWS */
2369     if (p_last != NULL && status != 0)
2370       *p_last = last;
2371   } // if
2372 
2373 #ifdef KMP_DEBUG
2374   {
2375     const char *buff;
2376     // create format specifiers before the debug output
2377     buff = __kmp_str_format(
2378         "__kmp_dispatch_next: T#%%d normal case: "
2379         "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p  returning:%%d\n",
2380         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2381     KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status));
2382     __kmp_str_free(&buff);
2383   }
2384 #endif
2385 #if INCLUDE_SSC_MARKS
2386   SSC_MARK_DISPATCH_NEXT();
2387 #endif
2388   OMPT_LOOP_END;
2389   return status;
2390 }
2391 
2392 template <typename T>
2393 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2394                                   kmp_int32 *plastiter, T *plower, T *pupper,
2395                                   typename traits_t<T>::signed_t incr) {
2396   typedef typename traits_t<T>::unsigned_t UT;
2397   typedef typename traits_t<T>::signed_t ST;
2398   kmp_uint32 team_id;
2399   kmp_uint32 nteams;
2400   UT trip_count;
2401   kmp_team_t *team;
2402   kmp_info_t *th;
2403 
2404   KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2405   KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2406 #ifdef KMP_DEBUG
2407   {
2408     const char *buff;
2409     // create format specifiers before the debug output
2410     buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2411                             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2412                             traits_t<T>::spec, traits_t<T>::spec,
2413                             traits_t<ST>::spec, traits_t<T>::spec);
2414     KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2415     __kmp_str_free(&buff);
2416   }
2417 #endif
2418 
2419   if (__kmp_env_consistency_check) {
2420     if (incr == 0) {
2421       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2422                             loc);
2423     }
2424     if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2425       // The loop is illegal.
2426       // Some zero-trip loops maintained by compiler, e.g.:
2427       //   for(i=10;i<0;++i) // lower >= upper - run-time check
2428       //   for(i=0;i>10;--i) // lower <= upper - run-time check
2429       //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2430       //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2431       // Compiler does not check the following illegal loops:
2432       //   for(i=0;i<10;i+=incr) // where incr<0
2433       //   for(i=10;i>0;i-=incr) // where incr<0
2434       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2435     }
2436   }
2437   th = __kmp_threads[gtid];
2438   team = th->th.th_team;
2439 #if OMP_40_ENABLED
2440   KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2441   nteams = th->th.th_teams_size.nteams;
2442 #endif
2443   team_id = team->t.t_master_tid;
2444   KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2445 
2446   // compute global trip count
2447   if (incr == 1) {
2448     trip_count = *pupper - *plower + 1;
2449   } else if (incr == -1) {
2450     trip_count = *plower - *pupper + 1;
2451   } else if (incr > 0) {
2452     // upper-lower can exceed the limit of signed type
2453     trip_count = (UT)(*pupper - *plower) / incr + 1;
2454   } else {
2455     trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2456   }
2457 
2458   if (trip_count <= nteams) {
2459     KMP_DEBUG_ASSERT(
2460         __kmp_static == kmp_sch_static_greedy ||
2461         __kmp_static ==
2462             kmp_sch_static_balanced); // Unknown static scheduling type.
2463     // only some teams get single iteration, others get nothing
2464     if (team_id < trip_count) {
2465       *pupper = *plower = *plower + team_id * incr;
2466     } else {
2467       *plower = *pupper + incr; // zero-trip loop
2468     }
2469     if (plastiter != NULL)
2470       *plastiter = (team_id == trip_count - 1);
2471   } else {
2472     if (__kmp_static == kmp_sch_static_balanced) {
2473       UT chunk = trip_count / nteams;
2474       UT extras = trip_count % nteams;
2475       *plower +=
2476           incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2477       *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2478       if (plastiter != NULL)
2479         *plastiter = (team_id == nteams - 1);
2480     } else {
2481       T chunk_inc_count =
2482           (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2483       T upper = *pupper;
2484       KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2485       // Unknown static scheduling type.
2486       *plower += team_id * chunk_inc_count;
2487       *pupper = *plower + chunk_inc_count - incr;
2488       // Check/correct bounds if needed
2489       if (incr > 0) {
2490         if (*pupper < *plower)
2491           *pupper = traits_t<T>::max_value;
2492         if (plastiter != NULL)
2493           *plastiter = *plower <= upper && *pupper > upper - incr;
2494         if (*pupper > upper)
2495           *pupper = upper; // tracker C73258
2496       } else {
2497         if (*pupper > *plower)
2498           *pupper = traits_t<T>::min_value;
2499         if (plastiter != NULL)
2500           *plastiter = *plower >= upper && *pupper < upper - incr;
2501         if (*pupper < upper)
2502           *pupper = upper; // tracker C73258
2503       }
2504     }
2505   }
2506 }
2507 
2508 //-----------------------------------------------------------------------------
2509 // Dispatch routines
2510 //    Transfer call to template< type T >
2511 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2512 //                         T lb, T ub, ST st, ST chunk )
2513 extern "C" {
2514 
2515 /*!
2516 @ingroup WORK_SHARING
2517 @{
2518 @param loc Source location
2519 @param gtid Global thread id
2520 @param schedule Schedule type
2521 @param lb  Lower bound
2522 @param ub  Upper bound
2523 @param st  Step (or increment if you prefer)
2524 @param chunk The chunk size to block with
2525 
2526 This function prepares the runtime to start a dynamically scheduled for loop,
2527 saving the loop arguments.
2528 These functions are all identical apart from the types of the arguments.
2529 */
2530 
2531 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2532                             enum sched_type schedule, kmp_int32 lb,
2533                             kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2534   KMP_DEBUG_ASSERT(__kmp_init_serial);
2535   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2536 }
2537 /*!
2538 See @ref __kmpc_dispatch_init_4
2539 */
2540 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2541                              enum sched_type schedule, kmp_uint32 lb,
2542                              kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2543   KMP_DEBUG_ASSERT(__kmp_init_serial);
2544   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2545 }
2546 
2547 /*!
2548 See @ref __kmpc_dispatch_init_4
2549 */
2550 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2551                             enum sched_type schedule, kmp_int64 lb,
2552                             kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2553   KMP_DEBUG_ASSERT(__kmp_init_serial);
2554   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2555 }
2556 
2557 /*!
2558 See @ref __kmpc_dispatch_init_4
2559 */
2560 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2561                              enum sched_type schedule, kmp_uint64 lb,
2562                              kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2563   KMP_DEBUG_ASSERT(__kmp_init_serial);
2564   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2565 }
2566 
2567 /*!
2568 See @ref __kmpc_dispatch_init_4
2569 
2570 Difference from __kmpc_dispatch_init set of functions is these functions
2571 are called for composite distribute parallel for construct. Thus before
2572 regular iterations dispatching we need to calc per-team iteration space.
2573 
2574 These functions are all identical apart from the types of the arguments.
2575 */
2576 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2577                                  enum sched_type schedule, kmp_int32 *p_last,
2578                                  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2579                                  kmp_int32 chunk) {
2580   KMP_DEBUG_ASSERT(__kmp_init_serial);
2581   __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2582   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2583 }
2584 
2585 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2586                                   enum sched_type schedule, kmp_int32 *p_last,
2587                                   kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2588                                   kmp_int32 chunk) {
2589   KMP_DEBUG_ASSERT(__kmp_init_serial);
2590   __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2591   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2592 }
2593 
2594 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2595                                  enum sched_type schedule, kmp_int32 *p_last,
2596                                  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2597                                  kmp_int64 chunk) {
2598   KMP_DEBUG_ASSERT(__kmp_init_serial);
2599   __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2600   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2601 }
2602 
2603 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2604                                   enum sched_type schedule, kmp_int32 *p_last,
2605                                   kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2606                                   kmp_int64 chunk) {
2607   KMP_DEBUG_ASSERT(__kmp_init_serial);
2608   __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2609   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2610 }
2611 
2612 /*!
2613 @param loc Source code location
2614 @param gtid Global thread id
2615 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2616 otherwise
2617 @param p_lb   Pointer to the lower bound for the next chunk of work
2618 @param p_ub   Pointer to the upper bound for the next chunk of work
2619 @param p_st   Pointer to the stride for the next chunk of work
2620 @return one if there is work to be done, zero otherwise
2621 
2622 Get the next dynamically allocated chunk of work for this thread.
2623 If there is no more work, then the lb,ub and stride need not be modified.
2624 */
2625 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2626                            kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2627   return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st);
2628 }
2629 
2630 /*!
2631 See @ref __kmpc_dispatch_next_4
2632 */
2633 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2634                             kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2635                             kmp_int32 *p_st) {
2636   return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st);
2637 }
2638 
2639 /*!
2640 See @ref __kmpc_dispatch_next_4
2641 */
2642 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2643                            kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2644   return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st);
2645 }
2646 
2647 /*!
2648 See @ref __kmpc_dispatch_next_4
2649 */
2650 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2651                             kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2652                             kmp_int64 *p_st) {
2653   return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st);
2654 }
2655 
2656 /*!
2657 @param loc Source code location
2658 @param gtid Global thread id
2659 
2660 Mark the end of a dynamic loop.
2661 */
2662 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2663   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2664 }
2665 
2666 /*!
2667 See @ref __kmpc_dispatch_fini_4
2668 */
2669 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2670   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2671 }
2672 
2673 /*!
2674 See @ref __kmpc_dispatch_fini_4
2675 */
2676 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2677   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2678 }
2679 
2680 /*!
2681 See @ref __kmpc_dispatch_fini_4
2682 */
2683 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2684   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2685 }
2686 /*! @} */
2687 
2688 //-----------------------------------------------------------------------------
2689 // Non-template routines from kmp_dispatch.cpp used in other sources
2690 
2691 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2692   return value == checker;
2693 }
2694 
2695 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2696   return value != checker;
2697 }
2698 
2699 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2700   return value < checker;
2701 }
2702 
2703 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2704   return value >= checker;
2705 }
2706 
2707 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2708   return value <= checker;
2709 }
2710 
2711 kmp_uint32
2712 __kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2713                    kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2714                    void *obj // Higher-level synchronization object, or NULL.
2715                    ) {
2716   // note: we may not belong to a team at this point
2717   volatile kmp_uint32 *spin = spinner;
2718   kmp_uint32 check = checker;
2719   kmp_uint32 spins;
2720   kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2721   kmp_uint32 r;
2722 
2723   KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2724   KMP_INIT_YIELD(spins);
2725   // main wait spin loop
2726   while (!f(r = TCR_4(*spin), check)) {
2727     KMP_FSYNC_SPIN_PREPARE(obj);
2728     /* GEH - remove this since it was accidentally introduced when kmp_wait was
2729        split. It causes problems with infinite recursion because of exit lock */
2730     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2731         __kmp_abort_thread(); */
2732 
2733     /* if we have waited a bit, or are oversubscribed, yield */
2734     /* pause is in the following code */
2735     KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2736     KMP_YIELD_SPIN(spins);
2737   }
2738   KMP_FSYNC_SPIN_ACQUIRED(obj);
2739   return r;
2740 }
2741 
2742 void __kmp_wait_yield_4_ptr(
2743     void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32),
2744     void *obj // Higher-level synchronization object, or NULL.
2745     ) {
2746   // note: we may not belong to a team at this point
2747   void *spin = spinner;
2748   kmp_uint32 check = checker;
2749   kmp_uint32 spins;
2750   kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2751 
2752   KMP_FSYNC_SPIN_INIT(obj, spin);
2753   KMP_INIT_YIELD(spins);
2754   // main wait spin loop
2755   while (!f(spin, check)) {
2756     KMP_FSYNC_SPIN_PREPARE(obj);
2757     /* if we have waited a bit, or are oversubscribed, yield */
2758     /* pause is in the following code */
2759     KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2760     KMP_YIELD_SPIN(spins);
2761   }
2762   KMP_FSYNC_SPIN_ACQUIRED(obj);
2763 }
2764 
2765 } // extern "C"
2766 
2767 #ifdef KMP_GOMP_COMPAT
2768 
2769 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2770                                enum sched_type schedule, kmp_int32 lb,
2771                                kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2772                                int push_ws) {
2773   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2774                                  push_ws);
2775 }
2776 
2777 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2778                                 enum sched_type schedule, kmp_uint32 lb,
2779                                 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2780                                 int push_ws) {
2781   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2782                                   push_ws);
2783 }
2784 
2785 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2786                                enum sched_type schedule, kmp_int64 lb,
2787                                kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2788                                int push_ws) {
2789   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2790                                  push_ws);
2791 }
2792 
2793 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2794                                 enum sched_type schedule, kmp_uint64 lb,
2795                                 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2796                                 int push_ws) {
2797   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2798                                   push_ws);
2799 }
2800 
2801 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2802   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2803 }
2804 
2805 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2806   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2807 }
2808 
2809 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2810   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2811 }
2812 
2813 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2814   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2815 }
2816 
2817 #endif /* KMP_GOMP_COMPAT */
2818 
2819 /* ------------------------------------------------------------------------ */
2820