1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 /* Dynamic scheduling initialization and dispatch.
17  *
 * NOTE: __kmp_nth is constant inside any dispatch loop, but it may change
 *       between parallel regions. __kmp_max_nth is the largest value
 *       __kmp_nth may take; 1 is the smallest.
21  */
22 
23 // Need to raise Win version from XP to Vista here for support of
24 // InterlockedExchange64
25 #if defined(_WIN32_WINNT) && defined(_M_IX86)
26 #undef _WIN32_WINNT
27 #define _WIN32_WINNT 0x0502
28 #endif
29 
30 #include "kmp.h"
31 #include "kmp_error.h"
32 #include "kmp_i18n.h"
33 #include "kmp_itt.h"
34 #include "kmp_stats.h"
35 #include "kmp_str.h"
36 #if KMP_OS_WINDOWS && KMP_ARCH_X86
37 #include <float.h>
38 #endif
39 
40 #if OMPT_SUPPORT
41 #include "ompt-internal.h"
42 #include "ompt-specific.h"
43 #endif
44 
45 /* ------------------------------------------------------------------------ */
46 
47 #if KMP_STATIC_STEAL_ENABLED
48 
49 // replaces dispatch_private_info{32,64} structures and
50 // dispatch_private_info{32,64}_t types
51 template <typename T> struct dispatch_private_infoXX_template {
52   typedef typename traits_t<T>::unsigned_t UT;
53   typedef typename traits_t<T>::signed_t ST;
54   UT count; // unsigned
55   T ub;
56   /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
57   T lb;
58   ST st; // signed
59   UT tc; // unsigned
60   T static_steal_counter; // for static_steal only; maybe better to put after ub
61 
62   /* parm[1-4] are used in different ways by different scheduling algorithms */
63 
64   // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
65   //    a) parm3 is properly aligned and
66   //    b) all parm1-4 are in the same cache line.
  // Because parm1-4 are used together, performance seems to be better
  // if they are in the same cache line (not measured though).
69 
70   struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
71     T parm1;
72     T parm2;
73     T parm3;
74     T parm4;
75   };
76 
77   UT ordered_lower; // unsigned
78   UT ordered_upper; // unsigned
79 #if KMP_OS_WINDOWS
80   T last_upper;
81 #endif /* KMP_OS_WINDOWS */
82 };
83 
84 #else /* KMP_STATIC_STEAL_ENABLED */
85 
86 // replaces dispatch_private_info{32,64} structures and
87 // dispatch_private_info{32,64}_t types
88 template <typename T> struct dispatch_private_infoXX_template {
89   typedef typename traits_t<T>::unsigned_t UT;
90   typedef typename traits_t<T>::signed_t ST;
91   T lb;
92   T ub;
93   ST st; // signed
94   UT tc; // unsigned
95 
96   T parm1;
97   T parm2;
98   T parm3;
99   T parm4;
100 
101   UT count; // unsigned
102 
103   UT ordered_lower; // unsigned
104   UT ordered_upper; // unsigned
105 #if KMP_OS_WINDOWS
106   T last_upper;
107 #endif /* KMP_OS_WINDOWS */
108 };
109 
110 #endif /* KMP_STATIC_STEAL_ENABLED */
111 
112 // replaces dispatch_private_info structure and dispatch_private_info_t type
113 template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
  // duplicate alignment here, otherwise the size of the structure is not
  // correct in our compiler
116   union KMP_ALIGN_CACHE private_info_tmpl {
117     dispatch_private_infoXX_template<T> p;
118     dispatch_private_info64_t p64;
119   } u;
120   enum sched_type schedule; /* scheduling algorithm */
121   kmp_uint32 ordered; /* ordered clause specified */
122   kmp_uint32 ordered_bumped;
123   // To retain the structure size after making ordered_iteration scalar
124   kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
125   dispatch_private_info *next; /* stack of buffers for nest of serial regions */
126   kmp_uint32 nomerge; /* don't merge iters if serialized */
127   kmp_uint32 type_size;
128   enum cons_type pushed_ws;
129 };
130 
131 // replaces dispatch_shared_info{32,64} structures and
132 // dispatch_shared_info{32,64}_t types
133 template <typename UT> struct dispatch_shared_infoXX_template {
134   /* chunk index under dynamic, number of idle threads under static-steal;
135      iteration index otherwise */
136   volatile UT iteration;
137   volatile UT num_done;
138   volatile UT ordered_iteration;
  // to retain the structure size after making ordered_iteration scalar
140   UT ordered_dummy[KMP_MAX_ORDERED - 3];
141 };
142 
143 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
144 template <typename UT> struct dispatch_shared_info_template {
145   // we need union here to keep the structure size
146   union shared_info_tmpl {
147     dispatch_shared_infoXX_template<UT> s;
148     dispatch_shared_info64_t s64;
149   } u;
150   volatile kmp_uint32 buffer_index;
151 #if OMP_45_ENABLED
152   volatile kmp_int32 doacross_buf_idx; // teamwise index
153   kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
154   kmp_int32 doacross_num_done; // count finished threads
155 #endif
156 #if KMP_USE_HWLOC
  // When linking with libhwloc, the ORDERED EPCC test slows down on big
158   // machines (> 48 cores). Performance analysis showed that a cache thrash
159   // was occurring and this padding helps alleviate the problem.
160   char padding[64];
161 #endif
162 };
163 
164 /* ------------------------------------------------------------------------ */
165 
166 #undef USE_TEST_LOCKS
167 
168 // test_then_add template (general template should NOT be used)
169 template <typename T> static __forceinline T test_then_add(volatile T *p, T d);
170 
171 template <>
172 __forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
173                                                  kmp_int32 d) {
174   kmp_int32 r;
175   r = KMP_TEST_THEN_ADD32(p, d);
176   return r;
177 }
178 
179 template <>
180 __forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
181                                                  kmp_int64 d) {
182   kmp_int64 r;
183   r = KMP_TEST_THEN_ADD64(p, d);
184   return r;
185 }
186 
187 // test_then_inc_acq template (general template should NOT be used)
188 template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);
189 
190 template <>
191 __forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
192   kmp_int32 r;
193   r = KMP_TEST_THEN_INC_ACQ32(p);
194   return r;
195 }
196 
197 template <>
198 __forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
199   kmp_int64 r;
200   r = KMP_TEST_THEN_INC_ACQ64(p);
201   return r;
202 }
203 
204 // test_then_inc template (general template should NOT be used)
205 template <typename T> static __forceinline T test_then_inc(volatile T *p);
206 
207 template <>
208 __forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
209   kmp_int32 r;
210   r = KMP_TEST_THEN_INC32(p);
211   return r;
212 }
213 
214 template <>
215 __forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
216   kmp_int64 r;
217   r = KMP_TEST_THEN_INC64(p);
218   return r;
219 }
220 
221 // compare_and_swap template (general template should NOT be used)
222 template <typename T>
223 static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);
224 
225 template <>
226 __forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
227                                                     kmp_int32 c, kmp_int32 s) {
228   return KMP_COMPARE_AND_STORE_REL32(p, c, s);
229 }
230 
231 template <>
232 __forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
233                                                     kmp_int64 c, kmp_int64 s) {
234   return KMP_COMPARE_AND_STORE_REL64(p, c, s);
235 }
236 
/* Spin wait loop that first does pause, then yield.
    Waits until the predicate returns non-zero when called with *spinner and
    check. Does NOT put threads to sleep.
#if USE_ITT_BUILD
    Arguments:
        obj -- the higher-level synchronization object to report to ittnotify.
        It is used to report locks consistently. For example, if the lock is
        acquired immediately, its address is reported to ittnotify via
        KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
        immediately and the lock routine calls KMP_WAIT_YIELD(), the latter
        should report the same address, not the address of the low-level
        spinner.
#endif // USE_ITT_BUILD
*/
250 template <typename UT>
251 // ToDo: make inline function (move to header file for icl)
252 static UT // unsigned 4- or 8-byte type
253     __kmp_wait_yield(
254         volatile UT *spinner, UT checker,
255         kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(
256             void *obj) // Higher-level synchronization object, or NULL.
257         ) {
258   // note: we may not belong to a team at this point
259   volatile UT *spin = spinner;
260   UT check = checker;
261   kmp_uint32 spins;
262   kmp_uint32 (*f)(UT, UT) = pred;
263   UT r;
264 
265   KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
266   KMP_INIT_YIELD(spins);
267   // main wait spin loop
268   while (!f(r = *spin, check)) {
269     KMP_FSYNC_SPIN_PREPARE(obj);
270     /* GEH - remove this since it was accidentally introduced when kmp_wait was
271        split. It causes problems with infinite recursion because of exit lock */
272     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
273         __kmp_abort_thread(); */
274 
    // If we are oversubscribed, or have waited a bit (and
    // KMP_LIBRARY=throughput), then yield. Pause is in the following code.
277     KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
278     KMP_YIELD_SPIN(spins);
279   }
280   KMP_FSYNC_SPIN_ACQUIRED(obj);
281   return r;
282 }
283 
284 template <typename UT> static kmp_uint32 __kmp_eq(UT value, UT checker) {
285   return value == checker;
286 }
287 
288 template <typename UT> static kmp_uint32 __kmp_neq(UT value, UT checker) {
289   return value != checker;
290 }
291 
292 template <typename UT> static kmp_uint32 __kmp_lt(UT value, UT checker) {
293   return value < checker;
294 }
295 
296 template <typename UT> static kmp_uint32 __kmp_ge(UT value, UT checker) {
297   return value >= checker;
298 }
299 
300 template <typename UT> static kmp_uint32 __kmp_le(UT value, UT checker) {
301   return value <= checker;
302 }
303 
304 /* ------------------------------------------------------------------------ */
305 
306 static void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref,
307                                      ident_t *loc_ref) {
308   kmp_info_t *th;
309 
310   KMP_DEBUG_ASSERT(gtid_ref);
311 
312   if (__kmp_env_consistency_check) {
313     th = __kmp_threads[*gtid_ref];
314     if (th->th.th_root->r.r_active &&
315         (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
316 #if KMP_USE_DYNAMIC_LOCK
317       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
318 #else
319       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
320 #endif
321     }
322   }
323 }
324 
325 template <typename UT>
326 static void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
327   typedef typename traits_t<UT>::signed_t ST;
328   dispatch_private_info_template<UT> *pr;
329 
330   int gtid = *gtid_ref;
331   //    int  cid = *cid_ref;
332   kmp_info_t *th = __kmp_threads[gtid];
333   KMP_DEBUG_ASSERT(th->th.th_dispatch);
334 
335   KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
336   if (__kmp_env_consistency_check) {
337     pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
338         th->th.th_dispatch->th_dispatch_pr_current);
339     if (pr->pushed_ws != ct_none) {
340 #if KMP_USE_DYNAMIC_LOCK
341       __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
342 #else
343       __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
344 #endif
345     }
346   }
347 
348   if (!th->th.th_team->t.t_serialized) {
349     dispatch_shared_info_template<UT> *sh =
350         reinterpret_cast<dispatch_shared_info_template<UT> *>(
351             th->th.th_dispatch->th_dispatch_sh_current);
352     UT lower;
353 
354     if (!__kmp_env_consistency_check) {
355       pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
356           th->th.th_dispatch->th_dispatch_pr_current);
357     }
358     lower = pr->u.p.ordered_lower;
359 
360 #if !defined(KMP_GOMP_COMPAT)
361     if (__kmp_env_consistency_check) {
362       if (pr->ordered_bumped) {
363         struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
364         __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
365                                ct_ordered_in_pdo, loc_ref,
366                                &p->stack_data[p->w_top]);
367       }
368     }
369 #endif /* !defined(KMP_GOMP_COMPAT) */
370 
371     KMP_MB();
372 #ifdef KMP_DEBUG
373     {
374       const char *buff;
375       // create format specifiers before the debug output
376       buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
377                               "ordered_iter:%%%s lower:%%%s\n",
378                               traits_t<UT>::spec, traits_t<UT>::spec);
379       KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
380       __kmp_str_free(&buff);
381     }
382 #endif
383 
384     __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
385                          __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
386     KMP_MB(); /* is this necessary? */
387 #ifdef KMP_DEBUG
388     {
389       const char *buff;
390       // create format specifiers before the debug output
391       buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
392                               "ordered_iter:%%%s lower:%%%s\n",
393                               traits_t<UT>::spec, traits_t<UT>::spec);
394       KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
395       __kmp_str_free(&buff);
396     }
397 #endif
398   }
399   KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
400 }
401 
402 static void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref,
403                                      ident_t *loc_ref) {
404   kmp_info_t *th;
405 
406   if (__kmp_env_consistency_check) {
407     th = __kmp_threads[*gtid_ref];
408     if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
409       __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
410     }
411   }
412 }
413 
414 template <typename UT>
415 static void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
416   typedef typename traits_t<UT>::signed_t ST;
417   dispatch_private_info_template<UT> *pr;
418 
419   int gtid = *gtid_ref;
420   //    int  cid = *cid_ref;
421   kmp_info_t *th = __kmp_threads[gtid];
422   KMP_DEBUG_ASSERT(th->th.th_dispatch);
423 
424   KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
425   if (__kmp_env_consistency_check) {
426     pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
427         th->th.th_dispatch->th_dispatch_pr_current);
428     if (pr->pushed_ws != ct_none) {
429       __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
430     }
431   }
432 
433   if (!th->th.th_team->t.t_serialized) {
434     dispatch_shared_info_template<UT> *sh =
435         reinterpret_cast<dispatch_shared_info_template<UT> *>(
436             th->th.th_dispatch->th_dispatch_sh_current);
437 
438     if (!__kmp_env_consistency_check) {
439       pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
440           th->th.th_dispatch->th_dispatch_pr_current);
441     }
442 
443     KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
444 #if !defined(KMP_GOMP_COMPAT)
445     if (__kmp_env_consistency_check) {
446       if (pr->ordered_bumped != 0) {
447         struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
448         /* How to test it? - OM */
449         __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
450                                ct_ordered_in_pdo, loc_ref,
451                                &p->stack_data[p->w_top]);
452       }
453     }
454 #endif /* !defined(KMP_GOMP_COMPAT) */
455 
456     KMP_MB(); /* Flush all pending memory write invalidates.  */
457 
458     pr->ordered_bumped += 1;
459 
460     KD_TRACE(1000,
461              ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
462               gtid, pr->ordered_bumped));
463 
464     KMP_MB(); /* Flush all pending memory write invalidates.  */
465 
466     /* TODO use general release procedure? */
467     test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
468 
469     KMP_MB(); /* Flush all pending memory write invalidates.  */
470   }
471   KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
472 }
473 
// Computes and returns x to the power of y, where y must be a non-negative
// integer.
475 template <typename UT>
476 static __forceinline long double __kmp_pow(long double x, UT y) {
477   long double s = 1.0L;
478 
479   KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
480   // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
481   while (y) {
482     if (y & 1)
483       s *= x;
484     x *= x;
485     y >>= 1;
486   }
487   return s;
488 }
489 
490 /* Computes and returns the number of unassigned iterations after idx chunks
491    have been assigned (the total number of unassigned iterations in chunks with
492    index greater than or equal to idx). __forceinline seems to be broken so that
493    if we __forceinline this function, the behavior is wrong
494    (one of the unit tests, sch_guided_analytical_basic.cpp, fails) */
495 template <typename T>
496 static __inline typename traits_t<T>::unsigned_t
497 __kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
498                                 typename traits_t<T>::unsigned_t idx) {
499   /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at least for
500      ICL 8.1, long double arithmetic may not really have long double precision,
501      even with /Qlong_double.  Currently, we workaround that in the caller code,
502      by manipulating the FPCW for Windows* OS on IA-32 architecture.  The lack
503      of precision is not expected to be a correctness issue, though. */
504   typedef typename traits_t<T>::unsigned_t UT;
505 
506   long double x = tc * __kmp_pow<UT>(base, idx);
507   UT r = (UT)x;
508   if (x == r)
509     return r;
510   return r + 1;
511 }
512 
513 // Parameters of the guided-iterative algorithm:
514 //   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
515 //   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// by default n = 2. For example, with n = 3 the chunk distribution will be
// flatter. With n = 1 the first chunk is the same as for the static schedule,
// i.e. trip / nproc.
519 static int guided_int_param = 2;
520 static double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
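
// Worked example (illustrative): with the defaults n = 2, nproc = 4 and
// chunk = 7, the guided-iterative setup below computes
//   parm2 = 2 * 4 * (7 + 1) = 64   // switch to dynamic when <= 64 iters left
//   parm3 = 0.5 / 4 = 0.125        // fraction of remaining iters per grab
// so each guided grab takes roughly remaining * 0.125 iterations until fewer
// than 64 remain, after which plain dynamic chunks of size 'chunk' are used.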
521 
522 // UT - unsigned flavor of T, ST - signed flavor of T,
523 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
524 template <typename T>
525 static void
526 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
527                     T ub, typename traits_t<T>::signed_t st,
528                     typename traits_t<T>::signed_t chunk, int push_ws) {
529   typedef typename traits_t<T>::unsigned_t UT;
530   typedef typename traits_t<T>::signed_t ST;
531   typedef typename traits_t<T>::floating_t DBL;
532 
533   int active;
534   T tc;
535   kmp_info_t *th;
536   kmp_team_t *team;
537   kmp_uint32 my_buffer_index;
538   dispatch_private_info_template<T> *pr;
539   dispatch_shared_info_template<UT> volatile *sh;
540 
541   KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
542                    sizeof(dispatch_private_info));
543   KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
544                    sizeof(dispatch_shared_info));
545 
546   if (!TCR_4(__kmp_init_parallel))
547     __kmp_parallel_initialize();
548 
549 #if INCLUDE_SSC_MARKS
550   SSC_MARK_DISPATCH_INIT();
551 #endif
552 #ifdef KMP_DEBUG
553   {
554     const char *buff;
555     // create format specifiers before the debug output
556     buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
557                             "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
558                             traits_t<ST>::spec, traits_t<T>::spec,
559                             traits_t<T>::spec, traits_t<ST>::spec);
560     KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
561     __kmp_str_free(&buff);
562   }
563 #endif
564   /* setup data */
565   th = __kmp_threads[gtid];
566   team = th->th.th_team;
567   active = !team->t.t_serialized;
568   th->th.th_ident = loc;
569 
570 #if USE_ITT_BUILD
571   kmp_uint64 cur_chunk = chunk;
572   int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
573                                     __kmp_forkjoin_frames_mode == 3 &&
574                                     KMP_MASTER_GTID(gtid) &&
575 #if OMP_40_ENABLED
576                                     th->th.th_teams_microtask == NULL &&
577 #endif
578                                     team->t.t_active_level == 1;
579 #endif
580   if (!active) {
581     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
582         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
583   } else {
584     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
585                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
586 
587     my_buffer_index = th->th.th_dispatch->th_disp_index++;
588 
    /* What happens when the number of threads changes? Do we need to resize
       the buffer? */
590     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
591         &th->th.th_dispatch
592              ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
593     sh = reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
594         &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
595   }
596 
597 #if (KMP_STATIC_STEAL_ENABLED)
598   if (SCHEDULE_HAS_NONMONOTONIC(schedule))
599     // AC: we now have only one implementation of stealing, so use it
600     schedule = kmp_sch_static_steal;
601   else
602 #endif
603     schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
604 
605   /* Pick up the nomerge/ordered bits from the scheduling type */
606   if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
607     pr->nomerge = TRUE;
608     schedule =
609         (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
610   } else {
611     pr->nomerge = FALSE;
612   }
613   pr->type_size = traits_t<T>::type_size; // remember the size of variables
614   if (kmp_ord_lower & schedule) {
615     pr->ordered = TRUE;
616     schedule =
617         (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
618   } else {
619     pr->ordered = FALSE;
620   }
621 
622   if (schedule == kmp_sch_static) {
623     schedule = __kmp_static;
624   } else {
625     if (schedule == kmp_sch_runtime) {
626       // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
627       // not specified)
628       schedule = team->t.t_sched.r_sched_type;
629       // Detail the schedule if needed (global controls are differentiated
630       // appropriately)
631       if (schedule == kmp_sch_guided_chunked) {
632         schedule = __kmp_guided;
633       } else if (schedule == kmp_sch_static) {
634         schedule = __kmp_static;
635       }
636       // Use the chunk size specified by OMP_SCHEDULE (or default if not
637       // specified)
638       chunk = team->t.t_sched.chunk;
639 #if USE_ITT_BUILD
640       cur_chunk = chunk;
641 #endif
642 #ifdef KMP_DEBUG
643       {
644         const char *buff;
645         // create format specifiers before the debug output
646         buff = __kmp_str_format(
647             "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
648             traits_t<ST>::spec);
649         KD_TRACE(10, (buff, gtid, schedule, chunk));
650         __kmp_str_free(&buff);
651       }
652 #endif
653     } else {
654       if (schedule == kmp_sch_guided_chunked) {
655         schedule = __kmp_guided;
656       }
657       if (chunk <= 0) {
658         chunk = KMP_DEFAULT_CHUNK;
659       }
660     }
661 
662     if (schedule == kmp_sch_auto) {
663       // mapping and differentiation: in the __kmp_do_serial_initialize()
664       schedule = __kmp_auto;
665 #ifdef KMP_DEBUG
666       {
667         const char *buff;
668         // create format specifiers before the debug output
669         buff = __kmp_str_format("__kmp_dispatch_init: kmp_sch_auto: T#%%d new: "
670                                 "schedule:%%d chunk:%%%s\n",
671                                 traits_t<ST>::spec);
672         KD_TRACE(10, (buff, gtid, schedule, chunk));
673         __kmp_str_free(&buff);
674       }
675 #endif
676     }
677 
678     /* guided analytical not safe for too many threads */
679     if (schedule == kmp_sch_guided_analytical_chunked &&
680         th->th.th_team_nproc > 1 << 20) {
681       schedule = kmp_sch_guided_iterative_chunked;
682       KMP_WARNING(DispatchManyThreads);
683     }
684     if (schedule == kmp_sch_runtime_simd) {
685       // compiler provides simd_width in the chunk parameter
686       schedule = team->t.t_sched.r_sched_type;
687       // Detail the schedule if needed (global controls are differentiated
688       // appropriately)
689       if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
690           schedule == __kmp_static) {
691         schedule = kmp_sch_static_balanced_chunked;
692       } else {
693         if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
694           schedule = kmp_sch_guided_simd;
695         }
696         chunk = team->t.t_sched.chunk * chunk;
697       }
698 #if USE_ITT_BUILD
699       cur_chunk = chunk;
700 #endif
701 #ifdef KMP_DEBUG
702       {
703         const char *buff;
704         // create format specifiers before the debug output
705         buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d"
706                                 " chunk:%%%s\n",
707                                 traits_t<ST>::spec);
708         KD_TRACE(10, (buff, gtid, schedule, chunk));
709         __kmp_str_free(&buff);
710       }
711 #endif
712     }
713     pr->u.p.parm1 = chunk;
714   }
715   KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
716               "unknown scheduling type");
717 
718   pr->u.p.count = 0;
719 
720   if (__kmp_env_consistency_check) {
721     if (st == 0) {
722       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
723                             (pr->ordered ? ct_pdo_ordered : ct_pdo), loc);
724     }
725   }
726   // compute trip count
727   if (st == 1) { // most common case
728     if (ub >= lb) {
729       tc = ub - lb + 1;
730     } else { // ub < lb
731       tc = 0; // zero-trip
732     }
733   } else if (st < 0) {
734     if (lb >= ub) {
735       // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
736       // where the division needs to be unsigned regardless of the result type
737       tc = (UT)(lb - ub) / (-st) + 1;
738     } else { // lb < ub
739       tc = 0; // zero-trip
740     }
741   } else { // st > 0
742     if (ub >= lb) {
743       // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
744       // where the division needs to be unsigned regardless of the result type
745       tc = (UT)(ub - lb) / st + 1;
746     } else { // ub < lb
747       tc = 0; // zero-trip
748     }
749   }
750 
751   // Any half-decent optimizer will remove this test when the blocks are empty
752   // since the macros expand to nothing when statistics are disabled.
753   if (schedule == __kmp_static) {
754     KMP_COUNT_BLOCK(OMP_FOR_static);
755     KMP_COUNT_VALUE(FOR_static_iterations, tc);
756   } else {
757     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
758     KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
759   }
760 
761   pr->u.p.lb = lb;
762   pr->u.p.ub = ub;
763   pr->u.p.st = st;
764   pr->u.p.tc = tc;
765 
766 #if KMP_OS_WINDOWS
767   pr->u.p.last_upper = ub + st;
768 #endif /* KMP_OS_WINDOWS */
769 
  /* NOTE: only the active parallel region(s) have active ordered sections */
771 
772   if (active) {
773     if (pr->ordered == 0) {
774       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
775       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
776     } else {
777       pr->ordered_bumped = 0;
778 
779       pr->u.p.ordered_lower = 1;
780       pr->u.p.ordered_upper = 0;
781 
782       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
783       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
784     }
785   }
786 
787   if (__kmp_env_consistency_check) {
788     enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
789     if (push_ws) {
790       __kmp_push_workshare(gtid, ws, loc);
791       pr->pushed_ws = ws;
792     } else {
793       __kmp_check_workshare(gtid, ws, loc);
794       pr->pushed_ws = ct_none;
795     }
796   }
797 
798   switch (schedule) {
799 #if (KMP_STATIC_STEAL_ENABLED)
800   case kmp_sch_static_steal: {
801     T nproc = th->th.th_team_nproc;
802     T ntc, init;
803 
804     KD_TRACE(100,
805              ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid));
806 
807     ntc = (tc % chunk ? 1 : 0) + tc / chunk;
808     if (nproc > 1 && ntc >= nproc) {
809       KMP_COUNT_BLOCK(OMP_FOR_static_steal);
810       T id = __kmp_tid_from_gtid(gtid);
811       T small_chunk, extras;
812 
813       small_chunk = ntc / nproc;
814       extras = ntc % nproc;
815 
816       init = id * small_chunk + (id < extras ? id : extras);
817       pr->u.p.count = init;
818       pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
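      // Worked example (illustrative): tc = 100, chunk = 8 -> ntc = 13 chunks;
      // with nproc = 4: small_chunk = 3, extras = 1, so thread 0 initially
      // owns chunk indices [0,4) and threads 1..3 own [4,7), [7,10), [10,13).
      // Note that count/ub above are chunk indices, not iteration numbers.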
819 
820       pr->u.p.parm2 = lb;
821       // pr->pfields.parm3 = 0; // it's not used in static_steal
822       pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
823       pr->u.p.st = st;
824       if (traits_t<T>::type_size > 4) {
825         // AC: TODO: check if 16-byte CAS available and use it to
826         // improve performance (probably wait for explicit request
827         // before spending time on this).
828         // For now use dynamically allocated per-thread lock,
829         // free memory in __kmp_dispatch_next when status==0.
830         KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
831         th->th.th_dispatch->th_steal_lock =
832             (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
833         __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
834       }
835       break;
836     } else {
837       KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
838                      "kmp_sch_static_balanced\n",
839                      gtid));
840       schedule = kmp_sch_static_balanced;
841       /* too few iterations: fall-through to kmp_sch_static_balanced */
842     } // if
843     /* FALL-THROUGH to static balanced */
844   } // case
845 #endif
846   case kmp_sch_static_balanced: {
847     T nproc = th->th.th_team_nproc;
848     T init, limit;
849 
850     KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
851                    gtid));
852 
853     if (nproc > 1) {
854       T id = __kmp_tid_from_gtid(gtid);
855 
856       if (tc < nproc) {
857         if (id < tc) {
858           init = id;
859           limit = id;
860           pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
861         } else {
862           pr->u.p.count = 1; /* means no more chunks to execute */
863           pr->u.p.parm1 = FALSE;
864           break;
865         }
866       } else {
867         T small_chunk = tc / nproc;
868         T extras = tc % nproc;
869         init = id * small_chunk + (id < extras ? id : extras);
870         limit = init + small_chunk - (id < extras ? 0 : 1);
871         pr->u.p.parm1 = (id == nproc - 1);
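        // Worked example (illustrative): for tc = 10 and nproc = 4 we get
        // small_chunk = 2 and extras = 2, so threads 0..3 get iterations
        // [0,2], [3,5], [6,7] and [8,9] (the first 'extras' threads get one
        // extra iteration each).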
872       }
873     } else {
874       if (tc > 0) {
875         init = 0;
876         limit = tc - 1;
877         pr->u.p.parm1 = TRUE;
878       } else { // zero trip count
879         pr->u.p.count = 1; /* means no more chunks to execute */
880         pr->u.p.parm1 = FALSE;
881         break;
882       }
883     }
884 #if USE_ITT_BUILD
885     // Calculate chunk for metadata report
886     if (itt_need_metadata_reporting)
887       cur_chunk = limit - init + 1;
888 #endif
889     if (st == 1) {
890       pr->u.p.lb = lb + init;
891       pr->u.p.ub = lb + limit;
892     } else {
      // ub_tmp is the calculated upper bound; "ub" is the user-defined upper
      // bound
894       T ub_tmp = lb + limit * st;
895       pr->u.p.lb = lb + init * st;
896       // adjust upper bound to "ub" if needed, so that MS lastprivate will match
897       // it exactly
898       if (st > 0) {
899         pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
900       } else {
901         pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
902       }
903     }
904     if (pr->ordered) {
905       pr->u.p.ordered_lower = init;
906       pr->u.p.ordered_upper = limit;
907     }
908     break;
909   } // case
910   case kmp_sch_static_balanced_chunked: {
911     // similar to balanced, but chunk adjusted to multiple of simd width
912     T nth = th->th.th_team_nproc;
913     KD_TRACE(100, ("__kmp_dispatch_init: T#%d runtime(simd:static)"
914                    " -> falling-through to static_greedy\n",
915                    gtid));
916     schedule = kmp_sch_static_greedy;
917     if (nth > 1)
918       pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
919     else
920       pr->u.p.parm1 = tc;
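    // Worked example (illustrative): tc = 1000, nth = 4 and a simd width
    // chunk = 8 give ceil(1000 / 4) = 250, rounded up to a multiple of 8,
    // i.e. parm1 = 256 (the bit-mask rounding above assumes the simd width
    // is a power of two).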
921     break;
922   } // case
923   case kmp_sch_guided_iterative_chunked:
924   case kmp_sch_guided_simd: {
925     T nproc = th->th.th_team_nproc;
926     KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked"
927                    " case\n",
928                    gtid));
929 
930     if (nproc > 1) {
931       if ((2L * chunk + 1) * nproc >= tc) {
932         /* chunk size too large, switch to dynamic */
933         schedule = kmp_sch_dynamic_chunked;
934       } else {
935         // when remaining iters become less than parm2 - switch to dynamic
936         pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
937         *(double *)&pr->u.p.parm3 =
938             guided_flt_param / nproc; // may occupy parm3 and parm4
939       }
940     } else {
941       KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
942                      "kmp_sch_static_greedy\n",
943                      gtid));
944       schedule = kmp_sch_static_greedy;
945       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
946       KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",
947                      gtid));
948       pr->u.p.parm1 = tc;
949     } // if
950   } // case
951   break;
952   case kmp_sch_guided_analytical_chunked: {
953     T nproc = th->th.th_team_nproc;
954     KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked"
955                    " case\n",
956                    gtid));
957     if (nproc > 1) {
958       if ((2L * chunk + 1) * nproc >= tc) {
959         /* chunk size too large, switch to dynamic */
960         schedule = kmp_sch_dynamic_chunked;
961       } else {
962         /* commonly used term: (2 nproc - 1)/(2 nproc) */
963         DBL x;
964 
965 #if KMP_OS_WINDOWS && KMP_ARCH_X86
966         /* Linux* OS already has 64-bit computation by default for long double,
967            and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
968            Windows* OS on IA-32 architecture, we need to set precision to 64-bit
969            instead of the default 53-bit. Even though long double doesn't work
970            on Windows* OS on Intel(R) 64, the resulting lack of precision is not
971            expected to impact the correctness of the algorithm, but this has not
972            been mathematically proven. */
973         // save original FPCW and set precision to 64-bit, as
974         // Windows* OS on IA-32 architecture defaults to 53-bit
975         unsigned int oldFpcw = _control87(0, 0);
976         _control87(_PC_64, _MCW_PC); // 0,0x30000
977 #endif
978         /* value used for comparison in solver for cross-over point */
979         long double target = ((long double)chunk * 2 + 1) * nproc / tc;
980 
981         /* crossover point--chunk indexes equal to or greater than
982            this point switch to dynamic-style scheduling */
983         UT cross;
984 
985         /* commonly used term: (2 nproc - 1)/(2 nproc) */
986         x = (long double)1.0 - (long double)0.5 / nproc;
987 
988 #ifdef KMP_DEBUG
989         { // test natural alignment
990           struct _test_a {
991             char a;
992             union {
993               char b;
994               DBL d;
995             };
996           } t;
997           ptrdiff_t natural_alignment =
998               (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
999           //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
1000           // long)natural_alignment );
1001           KMP_DEBUG_ASSERT(
1002               (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
1003         }
1004 #endif // KMP_DEBUG
1005 
1006         /* save the term in thread private dispatch structure */
1007         *(DBL *)&pr->u.p.parm3 = x;
1008 
1009         /* solve for the crossover point to the nearest integer i for which C_i
1010            <= chunk */
1011         {
1012           UT left, right, mid;
1013           long double p;
1014 
1015           /* estimate initial upper and lower bound */
1016 
1017           /* doesn't matter what value right is as long as it is positive, but
1018              it affects performance of the solver */
1019           right = 229;
1020           p = __kmp_pow<UT>(x, right);
1021           if (p > target) {
1022             do {
1023               p *= p;
1024               right <<= 1;
1025             } while (p > target && right < (1 << 27));
1026             /* lower bound is previous (failed) estimate of upper bound */
1027             left = right >> 1;
1028           } else {
1029             left = 0;
1030           }
1031 
1032           /* bisection root-finding method */
1033           while (left + 1 < right) {
1034             mid = (left + right) / 2;
1035             if (__kmp_pow<UT>(x, mid) > target) {
1036               left = mid;
1037             } else {
1038               right = mid;
1039             }
1040           } // while
1041           cross = right;
1042         }
1043         /* assert sanity of computed crossover point */
1044         KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
1045                    __kmp_pow<UT>(x, cross) <= target);
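        // Worked example (illustrative): nproc = 4, chunk = 7, tc = 1000 give
        // x = 0.875 and target = 15 * 4 / 1000 = 0.06; the bisection finds
        // cross = 22, since 0.875^21 ~= 0.0606 > 0.06 >= 0.875^22 ~= 0.0530.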
1046 
1047         /* save the crossover point in thread private dispatch structure */
1048         pr->u.p.parm2 = cross;
1049 
1050 // C75803
1051 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
1052 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
1053 #else
1054 #define GUIDED_ANALYTICAL_WORKAROUND (x)
1055 #endif
1056         /* dynamic-style scheduling offset */
1057         pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
1058                                  tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
1059                         cross * chunk;
1060 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1061         // restore FPCW
1062         _control87(oldFpcw, _MCW_PC);
1063 #endif
1064       } // if
1065     } else {
1066       KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
1067                      "kmp_sch_static_greedy\n",
1068                      gtid));
1069       schedule = kmp_sch_static_greedy;
1070       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1071       pr->u.p.parm1 = tc;
1072     } // if
1073   } // case
1074   break;
1075   case kmp_sch_static_greedy:
1076     KD_TRACE(100,
1077              ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n", gtid));
1078     pr->u.p.parm1 = (th->th.th_team_nproc > 1)
1079                         ? (tc + th->th.th_team_nproc - 1) / th->th.th_team_nproc
1080                         : tc;
1081     break;
1082   case kmp_sch_static_chunked:
1083   case kmp_sch_dynamic_chunked:
1084     if (pr->u.p.parm1 <= 0) {
1085       pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
1086     }
1087     KD_TRACE(100, ("__kmp_dispatch_init: T#%d "
1088                    "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
1089                    gtid));
1090     break;
1091   case kmp_sch_trapezoidal: {
1092     /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1093 
1094     T parm1, parm2, parm3, parm4;
1095     KD_TRACE(100,
1096              ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid));
1097 
1098     parm1 = chunk;
1099 
1100     /* F : size of the first cycle */
1101     parm2 = (tc / (2 * th->th.th_team_nproc));
1102 
1103     if (parm2 < 1) {
1104       parm2 = 1;
1105     }
1106 
1107     /* L : size of the last cycle.  Make sure the last cycle is not larger
1108        than the first cycle. */
1109     if (parm1 < 1) {
1110       parm1 = 1;
1111     } else if (parm1 > parm2) {
1112       parm1 = parm2;
1113     }
1114 
1115     /* N : number of cycles */
1116     parm3 = (parm2 + parm1);
1117     parm3 = (2 * tc + parm3 - 1) / parm3;
1118 
1119     if (parm3 < 2) {
1120       parm3 = 2;
1121     }
1122 
1123     /* sigma : decreasing incr of the trapezoid */
1124     parm4 = (parm3 - 1);
1125     parm4 = (parm2 - parm1) / parm4;
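    // Worked example (illustrative): tc = 1000, nproc = 4, chunk = 1 give
    //   parm2 (first chunk F)       = 1000 / 8 = 125
    //   parm1 (last chunk L)        = 1
    //   parm3 (number of cycles N)  = ceil(2 * 1000 / (125 + 1)) = 16
    //   parm4 (decrement sigma)     = (125 - 1) / 15 = 8
    // so successive dispatches hand out about 125, 117, 109, ... iterations.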
1126 
1127     // pointless check, because parm4 >= 0 always
1128     // if ( parm4 < 0 ) {
1129     //    parm4 = 0;
1130     //}
1131 
1132     pr->u.p.parm1 = parm1;
1133     pr->u.p.parm2 = parm2;
1134     pr->u.p.parm3 = parm3;
1135     pr->u.p.parm4 = parm4;
1136   } // case
1137   break;
1138 
1139   default: {
1140     __kmp_msg(kmp_ms_fatal, // Severity
1141               KMP_MSG(UnknownSchedTypeDetected), // Primary message
1142               KMP_HNT(GetNewerLibrary), // Hint
1143               __kmp_msg_null // Variadic argument list terminator
1144               );
1145   } break;
1146   } // switch
1147   pr->schedule = schedule;
1148   if (active) {
    /* Wait until this buffer (indexed by my_buffer_index) is free to use,
     * i.e. until sh->buffer_index catches up with my_buffer_index */
1151 
1152     KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
1153                    "sh->buffer_index:%d\n",
1154                    gtid, my_buffer_index, sh->buffer_index));
1155     __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index,
1156                                  __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    // Note: KMP_WAIT_YIELD() cannot be used here: the buffer index and
    // my_buffer_index are *always* 32-bit integers.
1159     KMP_MB(); /* is this necessary? */
1160     KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
1161                    "sh->buffer_index:%d\n",
1162                    gtid, my_buffer_index, sh->buffer_index));
1163 
1164     th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
1165     th->th.th_dispatch->th_dispatch_sh_current =
1166         CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
1167 #if USE_ITT_BUILD
1168     if (pr->ordered) {
1169       __kmp_itt_ordered_init(gtid);
1170     }; // if
1171     // Report loop metadata
1172     if (itt_need_metadata_reporting) {
1173       // Only report metadata by master of active team at level 1
1174       kmp_uint64 schedtype = 0;
1175       switch (schedule) {
1176       case kmp_sch_static_chunked:
1177       case kmp_sch_static_balanced: // Chunk is calculated in the switch above
1178         break;
1179       case kmp_sch_static_greedy:
1180         cur_chunk = pr->u.p.parm1;
1181         break;
1182       case kmp_sch_dynamic_chunked:
1183         schedtype = 1;
1184         break;
1185       case kmp_sch_guided_iterative_chunked:
1186       case kmp_sch_guided_analytical_chunked:
1187       case kmp_sch_guided_simd:
1188         schedtype = 2;
1189         break;
1190       default:
1191         // Should we put this case under "static"?
1192         // case kmp_sch_static_steal:
1193         schedtype = 3;
1194         break;
1195       }
1196       __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1197     }
1198 #endif /* USE_ITT_BUILD */
1199   }; // if
1200 
1201 #ifdef KMP_DEBUG
1202   {
1203     const char *buff;
1204     // create format specifiers before the debug output
1205     buff = __kmp_str_format(
1206         "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
1207         "lb:%%%s ub:%%%s"
1208         " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
1209         " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1210         traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
1211         traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1212         traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
1213         traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
1214     KD_TRACE(10, (buff, gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1215                   pr->u.p.st, pr->u.p.tc, pr->u.p.count, pr->u.p.ordered_lower,
1216                   pr->u.p.ordered_upper, pr->u.p.parm1, pr->u.p.parm2,
1217                   pr->u.p.parm3, pr->u.p.parm4));
1218     __kmp_str_free(&buff);
1219   }
1220 #endif
1221 #if (KMP_STATIC_STEAL_ENABLED)
  // It cannot be guaranteed that after execution of a loop with some other
  // schedule kind all the parm3 variables will contain the same value. Even
  // if all parm3 were the same, there would still be a bad case, such as
  // using 0 and 1 rather than a program-lifetime increment. So a dedicated
  // variable is required; 'static_steal_counter' is used.
1227   if (schedule == kmp_sch_static_steal) {
    // Other threads will inspect this variable when searching for a victim.
    // This is a flag showing that other threads may steal from this thread
    // from now on.
1231     volatile T *p = &pr->u.p.static_steal_counter;
1232     *p = *p + 1;
1233   }
1234 #endif // ( KMP_STATIC_STEAL_ENABLED )
1235 
1236 #if OMPT_SUPPORT && OMPT_TRACE
1237   if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
1238     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1239     ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
1240     ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
1241         team_info->parallel_id, task_info->task_id, team_info->microtask);
1242   }
1243 #endif
1244 }
1245 
1246 /* For ordered loops, either __kmp_dispatch_finish() should be called after
1247  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1248  * every chunk of iterations.  If the ordered section(s) were not executed
1249  * for this iteration (or every iteration in this chunk), we need to set the
1250  * ordered iteration counters so that the next thread can proceed. */
1251 template <typename UT>
1252 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
1253   typedef typename traits_t<UT>::signed_t ST;
1254   kmp_info_t *th = __kmp_threads[gtid];
1255 
1256   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1257   if (!th->th.th_team->t.t_serialized) {
1258 
1259     dispatch_private_info_template<UT> *pr =
1260         reinterpret_cast<dispatch_private_info_template<UT> *>(
1261             th->th.th_dispatch->th_dispatch_pr_current);
1262     dispatch_shared_info_template<UT> volatile *sh =
1263         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1264             th->th.th_dispatch->th_dispatch_sh_current);
1265     KMP_DEBUG_ASSERT(pr);
1266     KMP_DEBUG_ASSERT(sh);
1267     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1268                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1269 
1270     if (pr->ordered_bumped) {
1271       KD_TRACE(
1272           1000,
1273           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1274            gtid));
1275       pr->ordered_bumped = 0;
1276     } else {
1277       UT lower = pr->u.p.ordered_lower;
1278 
1279 #ifdef KMP_DEBUG
1280       {
1281         const char *buff;
1282         // create format specifiers before the debug output
1283         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1284                                 "ordered_iteration:%%%s lower:%%%s\n",
1285                                 traits_t<UT>::spec, traits_t<UT>::spec);
1286         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1287         __kmp_str_free(&buff);
1288       }
1289 #endif
1290 
1291       __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1292                            __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1293       KMP_MB(); /* is this necessary? */
1294 #ifdef KMP_DEBUG
1295       {
1296         const char *buff;
1297         // create format specifiers before the debug output
1298         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1299                                 "ordered_iteration:%%%s lower:%%%s\n",
1300                                 traits_t<UT>::spec, traits_t<UT>::spec);
1301         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1302         __kmp_str_free(&buff);
1303       }
1304 #endif
1305 
1306       test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1307     } // if
1308   } // if
1309   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1310 }
1311 
1312 #ifdef KMP_GOMP_COMPAT
1313 
1314 template <typename UT>
1315 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1316   typedef typename traits_t<UT>::signed_t ST;
1317   kmp_info_t *th = __kmp_threads[gtid];
1318 
1319   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1320   if (!th->th.th_team->t.t_serialized) {
1321     //        int cid;
1322     dispatch_private_info_template<UT> *pr =
1323         reinterpret_cast<dispatch_private_info_template<UT> *>(
1324             th->th.th_dispatch->th_dispatch_pr_current);
1325     dispatch_shared_info_template<UT> volatile *sh =
1326         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1327             th->th.th_dispatch->th_dispatch_sh_current);
1328     KMP_DEBUG_ASSERT(pr);
1329     KMP_DEBUG_ASSERT(sh);
1330     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1331                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1332 
1333     //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1334     UT lower = pr->u.p.ordered_lower;
1335     UT upper = pr->u.p.ordered_upper;
1336     UT inc = upper - lower + 1;
1337 
1338     if (pr->ordered_bumped == inc) {
1339       KD_TRACE(
1340           1000,
1341           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1342            gtid));
1343       pr->ordered_bumped = 0;
1344     } else {
1345       inc -= pr->ordered_bumped;
1346 
1347 #ifdef KMP_DEBUG
1348       {
1349         const char *buff;
1350         // create format specifiers before the debug output
1351         buff = __kmp_str_format(
1352             "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1353             "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1354             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1355         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1356         __kmp_str_free(&buff);
1357       }
1358 #endif
1359 
1360       __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1361                            __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1362 
1363       KMP_MB(); /* is this necessary? */
1364       KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1365                       "ordered_bumped to zero\n",
1366                       gtid));
1367       pr->ordered_bumped = 0;
1368 //!!!!! TODO check if the inc should be unsigned, or signed???
1369 #ifdef KMP_DEBUG
1370       {
1371         const char *buff;
1372         // create format specifiers before the debug output
1373         buff = __kmp_str_format(
1374             "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1375             "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1376             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1377             traits_t<UT>::spec);
1378         KD_TRACE(1000,
1379                  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1380         __kmp_str_free(&buff);
1381       }
1382 #endif
1383 
1384       test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1385     }
1386     //        }
1387   }
1388   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1389 }
1390 
1391 #endif /* KMP_GOMP_COMPAT */
1392 
1393 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1394    work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1395    is not called. */
1396 #if OMPT_SUPPORT && OMPT_TRACE
1397 #define OMPT_LOOP_END                                                          \
1398   if (status == 0) {                                                           \
1399     if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_end)) {   \
1400       ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
1401       ompt_task_info_t *task_info = __ompt_get_taskinfo(0);                    \
1402       ompt_callbacks.ompt_callback(ompt_event_loop_end)(                       \
1403           team_info->parallel_id, task_info->task_id);                         \
1404     }                                                                          \
1405   }
1406 #else
1407 #define OMPT_LOOP_END // no-op
1408 #endif
1409 
1410 template <typename T>
1411 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1412                                T *p_lb, T *p_ub,
1413                                typename traits_t<T>::signed_t *p_st) {
1414 
1415   typedef typename traits_t<T>::unsigned_t UT;
1416   typedef typename traits_t<T>::signed_t ST;
1417   typedef typename traits_t<T>::floating_t DBL;
1418 
  // This is potentially slightly misleading: schedule(runtime) will appear
  // here even if the actual runtime schedule is static. (Which points out a
  // disadvantage of schedule(runtime): even when static scheduling is used it
  // costs more than a compile-time choice to use static scheduling would.)
1423   KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling);
1424 
1425   int status;
1426   dispatch_private_info_template<T> *pr;
1427   kmp_info_t *th = __kmp_threads[gtid];
1428   kmp_team_t *team = th->th.th_team;
1429 
1430   KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1431 #ifdef KMP_DEBUG
1432   {
1433     const char *buff;
1434     // create format specifiers before the debug output
1435     buff = __kmp_str_format("__kmp_dispatch_next: T#%%d called p_lb:%%%s "
1436                             "p_ub:%%%s p_st:%%%s p_last: %%p\n",
1437                             traits_t<T>::spec, traits_t<T>::spec,
1438                             traits_t<ST>::spec);
1439     KD_TRACE(1000, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last));
1440     __kmp_str_free(&buff);
1441   }
1442 #endif
1443 
1444   if (team->t.t_serialized) {
    /* NOTE: serialize this dispatch because we are not at the active level */
1446     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1447         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1448     KMP_DEBUG_ASSERT(pr);
1449 
1450     if ((status = (pr->u.p.tc != 0)) == 0) {
1451       *p_lb = 0;
1452       *p_ub = 0;
1453       //            if ( p_last != NULL )
1454       //                *p_last = 0;
1455       if (p_st != NULL)
1456         *p_st = 0;
1457       if (__kmp_env_consistency_check) {
1458         if (pr->pushed_ws != ct_none) {
1459           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1460         }
1461       }
1462     } else if (pr->nomerge) {
1463       kmp_int32 last;
1464       T start;
1465       UT limit, trip, init;
1466       ST incr;
1467       T chunk = pr->u.p.parm1;
1468 
1469       KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1470                      gtid));
1471 
1472       init = chunk * pr->u.p.count++;
1473       trip = pr->u.p.tc - 1;
1474 
1475       if ((status = (init <= trip)) == 0) {
1476         *p_lb = 0;
1477         *p_ub = 0;
1478         //                if ( p_last != NULL )
1479         //                    *p_last = 0;
1480         if (p_st != NULL)
1481           *p_st = 0;
1482         if (__kmp_env_consistency_check) {
1483           if (pr->pushed_ws != ct_none) {
1484             pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1485           }
1486         }
1487       } else {
1488         start = pr->u.p.lb;
1489         limit = chunk + init - 1;
1490         incr = pr->u.p.st;
1491 
1492         if ((last = (limit >= trip)) != 0) {
1493           limit = trip;
1494 #if KMP_OS_WINDOWS
1495           pr->u.p.last_upper = pr->u.p.ub;
1496 #endif /* KMP_OS_WINDOWS */
1497         }
1498         if (p_last != NULL)
1499           *p_last = last;
1500         if (p_st != NULL)
1501           *p_st = incr;
1502         if (incr == 1) {
1503           *p_lb = start + init;
1504           *p_ub = start + limit;
1505         } else {
1506           *p_lb = start + init * incr;
1507           *p_ub = start + limit * incr;
1508         }
1509 
1510         if (pr->ordered) {
1511           pr->u.p.ordered_lower = init;
1512           pr->u.p.ordered_upper = limit;
1513 #ifdef KMP_DEBUG
1514           {
1515             const char *buff;
1516             // create format specifiers before the debug output
1517             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1518                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
1519                                     traits_t<UT>::spec, traits_t<UT>::spec);
1520             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1521                             pr->u.p.ordered_upper));
1522             __kmp_str_free(&buff);
1523           }
1524 #endif
1525         } // if
1526       } // if
1527     } else {
1528       pr->u.p.tc = 0;
1529       *p_lb = pr->u.p.lb;
1530       *p_ub = pr->u.p.ub;
1531 #if KMP_OS_WINDOWS
1532       pr->u.p.last_upper = *p_ub;
1533 #endif /* KMP_OS_WINDOWS */
1534       if (p_last != NULL)
1535         *p_last = TRUE;
1536       if (p_st != NULL)
1537         *p_st = pr->u.p.st;
1538     } // if
1539 #ifdef KMP_DEBUG
1540     {
1541       const char *buff;
1542       // create format specifiers before the debug output
1543       buff = __kmp_str_format(
1544           "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
1545           "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
1546           traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1547       KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
1548       __kmp_str_free(&buff);
1549     }
1550 #endif
1551 #if INCLUDE_SSC_MARKS
1552     SSC_MARK_DISPATCH_NEXT();
1553 #endif
1554     OMPT_LOOP_END;
1555     return status;
1556   } else {
1557     kmp_int32 last = 0;
1558     dispatch_shared_info_template<UT> *sh;
1559     T start;
1560     ST incr;
1561     UT limit, trip, init;
1562 
1563     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1564                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1565 
1566     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1567         th->th.th_dispatch->th_dispatch_pr_current);
1568     KMP_DEBUG_ASSERT(pr);
1569     sh = reinterpret_cast<dispatch_shared_info_template<UT> *>(
1570         th->th.th_dispatch->th_dispatch_sh_current);
1571     KMP_DEBUG_ASSERT(sh);
1572 
1573     if (pr->u.p.tc == 0) {
1574       // zero trip count
1575       status = 0;
1576     } else {
1577       switch (pr->schedule) {
1578 #if (KMP_STATIC_STEAL_ENABLED)
1579       case kmp_sch_static_steal: {
1580         T chunk = pr->u.p.parm1;
1581         int nproc = th->th.th_team_nproc;
1582 
1583         KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n",
1584                        gtid));
1585 
1586         trip = pr->u.p.tc - 1;
1587 
1588         if (traits_t<T>::type_size > 4) {
1589           // use lock for 8-byte and CAS for 4-byte induction
1590           // variable. TODO (optional): check and use 16-byte CAS
1591           kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
1592           KMP_DEBUG_ASSERT(lck != NULL);
1593           if (pr->u.p.count < (UT)pr->u.p.ub) {
1594             __kmp_acquire_lock(lck, gtid);
1595             // try to get own chunk of iterations
1596             init = (pr->u.p.count)++;
1597             status = (init < (UT)pr->u.p.ub);
1598             __kmp_release_lock(lck, gtid);
1599           } else {
1600             status = 0; // no own chunks
1601           }
1602           if (!status) { // try to steal
1603             kmp_info_t **other_threads = team->t.t_threads;
1604             int while_limit = nproc; // nproc attempts to find a victim
1605             int while_index = 0;
            // TODO: the algorithm for searching for a victim
            // should be cleaned up and measured
1608             while ((!status) && (while_limit != ++while_index)) {
1609               T remaining;
1610               T victimIdx = pr->u.p.parm4;
1611               T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1612               dispatch_private_info_template<T> *victim =
1613                   reinterpret_cast<dispatch_private_info_template<T> *>(
1614                       other_threads[victimIdx]
1615                           ->th.th_dispatch->th_dispatch_pr_current);
1616               while ((victim == NULL || victim == pr ||
1617                       (*(volatile T *)&victim->u.p.static_steal_counter !=
1618                        *(volatile T *)&pr->u.p.static_steal_counter)) &&
1619                      oldVictimIdx != victimIdx) {
1620                 victimIdx = (victimIdx + 1) % nproc;
1621                 victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1622                     other_threads[victimIdx]
1623                         ->th.th_dispatch->th_dispatch_pr_current);
1624               };
1625               if (!victim ||
1626                   (*(volatile T *)&victim->u.p.static_steal_counter !=
1627                    *(volatile T *)&pr->u.p.static_steal_counter)) {
1628                 continue; // try once more (nproc attempts in total)
1629                 // no victim is ready yet to participate in stealing
1630                 // because all victims are still in kmp_init_dispatch
1631               }
1632               if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1633                 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1634                 continue; // not enough chunks to steal, goto next victim
1635               }
1636 
1637               lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1638               KMP_ASSERT(lck != NULL);
1639               __kmp_acquire_lock(lck, gtid);
1640               limit = victim->u.p.ub; // keep initial ub
1641               if (victim->u.p.count >= limit ||
1642                   (remaining = limit - victim->u.p.count) < 2) {
1643                 __kmp_release_lock(lck, gtid);
1644                 pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1645                 continue; // not enough chunks to steal
1646               }
              // stealing succeeded, reduce victim's ub by 1/4 of undone chunks
              // or by 1
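              // Illustrative numbers (hypothetical): if the victim holds
              // count==4 and ub==20 (16 undone chunks), the thief lowers the
              // victim's ub to 16, takes chunk 16 for itself (init==16) and,
              // below, records count==17, ub==20, so it also owns chunks
              // 17..19 while the victim keeps chunks 4..15.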
1649               if (remaining > 3) {
1650                 KMP_COUNT_VALUE(FOR_static_steal_stolen, remaining >> 2);
1651                 init = (victim->u.p.ub -=
1652                         (remaining >> 2)); // steal 1/4 of remaining
1653               } else {
1654                 KMP_COUNT_VALUE(FOR_static_steal_stolen, 1);
1655                 init =
1656                     (victim->u.p.ub -= 1); // steal 1 chunk of 2 or 3 remaining
1657               }
1658               __kmp_release_lock(lck, gtid);
1659 
1660               KMP_DEBUG_ASSERT(init + 1 <= limit);
1661               pr->u.p.parm4 = victimIdx; // remember victim to steal from
1662               status = 1;
1663               while_index = 0;
              // now update own count and ub to cover the stolen range,
              // excluding the init chunk being returned right now
1665               __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1666               pr->u.p.count = init + 1;
1667               pr->u.p.ub = limit;
1668               __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1669             } // while (search for victim)
1670           } // if (try to find victim and steal)
1671         } else {
1672           // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1673           typedef union {
1674             struct {
1675               UT count;
1676               T ub;
1677             } p;
1678             kmp_int64 b;
1679           } union_i4;
          // Any update of 'count' or 'ub' must modify the (count, ub) pair
          // atomically, hence the 64-bit operations on the union.
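          // Illustration of the packing (hypothetical values): with a 4-byte
          // induction variable, 'count' and 'ub' occupy the two halves of the
          // 64-bit word 'b', so a single 64-bit CAS can either claim the next
          // chunk (count: 5 -> 6 while ub stays 20) or shrink the range when a
          // thief steals (ub: 20 -> 16 while count stays 5), but no thread can
          // ever observe a half-updated pair.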
1682           {
1683             union_i4 vold, vnew;
1684             vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1685             vnew = vold;
1686             vnew.p.count++;
1687             while (!KMP_COMPARE_AND_STORE_ACQ64(
1688                 (volatile kmp_int64 *)&pr->u.p.count,
1689                 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1690                 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1691               KMP_CPU_PAUSE();
1692               vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1693               vnew = vold;
1694               vnew.p.count++;
1695             }
1696             vnew = vold;
1697             init = vnew.p.count;
1698             status = (init < (UT)vnew.p.ub);
1699           }
1700 
1701           if (!status) {
1702             kmp_info_t **other_threads = team->t.t_threads;
1703             int while_limit = nproc; // nproc attempts to find a victim
1704             int while_index = 0;
1705 
            // TODO: the algorithm for searching for a victim
            // should be cleaned up and measured
1708             while ((!status) && (while_limit != ++while_index)) {
1709               union_i4 vold, vnew;
1710               kmp_int32 remaining;
1711               T victimIdx = pr->u.p.parm4;
1712               T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1713               dispatch_private_info_template<T> *victim =
1714                   reinterpret_cast<dispatch_private_info_template<T> *>(
1715                       other_threads[victimIdx]
1716                           ->th.th_dispatch->th_dispatch_pr_current);
1717               while ((victim == NULL || victim == pr ||
1718                       (*(volatile T *)&victim->u.p.static_steal_counter !=
1719                        *(volatile T *)&pr->u.p.static_steal_counter)) &&
1720                      oldVictimIdx != victimIdx) {
1721                 victimIdx = (victimIdx + 1) % nproc;
1722                 victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1723                     other_threads[victimIdx]
1724                         ->th.th_dispatch->th_dispatch_pr_current);
1725               };
1726               if (!victim ||
1727                   (*(volatile T *)&victim->u.p.static_steal_counter !=
1728                    *(volatile T *)&pr->u.p.static_steal_counter)) {
1729                 continue; // try once more (nproc attempts in total)
1730                 // no victim is ready yet to participate in stealing
1731                 // because all victims are still in kmp_init_dispatch
1732               }
1733               pr->u.p.parm4 = victimIdx; // new victim found
1734               while (1) { // CAS loop if victim has enough chunks to steal
1735                 vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1736                 vnew = vold;
1737 
1738                 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1739                 if (vnew.p.count >= (UT)vnew.p.ub ||
1740                     (remaining = vnew.p.ub - vnew.p.count) < 2) {
1741                   pr->u.p.parm4 =
1742                       (victimIdx + 1) % nproc; // shift start victim id
1743                   break; // not enough chunks to steal, goto next victim
1744                 }
1745                 if (remaining > 3) {
1746                   vnew.p.ub -= (remaining >> 2); // try to steal 1/4 remaining
1747                 } else {
1748                   vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1749                 }
1750                 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1751                 // TODO: Should this be acquire or release?
1752                 if (KMP_COMPARE_AND_STORE_ACQ64(
1753                         (volatile kmp_int64 *)&victim->u.p.count,
1754                         *VOLATILE_CAST(kmp_int64 *) & vold.b,
1755                         *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1756                   // stealing succeeded
1757                   KMP_COUNT_VALUE(FOR_static_steal_stolen,
1758                                   vold.p.ub - vnew.p.ub);
1759                   status = 1;
1760                   while_index = 0;
1761                   // now update own count and ub
1762                   init = vnew.p.ub;
1763                   vold.p.count = init + 1;
1764 #if KMP_ARCH_X86
1765                   KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count),
1766                                    vold.b);
1767 #else
1768                   *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1769 #endif
1770                   break;
1771                 } // if (check CAS result)
1772                 KMP_CPU_PAUSE(); // CAS failed, repeat attempt
1773               } // while (try to steal from particular victim)
1774             } // while (search for victim)
1775           } // if (try to find victim and steal)
1776         } // if (4-byte induction variable)
1777         if (!status) {
1778           *p_lb = 0;
1779           *p_ub = 0;
1780           if (p_st != NULL)
1781             *p_st = 0;
1782         } else {
1783           start = pr->u.p.parm2;
1784           init *= chunk;
1785           limit = chunk + init - 1;
1786           incr = pr->u.p.st;
1787           KMP_COUNT_VALUE(FOR_static_steal_chunks, 1);
1788 
1789           KMP_DEBUG_ASSERT(init <= trip);
1790           if ((last = (limit >= trip)) != 0)
1791             limit = trip;
1792           if (p_st != NULL)
1793             *p_st = incr;
1794 
1795           if (incr == 1) {
1796             *p_lb = start + init;
1797             *p_ub = start + limit;
1798           } else {
1799             *p_lb = start + init * incr;
1800             *p_ub = start + limit * incr;
1801           }
1802 
1803           if (pr->ordered) {
1804             pr->u.p.ordered_lower = init;
1805             pr->u.p.ordered_upper = limit;
1806 #ifdef KMP_DEBUG
1807             {
1808               const char *buff;
1809               // create format specifiers before the debug output
1810               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1811                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
1812                                       traits_t<UT>::spec, traits_t<UT>::spec);
1813               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1814                               pr->u.p.ordered_upper));
1815               __kmp_str_free(&buff);
1816             }
1817 #endif
1818           } // if
1819         } // if
1820         break;
1821       } // case
1822 #endif // ( KMP_STATIC_STEAL_ENABLED )
1823       case kmp_sch_static_balanced: {
1824         KD_TRACE(
1825             100,
1826             ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid));
1827         if ((status = !pr->u.p.count) !=
1828             0) { /* check if thread has any iteration to do */
1829           pr->u.p.count = 1;
1830           *p_lb = pr->u.p.lb;
1831           *p_ub = pr->u.p.ub;
1832           last = pr->u.p.parm1;
1833           if (p_st != NULL)
1834             *p_st = pr->u.p.st;
1835         } else { /* no iterations to do */
1836           pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1837         }
1838         if (pr->ordered) {
1839 #ifdef KMP_DEBUG
1840           {
1841             const char *buff;
1842             // create format specifiers before the debug output
1843             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1844                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
1845                                     traits_t<UT>::spec, traits_t<UT>::spec);
1846             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1847                             pr->u.p.ordered_upper));
1848             __kmp_str_free(&buff);
1849           }
1850 #endif
1851         } // if
1852       } // case
1853       break;
1854       case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1855                                      merged here */
1856       case kmp_sch_static_chunked: {
1857         T parm1;
1858 
1859         KD_TRACE(100, ("__kmp_dispatch_next: T#%d "
1860                        "kmp_sch_static_[affinity|chunked] case\n",
1861                        gtid));
1862         parm1 = pr->u.p.parm1;
1863 
1864         trip = pr->u.p.tc - 1;
1865         init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1866 
1867         if ((status = (init <= trip)) != 0) {
1868           start = pr->u.p.lb;
1869           incr = pr->u.p.st;
1870           limit = parm1 + init - 1;
1871 
1872           if ((last = (limit >= trip)) != 0)
1873             limit = trip;
1874 
1875           if (p_st != NULL)
1876             *p_st = incr;
1877 
1878           pr->u.p.count += th->th.th_team_nproc;
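          // Illustrative numbers (hypothetical), kmp_sch_static_chunked with
          // parm1==4, nproc==3, tid==1 and count starting at 0 (its value
          // after dispatch initialization): this call yields the block of
          // offsets [4,7]; after count += nproc the next call yields [16,19],
          // i.e. a blocked-cyclic distribution with period parm1*nproc.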
1879 
1880           if (incr == 1) {
1881             *p_lb = start + init;
1882             *p_ub = start + limit;
1883           } else {
1884             *p_lb = start + init * incr;
1885             *p_ub = start + limit * incr;
1886           }
1887 
1888           if (pr->ordered) {
1889             pr->u.p.ordered_lower = init;
1890             pr->u.p.ordered_upper = limit;
1891 #ifdef KMP_DEBUG
1892             {
1893               const char *buff;
1894               // create format specifiers before the debug output
1895               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1896                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
1897                                       traits_t<UT>::spec, traits_t<UT>::spec);
1898               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1899                               pr->u.p.ordered_upper));
1900               __kmp_str_free(&buff);
1901             }
1902 #endif
1903           } // if
1904         } // if
1905       } // case
1906       break;
1907 
1908       case kmp_sch_dynamic_chunked: {
1909         T chunk = pr->u.p.parm1;
1910 
1911         KD_TRACE(
1912             100,
1913             ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid));
1914 
1915         init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1916         trip = pr->u.p.tc - 1;
1917 
1918         if ((status = (init <= trip)) == 0) {
1919           *p_lb = 0;
1920           *p_ub = 0;
1921           if (p_st != NULL)
1922             *p_st = 0;
1923         } else {
1924           start = pr->u.p.lb;
1925           limit = chunk + init - 1;
1926           incr = pr->u.p.st;
1927 
1928           if ((last = (limit >= trip)) != 0)
1929             limit = trip;
1930 
1931           if (p_st != NULL)
1932             *p_st = incr;
1933 
1934           if (incr == 1) {
1935             *p_lb = start + init;
1936             *p_ub = start + limit;
1937           } else {
1938             *p_lb = start + init * incr;
1939             *p_ub = start + limit * incr;
1940           }
1941 
1942           if (pr->ordered) {
1943             pr->u.p.ordered_lower = init;
1944             pr->u.p.ordered_upper = limit;
1945 #ifdef KMP_DEBUG
1946             {
1947               const char *buff;
1948               // create format specifiers before the debug output
1949               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1950                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
1951                                       traits_t<UT>::spec, traits_t<UT>::spec);
1952               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1953                               pr->u.p.ordered_upper));
1954               __kmp_str_free(&buff);
1955             }
1956 #endif
1957           } // if
1958         } // if
1959       } // case
1960       break;
1961 
1962       case kmp_sch_guided_iterative_chunked: {
1963         T chunkspec = pr->u.p.parm1;
1964         KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked "
1965                        "iterative case\n",
1966                        gtid));
1967         trip = pr->u.p.tc;
1968         // Start atomic part of calculations
1969         while (1) {
1970           ST remaining; // signed, because can be < 0
1971           init = sh->u.s.iteration; // shared value
1972           remaining = trip - init;
1973           if (remaining <= 0) { // AC: need to compare with 0 first
1974             // nothing to do, don't try atomic op
1975             status = 0;
1976             break;
1977           }
1978           if ((T)remaining <
1979               pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
            // use dynamic-style schedule
            // atomically increment iterations, get old value
1982             init = test_then_add<ST>(
1983                 RCAST(volatile ST *, &sh->u.s.iteration), (ST)chunkspec);
1984             remaining = trip - init;
1985             if (remaining <= 0) {
1986               status = 0; // all iterations got by other threads
1987             } else { // got some iterations to work on
1988               status = 1;
1989               if ((T)remaining > chunkspec) {
1990                 limit = init + chunkspec - 1;
1991               } else {
1992                 last = 1; // the last chunk
1993                 limit = init + remaining - 1;
1994               } // if
1995             } // if
1996             break;
1997           } // if
1998           limit = init + (UT)(remaining *
1999                               *(double *)&pr->u.p.parm3); // divide by K*nproc
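          // The 'divide by K*nproc' above implies parm3 holds, as a double, a
          // factor of roughly 1/(K*nproc) written during dispatch
          // initialization. Illustrative numbers (hypothetical):
          // remaining==1000 and K*nproc==16 give limit==init+62; after the CAS
          // and the --limit below, this thread keeps about 62 iterations,
          // [init, init+61].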
2000           if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
2001                                    (ST)init, (ST)limit)) {
2002             // CAS was successful, chunk obtained
2003             status = 1;
2004             --limit;
2005             break;
2006           } // if
2007         } // while
2008         if (status != 0) {
2009           start = pr->u.p.lb;
2010           incr = pr->u.p.st;
2011           if (p_st != NULL)
2012             *p_st = incr;
2013           *p_lb = start + init * incr;
2014           *p_ub = start + limit * incr;
2015           if (pr->ordered) {
2016             pr->u.p.ordered_lower = init;
2017             pr->u.p.ordered_upper = limit;
2018 #ifdef KMP_DEBUG
2019             {
2020               const char *buff;
2021               // create format specifiers before the debug output
2022               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2023                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
2024                                       traits_t<UT>::spec, traits_t<UT>::spec);
2025               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2026                               pr->u.p.ordered_upper));
2027               __kmp_str_free(&buff);
2028             }
2029 #endif
2030           } // if
2031         } else {
2032           *p_lb = 0;
2033           *p_ub = 0;
2034           if (p_st != NULL)
2035             *p_st = 0;
2036         } // if
2037       } // case
2038       break;
2039 
2040       case kmp_sch_guided_simd: {
        // same as the iterative scheme, but the current chunk is adjusted to
        // be a multiple of the given chunk
2043         T chunk = pr->u.p.parm1;
2044         KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_simd case\n",
2045                        gtid));
2046         trip = pr->u.p.tc;
2047         // Start atomic part of calculations
2048         while (1) {
2049           ST remaining; // signed, because can be < 0
2050           init = sh->u.s.iteration; // shared value
2051           remaining = trip - init;
2052           if (remaining <= 0) { // AC: need to compare with 0 first
2053             status = 0; // nothing to do, don't try atomic op
2054             break;
2055           }
2056           KMP_DEBUG_ASSERT(init % chunk == 0);
2057           // compare with K*nproc*(chunk+1), K=2 by default
2058           if ((T)remaining < pr->u.p.parm2) {
            // use dynamic-style schedule
            // atomically increment iterations, get old value
2061             init = test_then_add<ST>(
2062                 RCAST(volatile ST *, &sh->u.s.iteration), (ST)chunk);
2063             remaining = trip - init;
2064             if (remaining <= 0) {
2065               status = 0; // all iterations got by other threads
2066             } else {
2067               // got some iterations to work on
2068               status = 1;
2069               if ((T)remaining > chunk) {
2070                 limit = init + chunk - 1;
2071               } else {
2072                 last = 1; // the last chunk
2073                 limit = init + remaining - 1;
2074               } // if
2075             } // if
2076             break;
2077           } // if
2078           // divide by K*nproc
2079           UT span = remaining * (*(double *)&pr->u.p.parm3);
2080           UT rem = span % chunk;
2081           if (rem) // adjust so that span%chunk == 0
2082             span += chunk - rem;
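          // Illustrative numbers (hypothetical): remaining==1000, a factor of
          // 1/16 in parm3 and chunk==8 give span==62, rounded up to 64, so
          // this thread takes 64 iterations and init stays a multiple of
          // chunk for the next claimant (keeping the assertion above true).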
2083           limit = init + span;
2084           if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
2085                                    (ST)init, (ST)limit)) {
2086             // CAS was successful, chunk obtained
2087             status = 1;
2088             --limit;
2089             break;
2090           } // if
2091         } // while
2092         if (status != 0) {
2093           start = pr->u.p.lb;
2094           incr = pr->u.p.st;
2095           if (p_st != NULL)
2096             *p_st = incr;
2097           *p_lb = start + init * incr;
2098           *p_ub = start + limit * incr;
2099           if (pr->ordered) {
2100             pr->u.p.ordered_lower = init;
2101             pr->u.p.ordered_upper = limit;
2102 #ifdef KMP_DEBUG
2103             {
2104               const char *buff;
2105               // create format specifiers before the debug output
2106               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2107                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
2108                                       traits_t<UT>::spec, traits_t<UT>::spec);
2109               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2110                               pr->u.p.ordered_upper));
2111               __kmp_str_free(&buff);
2112             }
2113 #endif
2114           } // if
2115         } else {
2116           *p_lb = 0;
2117           *p_ub = 0;
2118           if (p_st != NULL)
2119             *p_st = 0;
2120         } // if
2121       } // case
2122       break;
2123 
2124       case kmp_sch_guided_analytical_chunked: {
2125         T chunkspec = pr->u.p.parm1;
2126         UT chunkIdx;
2127 #if KMP_OS_WINDOWS && KMP_ARCH_X86
2128         /* for storing original FPCW value for Windows* OS on
2129            IA-32 architecture 8-byte version */
2130         unsigned int oldFpcw;
2131         unsigned int fpcwSet = 0;
2132 #endif
2133         KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked "
2134                        "analytical case\n",
2135                        gtid));
2136 
2137         trip = pr->u.p.tc;
2138 
2139         KMP_DEBUG_ASSERT(th->th.th_team_nproc > 1);
2140         KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)th->th.th_team_nproc <
2141                          trip);
2142 
2143         while (1) { /* this while loop is a safeguard against unexpected zero
2144                        chunk sizes */
2145           chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
2146           if (chunkIdx >= (UT)pr->u.p.parm2) {
2147             --trip;
2148             /* use dynamic-style scheduling */
2149             init = chunkIdx * chunkspec + pr->u.p.count;
2150             /* need to verify init > 0 in case of overflow in the above
2151              * calculation */
2152             if ((status = (init > 0 && init <= trip)) != 0) {
2153               limit = init + chunkspec - 1;
2154 
2155               if ((last = (limit >= trip)) != 0)
2156                 limit = trip;
2157             }
2158             break;
2159           } else {
2160 /* use exponential-style scheduling */
2161 /* The following check is to workaround the lack of long double precision on
2162    Windows* OS.
2163    This check works around the possible effect that init != 0 for chunkIdx == 0.
2164  */
2165 #if KMP_OS_WINDOWS && KMP_ARCH_X86
2166             /* If we haven't already done so, save original FPCW and set
2167                precision to 64-bit, as Windows* OS on IA-32 architecture
2168                defaults to 53-bit */
2169             if (!fpcwSet) {
2170               oldFpcw = _control87(0, 0);
2171               _control87(_PC_64, _MCW_PC);
2172               fpcwSet = 0x30000;
2173             }
2174 #endif
2175             if (chunkIdx) {
2176               init = __kmp_dispatch_guided_remaining<T>(
2177                   trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
2178               KMP_DEBUG_ASSERT(init);
2179               init = trip - init;
2180             } else
2181               init = 0;
2182             limit = trip - __kmp_dispatch_guided_remaining<T>(
2183                                trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
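            // __kmp_dispatch_guided_remaining() presumably returns (roughly)
            // how many iterations are still left after the first k chunks of
            // the exponential schedule, so init and limit bracket chunk number
            // chunkIdx; the --limit below converts the half-open upper bound
            // into the inclusive bound returned to the caller.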
2184             KMP_ASSERT(init <= limit);
2185             if (init < limit) {
2186               KMP_DEBUG_ASSERT(limit <= trip);
2187               --limit;
2188               status = 1;
2189               break;
2190             } // if
2191           } // if
2192         } // while (1)
2193 #if KMP_OS_WINDOWS && KMP_ARCH_X86
2194         /* restore FPCW if necessary
2195            AC: check fpcwSet flag first because oldFpcw can be uninitialized
2196            here */
2197         if (fpcwSet && (oldFpcw & fpcwSet))
2198           _control87(oldFpcw, _MCW_PC);
2199 #endif
2200         if (status != 0) {
2201           start = pr->u.p.lb;
2202           incr = pr->u.p.st;
2203           if (p_st != NULL)
2204             *p_st = incr;
2205           *p_lb = start + init * incr;
2206           *p_ub = start + limit * incr;
2207           if (pr->ordered) {
2208             pr->u.p.ordered_lower = init;
2209             pr->u.p.ordered_upper = limit;
2210 #ifdef KMP_DEBUG
2211             {
2212               const char *buff;
2213               // create format specifiers before the debug output
2214               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2215                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
2216                                       traits_t<UT>::spec, traits_t<UT>::spec);
2217               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2218                               pr->u.p.ordered_upper));
2219               __kmp_str_free(&buff);
2220             }
2221 #endif
2222           }
2223         } else {
2224           *p_lb = 0;
2225           *p_ub = 0;
2226           if (p_st != NULL)
2227             *p_st = 0;
2228         }
2229       } // case
2230       break;
2231 
2232       case kmp_sch_trapezoidal: {
2233         UT index;
2234         T parm2 = pr->u.p.parm2;
2235         T parm3 = pr->u.p.parm3;
2236         T parm4 = pr->u.p.parm4;
2237         KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2238                        gtid));
2239 
2240         index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
2241 
2242         init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
2243         trip = pr->u.p.tc - 1;
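        // The closed form above is the sum of an arithmetic series of chunk
        // sizes: with parm2 the first chunk size and parm4 the per-chunk
        // decrement, the first 'index' chunks cover
        //   index*parm2 - parm4*index*(index-1)/2
        // iterations, which is 'init'; 'limit' below is the same sum for
        // index+1 chunks, minus one. Illustrative numbers (hypothetical):
        // parm2==10, parm4==2, index==2 mean chunks of 10 and 8 were already
        // handed out, so this chunk is [18,23] (size 6).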
2244 
2245         if ((status = ((T)index < parm3 && init <= trip)) == 0) {
2246           *p_lb = 0;
2247           *p_ub = 0;
2248           if (p_st != NULL)
2249             *p_st = 0;
2250         } else {
2251           start = pr->u.p.lb;
2252           limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
2253           incr = pr->u.p.st;
2254 
2255           if ((last = (limit >= trip)) != 0)
2256             limit = trip;
2257 
2258           if (p_st != NULL)
2259             *p_st = incr;
2260 
2261           if (incr == 1) {
2262             *p_lb = start + init;
2263             *p_ub = start + limit;
2264           } else {
2265             *p_lb = start + init * incr;
2266             *p_ub = start + limit * incr;
2267           }
2268 
2269           if (pr->ordered) {
2270             pr->u.p.ordered_lower = init;
2271             pr->u.p.ordered_upper = limit;
2272 #ifdef KMP_DEBUG
2273             {
2274               const char *buff;
2275               // create format specifiers before the debug output
2276               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2277                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
2278                                       traits_t<UT>::spec, traits_t<UT>::spec);
2279               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2280                               pr->u.p.ordered_upper));
2281               __kmp_str_free(&buff);
2282             }
2283 #endif
2284           } // if
2285         } // if
2286       } // case
2287       break;
2288       default: {
2289         status = 0; // to avoid complaints on uninitialized variable use
2290         __kmp_msg(kmp_ms_fatal, // Severity
2291                   KMP_MSG(UnknownSchedTypeDetected), // Primary message
2292                   KMP_HNT(GetNewerLibrary), // Hint
2293                   __kmp_msg_null // Variadic argument list terminator
2294                   );
2295       } break;
2296       } // switch
    } // if (tc == 0)
2298 
2299     if (status == 0) {
2300       UT num_done;
2301 
2302       num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2303 #ifdef KMP_DEBUG
2304       {
2305         const char *buff;
2306         // create format specifiers before the debug output
2307         buff = __kmp_str_format(
2308             "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2309             traits_t<UT>::spec);
2310         KD_TRACE(100, (buff, gtid, sh->u.s.num_done));
2311         __kmp_str_free(&buff);
2312       }
2313 #endif
2314 
2315       if ((ST)num_done == th->th.th_team_nproc - 1) {
2316 #if (KMP_STATIC_STEAL_ENABLED)
2317         if (pr->schedule == kmp_sch_static_steal &&
2318             traits_t<T>::type_size > 4) {
2319           int i;
2320           kmp_info_t **other_threads = team->t.t_threads;
2321           // loop complete, safe to destroy locks used for stealing
2322           for (i = 0; i < th->th.th_team_nproc; ++i) {
2323             kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2324             KMP_ASSERT(lck != NULL);
2325             __kmp_destroy_lock(lck);
2326             __kmp_free(lck);
2327             other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2328           }
2329         }
2330 #endif
2331         /* NOTE: release this buffer to be reused */
2332 
2333         KMP_MB(); /* Flush all pending memory write invalidates.  */
2334 
2335         sh->u.s.num_done = 0;
2336         sh->u.s.iteration = 0;
2337 
2338         /* TODO replace with general release procedure? */
2339         if (pr->ordered) {
2340           sh->u.s.ordered_iteration = 0;
2341         }
2342 
2343         KMP_MB(); /* Flush all pending memory write invalidates.  */
2344 
2345         sh->buffer_index += __kmp_dispatch_num_buffers;
2346         KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2347                        gtid, sh->buffer_index));
2348 
2349         KMP_MB(); /* Flush all pending memory write invalidates.  */
2350 
2351       } // if
2352       if (__kmp_env_consistency_check) {
2353         if (pr->pushed_ws != ct_none) {
2354           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2355         }
2356       }
2357 
2358       th->th.th_dispatch->th_deo_fcn = NULL;
2359       th->th.th_dispatch->th_dxo_fcn = NULL;
2360       th->th.th_dispatch->th_dispatch_sh_current = NULL;
2361       th->th.th_dispatch->th_dispatch_pr_current = NULL;
2362     } // if (status == 0)
2363 #if KMP_OS_WINDOWS
2364     else if (last) {
2365       pr->u.p.last_upper = pr->u.p.ub;
2366     }
2367 #endif /* KMP_OS_WINDOWS */
2368     if (p_last != NULL && status != 0)
2369       *p_last = last;
2370   } // if
2371 
2372 #ifdef KMP_DEBUG
2373   {
2374     const char *buff;
2375     // create format specifiers before the debug output
2376     buff = __kmp_str_format(
2377         "__kmp_dispatch_next: T#%%d normal case: "
2378         "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p  returning:%%d\n",
2379         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2380     KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status));
2381     __kmp_str_free(&buff);
2382   }
2383 #endif
2384 #if INCLUDE_SSC_MARKS
2385   SSC_MARK_DISPATCH_NEXT();
2386 #endif
2387   OMPT_LOOP_END;
2388   return status;
2389 }
2390 
2391 template <typename T>
2392 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2393                                   kmp_int32 *plastiter, T *plower, T *pupper,
2394                                   typename traits_t<T>::signed_t incr) {
2395   typedef typename traits_t<T>::unsigned_t UT;
2396   typedef typename traits_t<T>::signed_t ST;
2397   kmp_uint32 team_id;
2398   kmp_uint32 nteams;
2399   UT trip_count;
2400   kmp_team_t *team;
2401   kmp_info_t *th;
2402 
2403   KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2404   KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2405 #ifdef KMP_DEBUG
2406   {
2407     const char *buff;
2408     // create format specifiers before the debug output
2409     buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2410                             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2411                             traits_t<T>::spec, traits_t<T>::spec,
2412                             traits_t<ST>::spec, traits_t<T>::spec);
2413     KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2414     __kmp_str_free(&buff);
2415   }
2416 #endif
2417 
2418   if (__kmp_env_consistency_check) {
2419     if (incr == 0) {
2420       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2421                             loc);
2422     }
2423     if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2424       // The loop is illegal.
      // Some zero-trip loops are maintained by the compiler, e.g.:
2426       //   for(i=10;i<0;++i) // lower >= upper - run-time check
2427       //   for(i=0;i>10;--i) // lower <= upper - run-time check
2428       //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2429       //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2430       // Compiler does not check the following illegal loops:
2431       //   for(i=0;i<10;i+=incr) // where incr<0
2432       //   for(i=10;i>0;i-=incr) // where incr<0
2433       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2434     }
2435   }
2436   th = __kmp_threads[gtid];
2437   team = th->th.th_team;
2438 #if OMP_40_ENABLED
2439   KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2440   nteams = th->th.th_teams_size.nteams;
2441 #endif
2442   team_id = team->t.t_master_tid;
2443   KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2444 
2445   // compute global trip count
2446   if (incr == 1) {
2447     trip_count = *pupper - *plower + 1;
2448   } else if (incr == -1) {
2449     trip_count = *plower - *pupper + 1;
2450   } else if (incr > 0) {
2451     // upper-lower can exceed the limit of signed type
2452     trip_count = (UT)(*pupper - *plower) / incr + 1;
2453   } else {
2454     trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2455   }
2456 
2457   if (trip_count <= nteams) {
2458     KMP_DEBUG_ASSERT(
2459         __kmp_static == kmp_sch_static_greedy ||
2460         __kmp_static ==
2461             kmp_sch_static_balanced); // Unknown static scheduling type.
    // only some teams get a single iteration, the others get nothing
2463     if (team_id < trip_count) {
2464       *pupper = *plower = *plower + team_id * incr;
2465     } else {
2466       *plower = *pupper + incr; // zero-trip loop
2467     }
2468     if (plastiter != NULL)
2469       *plastiter = (team_id == trip_count - 1);
2470   } else {
2471     if (__kmp_static == kmp_sch_static_balanced) {
2472       UT chunk = trip_count / nteams;
2473       UT extras = trip_count % nteams;
2474       *plower +=
2475           incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2476       *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
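      // Illustrative numbers (hypothetical): trip_count==10, nteams==4 give
      // chunk==2, extras==2, so teams 0 and 1 get three iterations each and
      // teams 2 and 3 get two each.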
2477       if (plastiter != NULL)
2478         *plastiter = (team_id == nteams - 1);
2479     } else {
2480       T chunk_inc_count =
2481           (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2482       T upper = *pupper;
2483       KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2484       // Unknown static scheduling type.
2485       *plower += team_id * chunk_inc_count;
2486       *pupper = *plower + chunk_inc_count - incr;
2487       // Check/correct bounds if needed
2488       if (incr > 0) {
2489         if (*pupper < *plower)
2490           *pupper = traits_t<T>::max_value;
2491         if (plastiter != NULL)
2492           *plastiter = *plower <= upper && *pupper > upper - incr;
2493         if (*pupper > upper)
2494           *pupper = upper; // tracker C73258
2495       } else {
2496         if (*pupper > *plower)
2497           *pupper = traits_t<T>::min_value;
2498         if (plastiter != NULL)
2499           *plastiter = *plower >= upper && *pupper < upper - incr;
2500         if (*pupper < upper)
2501           *pupper = upper; // tracker C73258
2502       }
2503     }
2504   }
2505 }
2506 
2507 //-----------------------------------------------------------------------------
2508 // Dispatch routines
2509 //    Transfer call to template< type T >
2510 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2511 //                         T lb, T ub, ST st, ST chunk )
2512 extern "C" {
2513 
2514 /*!
2515 @ingroup WORK_SHARING
2516 @{
2517 @param loc Source location
2518 @param gtid Global thread id
2519 @param schedule Schedule type
2520 @param lb  Lower bound
2521 @param ub  Upper bound
2522 @param st  Step (or increment if you prefer)
2523 @param chunk The chunk size to block with
2524 
2525 This function prepares the runtime to start a dynamically scheduled for loop,
2526 saving the loop arguments.
2527 These functions are all identical apart from the types of the arguments.
2528 */
2529 
2530 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2531                             enum sched_type schedule, kmp_int32 lb,
2532                             kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2533   KMP_DEBUG_ASSERT(__kmp_init_serial);
2534   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2535 }
2536 /*!
2537 See @ref __kmpc_dispatch_init_4
2538 */
2539 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2540                              enum sched_type schedule, kmp_uint32 lb,
2541                              kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2542   KMP_DEBUG_ASSERT(__kmp_init_serial);
2543   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2544 }
2545 
2546 /*!
2547 See @ref __kmpc_dispatch_init_4
2548 */
2549 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2550                             enum sched_type schedule, kmp_int64 lb,
2551                             kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2552   KMP_DEBUG_ASSERT(__kmp_init_serial);
2553   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2554 }
2555 
2556 /*!
2557 See @ref __kmpc_dispatch_init_4
2558 */
2559 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2560                              enum sched_type schedule, kmp_uint64 lb,
2561                              kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2562   KMP_DEBUG_ASSERT(__kmp_init_serial);
2563   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2564 }
2565 
2566 /*!
2567 See @ref __kmpc_dispatch_init_4
2568 
The difference from the __kmpc_dispatch_init set of functions is that these
functions are called for the composite distribute parallel for construct. Thus,
before dispatching the regular iterations, we need to calculate the per-team
iteration space.
2572 
2573 These functions are all identical apart from the types of the arguments.
2574 */
2575 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2576                                  enum sched_type schedule, kmp_int32 *p_last,
2577                                  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2578                                  kmp_int32 chunk) {
2579   KMP_DEBUG_ASSERT(__kmp_init_serial);
2580   __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2581   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2582 }
2583 
2584 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2585                                   enum sched_type schedule, kmp_int32 *p_last,
2586                                   kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2587                                   kmp_int32 chunk) {
2588   KMP_DEBUG_ASSERT(__kmp_init_serial);
2589   __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2590   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2591 }
2592 
2593 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2594                                  enum sched_type schedule, kmp_int32 *p_last,
2595                                  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2596                                  kmp_int64 chunk) {
2597   KMP_DEBUG_ASSERT(__kmp_init_serial);
2598   __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2599   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2600 }
2601 
2602 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2603                                   enum sched_type schedule, kmp_int32 *p_last,
2604                                   kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2605                                   kmp_int64 chunk) {
2606   KMP_DEBUG_ASSERT(__kmp_init_serial);
2607   __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2608   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2609 }
2610 
2611 /*!
2612 @param loc Source code location
2613 @param gtid Global thread id
2614 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2615 otherwise
2616 @param p_lb   Pointer to the lower bound for the next chunk of work
2617 @param p_ub   Pointer to the upper bound for the next chunk of work
2618 @param p_st   Pointer to the stride for the next chunk of work
2619 @return one if there is work to be done, zero otherwise
2620 
2621 Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then the lb, ub and stride need not be modified.
2623 */
2624 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2625                            kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2626   return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st);
2627 }
2628 
2629 /*!
2630 See @ref __kmpc_dispatch_next_4
2631 */
2632 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2633                             kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2634                             kmp_int32 *p_st) {
2635   return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st);
2636 }
2637 
2638 /*!
2639 See @ref __kmpc_dispatch_next_4
2640 */
2641 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2642                            kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2643   return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st);
2644 }
2645 
2646 /*!
2647 See @ref __kmpc_dispatch_next_4
2648 */
2649 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2650                             kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2651                             kmp_int64 *p_st) {
2652   return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st);
2653 }
2654 
2655 /*!
2656 @param loc Source code location
2657 @param gtid Global thread id
2658 
2659 Mark the end of a dynamic loop.
2660 */
2661 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2662   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2663 }
2664 
2665 /*!
2666 See @ref __kmpc_dispatch_fini_4
2667 */
2668 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2669   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2670 }
2671 
2672 /*!
2673 See @ref __kmpc_dispatch_fini_4
2674 */
2675 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2676   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2677 }
2678 
2679 /*!
2680 See @ref __kmpc_dispatch_fini_4
2681 */
2682 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2683   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2684 }
2685 /*! @} */
2686 
2687 //-----------------------------------------------------------------------------
2688 // Non-template routines from kmp_dispatch.cpp used in other sources
2689 
2690 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2691   return value == checker;
2692 }
2693 
2694 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2695   return value != checker;
2696 }
2697 
2698 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2699   return value < checker;
2700 }
2701 
2702 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2703   return value >= checker;
2704 }
2705 
2706 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2707   return value <= checker;
2708 }
2709 
2710 kmp_uint32
2711 __kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2712                    kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2713                    void *obj // Higher-level synchronization object, or NULL.
2714                    ) {
2715   // note: we may not belong to a team at this point
2716   volatile kmp_uint32 *spin = spinner;
2717   kmp_uint32 check = checker;
2718   kmp_uint32 spins;
2719   kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2720   kmp_uint32 r;
2721 
2722   KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2723   KMP_INIT_YIELD(spins);
2724   // main wait spin loop
2725   while (!f(r = TCR_4(*spin), check)) {
2726     KMP_FSYNC_SPIN_PREPARE(obj);
2727     /* GEH - remove this since it was accidentally introduced when kmp_wait was
2728        split. It causes problems with infinite recursion because of exit lock */
2729     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2730         __kmp_abort_thread(); */
2731 
2732     /* if we have waited a bit, or are oversubscribed, yield */
2733     /* pause is in the following code */
2734     KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2735     KMP_YIELD_SPIN(spins);
2736   }
2737   KMP_FSYNC_SPIN_ACQUIRED(obj);
2738   return r;
2739 }
2740 
2741 void __kmp_wait_yield_4_ptr(
2742     void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32),
2743     void *obj // Higher-level synchronization object, or NULL.
2744     ) {
2745   // note: we may not belong to a team at this point
2746   void *spin = spinner;
2747   kmp_uint32 check = checker;
2748   kmp_uint32 spins;
2749   kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2750 
2751   KMP_FSYNC_SPIN_INIT(obj, spin);
2752   KMP_INIT_YIELD(spins);
2753   // main wait spin loop
2754   while (!f(spin, check)) {
2755     KMP_FSYNC_SPIN_PREPARE(obj);
2756     /* if we have waited a bit, or are oversubscribed, yield */
2757     /* pause is in the following code */
2758     KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2759     KMP_YIELD_SPIN(spins);
2760   }
2761   KMP_FSYNC_SPIN_ACQUIRED(obj);
2762 }
2763 
2764 } // extern "C"
2765 
2766 #ifdef KMP_GOMP_COMPAT
2767 
2768 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2769                                enum sched_type schedule, kmp_int32 lb,
2770                                kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2771                                int push_ws) {
2772   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2773                                  push_ws);
2774 }
2775 
2776 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2777                                 enum sched_type schedule, kmp_uint32 lb,
2778                                 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2779                                 int push_ws) {
2780   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2781                                   push_ws);
2782 }
2783 
2784 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2785                                enum sched_type schedule, kmp_int64 lb,
2786                                kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2787                                int push_ws) {
2788   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2789                                  push_ws);
2790 }
2791 
2792 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2793                                 enum sched_type schedule, kmp_uint64 lb,
2794                                 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2795                                 int push_ws) {
2796   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2797                                   push_ws);
2798 }
2799 
2800 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2801   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2802 }
2803 
2804 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2805   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2806 }
2807 
2808 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2809   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2810 }
2811 
2812 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2813   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2814 }
2815 
2816 #endif /* KMP_GOMP_COMPAT */
2817 
2818 /* ------------------------------------------------------------------------ */
2819