1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 //                     The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 /* Dynamic scheduling initialization and dispatch.
15  *
 * NOTE: __kmp_nth is a constant inside any dispatch loop; however, it may
 *       change between parallel regions.  __kmp_max_nth is the largest value
 *       __kmp_nth may take, 1 is the smallest.
19  */
20 
21 // Need to raise Win version from XP to Vista here for support of
22 // InterlockedExchange64
23 #if defined(_WIN32_WINNT) && defined(_M_IX86)
24 #undef _WIN32_WINNT
25 #define _WIN32_WINNT 0x0502
26 #endif
27 
28 #include "kmp.h"
29 #include "kmp_error.h"
30 #include "kmp_i18n.h"
31 #include "kmp_itt.h"
32 #include "kmp_stats.h"
33 #include "kmp_str.h"
34 #if KMP_OS_WINDOWS && KMP_ARCH_X86
35 #include <float.h>
36 #endif
37 
38 #if OMPT_SUPPORT
39 #include "ompt-specific.h"
40 #endif
41 
42 /* ------------------------------------------------------------------------ */
43 
44 #if KMP_STATIC_STEAL_ENABLED
45 
46 // replaces dispatch_private_info{32,64} structures and
47 // dispatch_private_info{32,64}_t types
48 template <typename T> struct dispatch_private_infoXX_template {
49   typedef typename traits_t<T>::unsigned_t UT;
50   typedef typename traits_t<T>::signed_t ST;
51   UT count; // unsigned
52   T ub;
53   /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
54   T lb;
55   ST st; // signed
56   UT tc; // unsigned
57   T static_steal_counter; // for static_steal only; maybe better to put after ub
58 
59   /* parm[1-4] are used in different ways by different scheduling algorithms */
60 
61   // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
62   //    a) parm3 is properly aligned and
63   //    b) all parm1-4 are in the same cache line.
  // Because parm1-4 are used together, performance seems to be better
  // if they are in the same cache line (not measured though).
66 
67   struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
68     T parm1;
69     T parm2;
70     T parm3;
71     T parm4;
72   };
73 
74   UT ordered_lower; // unsigned
75   UT ordered_upper; // unsigned
76 #if KMP_OS_WINDOWS
77   T last_upper;
78 #endif /* KMP_OS_WINDOWS */
79 };
80 
81 #else /* KMP_STATIC_STEAL_ENABLED */
82 
83 // replaces dispatch_private_info{32,64} structures and
84 // dispatch_private_info{32,64}_t types
85 template <typename T> struct dispatch_private_infoXX_template {
86   typedef typename traits_t<T>::unsigned_t UT;
87   typedef typename traits_t<T>::signed_t ST;
88   T lb;
89   T ub;
90   ST st; // signed
91   UT tc; // unsigned
92 
93   T parm1;
94   T parm2;
95   T parm3;
96   T parm4;
97 
98   UT count; // unsigned
99 
100   UT ordered_lower; // unsigned
101   UT ordered_upper; // unsigned
102 #if KMP_OS_WINDOWS
103   T last_upper;
104 #endif /* KMP_OS_WINDOWS */
105 };
106 
107 #endif /* KMP_STATIC_STEAL_ENABLED */
108 
109 // replaces dispatch_private_info structure and dispatch_private_info_t type
110 template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
  // duplicate the alignment here, otherwise the structure size is not computed
  // correctly by our compiler
113   union KMP_ALIGN_CACHE private_info_tmpl {
114     dispatch_private_infoXX_template<T> p;
115     dispatch_private_info64_t p64;
116   } u;
117   enum sched_type schedule; /* scheduling algorithm */
118   kmp_uint32 ordered; /* ordered clause specified */
119   kmp_uint32 ordered_bumped;
120   // To retain the structure size after making ordered_iteration scalar
121   kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
122   dispatch_private_info *next; /* stack of buffers for nest of serial regions */
123   kmp_uint32 nomerge; /* don't merge iters if serialized */
124   kmp_uint32 type_size;
125   enum cons_type pushed_ws;
126 };
127 
128 // replaces dispatch_shared_info{32,64} structures and
129 // dispatch_shared_info{32,64}_t types
130 template <typename UT> struct dispatch_shared_infoXX_template {
131   /* chunk index under dynamic, number of idle threads under static-steal;
132      iteration index otherwise */
133   volatile UT iteration;
134   volatile UT num_done;
135   volatile UT ordered_iteration;
  // to retain the structure size after making ordered_iteration scalar
137   UT ordered_dummy[KMP_MAX_ORDERED - 3];
138 };
139 
140 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
141 template <typename UT> struct dispatch_shared_info_template {
  // we need a union here to keep the structure size
143   union shared_info_tmpl {
144     dispatch_shared_infoXX_template<UT> s;
145     dispatch_shared_info64_t s64;
146   } u;
147   volatile kmp_uint32 buffer_index;
148 #if OMP_45_ENABLED
149   volatile kmp_int32 doacross_buf_idx; // teamwise index
150   kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
151   kmp_int32 doacross_num_done; // count finished threads
152 #endif
153 #if KMP_USE_HWLOC
  // When linking with libhwloc, the ORDERED EPCC test slows down on big
155   // machines (> 48 cores). Performance analysis showed that a cache thrash
156   // was occurring and this padding helps alleviate the problem.
157   char padding[64];
158 #endif
159 };
160 
161 /* ------------------------------------------------------------------------ */
162 
163 #undef USE_TEST_LOCKS
164 
165 // test_then_add template (general template should NOT be used)
166 template <typename T> static __forceinline T test_then_add(volatile T *p, T d);
167 
168 template <>
169 __forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
170                                                  kmp_int32 d) {
171   kmp_int32 r;
172   r = KMP_TEST_THEN_ADD32(p, d);
173   return r;
174 }
175 
176 template <>
177 __forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
178                                                  kmp_int64 d) {
179   kmp_int64 r;
180   r = KMP_TEST_THEN_ADD64(p, d);
181   return r;
182 }
183 
184 // test_then_inc_acq template (general template should NOT be used)
185 template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);
186 
187 template <>
188 __forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
189   kmp_int32 r;
190   r = KMP_TEST_THEN_INC_ACQ32(p);
191   return r;
192 }
193 
194 template <>
195 __forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
196   kmp_int64 r;
197   r = KMP_TEST_THEN_INC_ACQ64(p);
198   return r;
199 }
200 
201 // test_then_inc template (general template should NOT be used)
202 template <typename T> static __forceinline T test_then_inc(volatile T *p);
203 
204 template <>
205 __forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
206   kmp_int32 r;
207   r = KMP_TEST_THEN_INC32(p);
208   return r;
209 }
210 
211 template <>
212 __forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
213   kmp_int64 r;
214   r = KMP_TEST_THEN_INC64(p);
215   return r;
216 }
217 
218 // compare_and_swap template (general template should NOT be used)
219 template <typename T>
220 static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);
221 
222 template <>
223 __forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
224                                                     kmp_int32 c, kmp_int32 s) {
225   return KMP_COMPARE_AND_STORE_REL32(p, c, s);
226 }
227 
228 template <>
229 __forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
230                                                     kmp_int64 c, kmp_int64 s) {
231   return KMP_COMPARE_AND_STORE_REL64(p, c, s);
232 }
233 
/* Spin wait loop that first does pause, then yield.
    Waits until the predicate returns non-zero when called with *spinner and
    check. Does NOT put threads to sleep.
#if USE_ITT_BUILD
    Arguments:
        obj -- the higher-level synchronization object to report to ittnotify.
        It is used to report locks consistently. For example, if the lock is
        acquired immediately, its address is reported to ittnotify via
        KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
        immediately and the lock routine calls KMP_WAIT_YIELD(), the latter
        should report the same address, not the address of the low-level
        spinner.
#endif // USE_ITT_BUILD
*/
247 template <typename UT>
248 // ToDo: make inline function (move to header file for icl)
249 static UT // unsigned 4- or 8-byte type
250     __kmp_wait_yield(
251         volatile UT *spinner, UT checker,
252         kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(
253             void *obj) // Higher-level synchronization object, or NULL.
254         ) {
255   // note: we may not belong to a team at this point
256   volatile UT *spin = spinner;
257   UT check = checker;
258   kmp_uint32 spins;
259   kmp_uint32 (*f)(UT, UT) = pred;
260   UT r;
261 
262   KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
263   KMP_INIT_YIELD(spins);
264   // main wait spin loop
265   while (!f(r = *spin, check)) {
266     KMP_FSYNC_SPIN_PREPARE(obj);
267     /* GEH - remove this since it was accidentally introduced when kmp_wait was
268        split. It causes problems with infinite recursion because of exit lock */
269     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
270         __kmp_abort_thread(); */
271 
    // If we are oversubscribed, or have waited a bit (and
    // KMP_LIBRARY=throughput), then yield; the pause is in KMP_YIELD_SPIN.
274     KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
275     KMP_YIELD_SPIN(spins);
276   }
277   KMP_FSYNC_SPIN_ACQUIRED(obj);
278   return r;
279 }
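
// A minimal usage sketch of __kmp_wait_yield (illustrative only): block until
// the shared ordered-iteration counter reaches at least 'lower', as done in
// __kmp_dispatch_deo() and __kmp_dispatch_finish() below:
//   __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
//                        __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));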
280 
281 template <typename UT> static kmp_uint32 __kmp_eq(UT value, UT checker) {
282   return value == checker;
283 }
284 
285 template <typename UT> static kmp_uint32 __kmp_neq(UT value, UT checker) {
286   return value != checker;
287 }
288 
289 template <typename UT> static kmp_uint32 __kmp_lt(UT value, UT checker) {
290   return value < checker;
291 }
292 
293 template <typename UT> static kmp_uint32 __kmp_ge(UT value, UT checker) {
294   return value >= checker;
295 }
296 
297 template <typename UT> static kmp_uint32 __kmp_le(UT value, UT checker) {
298   return value <= checker;
299 }
300 
301 /* ------------------------------------------------------------------------ */
302 
303 static void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref,
304                                      ident_t *loc_ref) {
305   kmp_info_t *th;
306 
307   KMP_DEBUG_ASSERT(gtid_ref);
308 
309   if (__kmp_env_consistency_check) {
310     th = __kmp_threads[*gtid_ref];
311     if (th->th.th_root->r.r_active &&
312         (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
313 #if KMP_USE_DYNAMIC_LOCK
314       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
315 #else
316       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
317 #endif
318     }
319   }
320 }
321 
322 template <typename UT>
323 static void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
324   typedef typename traits_t<UT>::signed_t ST;
325   dispatch_private_info_template<UT> *pr;
326 
327   int gtid = *gtid_ref;
328   //    int  cid = *cid_ref;
329   kmp_info_t *th = __kmp_threads[gtid];
330   KMP_DEBUG_ASSERT(th->th.th_dispatch);
331 
332   KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
333   if (__kmp_env_consistency_check) {
334     pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
335         th->th.th_dispatch->th_dispatch_pr_current);
336     if (pr->pushed_ws != ct_none) {
337 #if KMP_USE_DYNAMIC_LOCK
338       __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
339 #else
340       __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
341 #endif
342     }
343   }
344 
345   if (!th->th.th_team->t.t_serialized) {
346     dispatch_shared_info_template<UT> *sh =
347         reinterpret_cast<dispatch_shared_info_template<UT> *>(
348             th->th.th_dispatch->th_dispatch_sh_current);
349     UT lower;
350 
351     if (!__kmp_env_consistency_check) {
352       pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
353           th->th.th_dispatch->th_dispatch_pr_current);
354     }
355     lower = pr->u.p.ordered_lower;
356 
357 #if !defined(KMP_GOMP_COMPAT)
358     if (__kmp_env_consistency_check) {
359       if (pr->ordered_bumped) {
360         struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
361         __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
362                                ct_ordered_in_pdo, loc_ref,
363                                &p->stack_data[p->w_top]);
364       }
365     }
366 #endif /* !defined(KMP_GOMP_COMPAT) */
367 
368     KMP_MB();
369 #ifdef KMP_DEBUG
370     {
371       char *buff;
372       // create format specifiers before the debug output
373       buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
374                               "ordered_iter:%%%s lower:%%%s\n",
375                               traits_t<UT>::spec, traits_t<UT>::spec);
376       KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
377       __kmp_str_free(&buff);
378     }
379 #endif
380 
381     __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
382                          __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
383     KMP_MB(); /* is this necessary? */
384 #ifdef KMP_DEBUG
385     {
386       char *buff;
387       // create format specifiers before the debug output
388       buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
389                               "ordered_iter:%%%s lower:%%%s\n",
390                               traits_t<UT>::spec, traits_t<UT>::spec);
391       KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
392       __kmp_str_free(&buff);
393     }
394 #endif
395   }
396   KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
397 }
398 
399 static void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref,
400                                      ident_t *loc_ref) {
401   kmp_info_t *th;
402 
403   if (__kmp_env_consistency_check) {
404     th = __kmp_threads[*gtid_ref];
405     if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
406       __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
407     }
408   }
409 }
410 
411 template <typename UT>
412 static void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
413   typedef typename traits_t<UT>::signed_t ST;
414   dispatch_private_info_template<UT> *pr;
415 
416   int gtid = *gtid_ref;
417   //    int  cid = *cid_ref;
418   kmp_info_t *th = __kmp_threads[gtid];
419   KMP_DEBUG_ASSERT(th->th.th_dispatch);
420 
421   KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
422   if (__kmp_env_consistency_check) {
423     pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
424         th->th.th_dispatch->th_dispatch_pr_current);
425     if (pr->pushed_ws != ct_none) {
426       __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
427     }
428   }
429 
430   if (!th->th.th_team->t.t_serialized) {
431     dispatch_shared_info_template<UT> *sh =
432         reinterpret_cast<dispatch_shared_info_template<UT> *>(
433             th->th.th_dispatch->th_dispatch_sh_current);
434 
435     if (!__kmp_env_consistency_check) {
436       pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
437           th->th.th_dispatch->th_dispatch_pr_current);
438     }
439 
440     KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
441 #if !defined(KMP_GOMP_COMPAT)
442     if (__kmp_env_consistency_check) {
443       if (pr->ordered_bumped != 0) {
444         struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
445         /* How to test it? - OM */
446         __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
447                                ct_ordered_in_pdo, loc_ref,
448                                &p->stack_data[p->w_top]);
449       }
450     }
451 #endif /* !defined(KMP_GOMP_COMPAT) */
452 
453     KMP_MB(); /* Flush all pending memory write invalidates.  */
454 
455     pr->ordered_bumped += 1;
456 
457     KD_TRACE(1000,
458              ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
459               gtid, pr->ordered_bumped));
460 
461     KMP_MB(); /* Flush all pending memory write invalidates.  */
462 
463     /* TODO use general release procedure? */
464     test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
465 
466     KMP_MB(); /* Flush all pending memory write invalidates.  */
467   }
468   KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
469 }
470 
// Computes and returns x to the power of y, where y must be a non-negative
// integer.
472 template <typename UT>
473 static __forceinline long double __kmp_pow(long double x, UT y) {
474   long double s = 1.0L;
475 
476   KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
477   // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
478   while (y) {
479     if (y & 1)
480       s *= x;
481     x *= x;
482     y >>= 1;
483   }
484   return s;
485 }
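
// For illustration only: __kmp_pow(0.5, 5) walks the bits of y = 5 = 0b101:
//   bit 0 set   -> s = 0.5,   x = 0.25
//   bit 1 clear ->            x = 0.0625
//   bit 2 set   -> s = 0.5 * 0.0625 = 0.03125 = 0.5^5
// i.e. exponentiation by squaring in O(log y) multiplications.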
486 
/* Computes and returns the number of unassigned iterations after idx chunks
   have been assigned (the total number of unassigned iterations in chunks with
   index greater than or equal to idx). __forceinline appears to be broken here:
   if this function is force-inlined, the behavior is wrong (one of the unit
   tests, sch_guided_analytical_basic.cpp, fails). */
492 template <typename T>
493 static __inline typename traits_t<T>::unsigned_t
494 __kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
495                                 typename traits_t<T>::unsigned_t idx) {
496   /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at least for
497      ICL 8.1, long double arithmetic may not really have long double precision,
498      even with /Qlong_double.  Currently, we workaround that in the caller code,
499      by manipulating the FPCW for Windows* OS on IA-32 architecture.  The lack
500      of precision is not expected to be a correctness issue, though. */
501   typedef typename traits_t<T>::unsigned_t UT;
502 
503   long double x = tc * __kmp_pow<UT>(base, idx);
504   UT r = (UT)x;
505   if (x == r)
506     return r;
507   return r + 1;
508 }
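
// For illustration only: with tc = 1000, base = 0.875 (i.e. (2*nproc - 1) /
// (2*nproc) for nproc = 4) and idx = 8, x = 1000 * 0.875^8 ~= 343.6, so 344
// iterations remain unassigned after the first 8 chunks have been handed out.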
509 
510 // Parameters of the guided-iterative algorithm:
511 //   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
512 //   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// By default n = 2. For example, with n = 3 the chunk distribution will be
// flatter.
// With n = 1 the first chunk is the same as for the static schedule, i.e.
// trip / nproc.
516 static int guided_int_param = 2;
517 static double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
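
// For illustration only: with nproc = 8 and chunk = 7, the parameters above
// become p2 = 2 * 8 * (7 + 1) = 128 (the point of switching to dynamic) and
// p3 = 0.5 / 8 = 0.0625 (the remaining-iterations multiplier).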
518 
519 // UT - unsigned flavor of T, ST - signed flavor of T,
520 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
521 template <typename T>
522 static void
523 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
524                     T ub, typename traits_t<T>::signed_t st,
525                     typename traits_t<T>::signed_t chunk, int push_ws) {
526   typedef typename traits_t<T>::unsigned_t UT;
527   typedef typename traits_t<T>::signed_t ST;
528   typedef typename traits_t<T>::floating_t DBL;
529 
530   int active;
531   T tc;
532   kmp_info_t *th;
533   kmp_team_t *team;
534   kmp_uint32 my_buffer_index;
535   dispatch_private_info_template<T> *pr;
536   dispatch_shared_info_template<UT> volatile *sh;
537 
538   KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
539                    sizeof(dispatch_private_info));
540   KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
541                    sizeof(dispatch_shared_info));
542 
543   if (!TCR_4(__kmp_init_parallel))
544     __kmp_parallel_initialize();
545 
546 #if INCLUDE_SSC_MARKS
547   SSC_MARK_DISPATCH_INIT();
548 #endif
549 #ifdef KMP_DEBUG
550   {
551     char *buff;
552     // create format specifiers before the debug output
553     buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
554                             "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
555                             traits_t<ST>::spec, traits_t<T>::spec,
556                             traits_t<T>::spec, traits_t<ST>::spec);
557     KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
558     __kmp_str_free(&buff);
559   }
560 #endif
561   /* setup data */
562   th = __kmp_threads[gtid];
563   team = th->th.th_team;
564   active = !team->t.t_serialized;
565   th->th.th_ident = loc;
566 
567 #if USE_ITT_BUILD
568   kmp_uint64 cur_chunk = chunk;
569   int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
570                                     __kmp_forkjoin_frames_mode == 3 &&
571                                     KMP_MASTER_GTID(gtid) &&
572 #if OMP_40_ENABLED
573                                     th->th.th_teams_microtask == NULL &&
574 #endif
575                                     team->t.t_active_level == 1;
576 #endif
577   if (!active) {
578     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
579         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
580   } else {
581     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
582                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
583 
584     my_buffer_index = th->th.th_dispatch->th_disp_index++;
585 
    /* What happens when the number of threads changes? Do we need to resize
       the buffer? */
587     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
588         &th->th.th_dispatch
589              ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
590     sh = reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
591         &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
592   }
593 
594 #if (KMP_STATIC_STEAL_ENABLED)
595   if (SCHEDULE_HAS_NONMONOTONIC(schedule))
596     // AC: we now have only one implementation of stealing, so use it
597     schedule = kmp_sch_static_steal;
598   else
599 #endif
600     schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
601 
602   /* Pick up the nomerge/ordered bits from the scheduling type */
603   if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
604     pr->nomerge = TRUE;
605     schedule =
606         (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
607   } else {
608     pr->nomerge = FALSE;
609   }
610   pr->type_size = traits_t<T>::type_size; // remember the size of variables
611   if (kmp_ord_lower & schedule) {
612     pr->ordered = TRUE;
613     schedule =
614         (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
615   } else {
616     pr->ordered = FALSE;
617   }
618 
619   if (schedule == kmp_sch_static) {
620     schedule = __kmp_static;
621   } else {
622     if (schedule == kmp_sch_runtime) {
623       // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
624       // not specified)
625       schedule = team->t.t_sched.r_sched_type;
626       // Detail the schedule if needed (global controls are differentiated
627       // appropriately)
628       if (schedule == kmp_sch_guided_chunked) {
629         schedule = __kmp_guided;
630       } else if (schedule == kmp_sch_static) {
631         schedule = __kmp_static;
632       }
633       // Use the chunk size specified by OMP_SCHEDULE (or default if not
634       // specified)
635       chunk = team->t.t_sched.chunk;
636 #if USE_ITT_BUILD
637       cur_chunk = chunk;
638 #endif
639 #ifdef KMP_DEBUG
640       {
641         char *buff;
642         // create format specifiers before the debug output
643         buff = __kmp_str_format(
644             "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
645             traits_t<ST>::spec);
646         KD_TRACE(10, (buff, gtid, schedule, chunk));
647         __kmp_str_free(&buff);
648       }
649 #endif
650     } else {
651       if (schedule == kmp_sch_guided_chunked) {
652         schedule = __kmp_guided;
653       }
654       if (chunk <= 0) {
655         chunk = KMP_DEFAULT_CHUNK;
656       }
657     }
658 
659     if (schedule == kmp_sch_auto) {
      // the mapping and differentiation are done in
      // __kmp_do_serial_initialize()
661       schedule = __kmp_auto;
662 #ifdef KMP_DEBUG
663       {
664         char *buff;
665         // create format specifiers before the debug output
666         buff = __kmp_str_format("__kmp_dispatch_init: kmp_sch_auto: T#%%d new: "
667                                 "schedule:%%d chunk:%%%s\n",
668                                 traits_t<ST>::spec);
669         KD_TRACE(10, (buff, gtid, schedule, chunk));
670         __kmp_str_free(&buff);
671       }
672 #endif
673     }
674 
675     /* guided analytical not safe for too many threads */
676     if (schedule == kmp_sch_guided_analytical_chunked &&
677         th->th.th_team_nproc > 1 << 20) {
678       schedule = kmp_sch_guided_iterative_chunked;
679       KMP_WARNING(DispatchManyThreads);
680     }
681     if (schedule == kmp_sch_runtime_simd) {
682       // compiler provides simd_width in the chunk parameter
683       schedule = team->t.t_sched.r_sched_type;
684       // Detail the schedule if needed (global controls are differentiated
685       // appropriately)
686       if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
687           schedule == __kmp_static) {
688         schedule = kmp_sch_static_balanced_chunked;
689       } else {
690         if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
691           schedule = kmp_sch_guided_simd;
692         }
693         chunk = team->t.t_sched.chunk * chunk;
694       }
695 #if USE_ITT_BUILD
696       cur_chunk = chunk;
697 #endif
698 #ifdef KMP_DEBUG
699       {
700         char *buff;
701         // create format specifiers before the debug output
702         buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d"
703                                 " chunk:%%%s\n",
704                                 traits_t<ST>::spec);
705         KD_TRACE(10, (buff, gtid, schedule, chunk));
706         __kmp_str_free(&buff);
707       }
708 #endif
709     }
710     pr->u.p.parm1 = chunk;
711   }
712   KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
713               "unknown scheduling type");
714 
715   pr->u.p.count = 0;
716 
717   if (__kmp_env_consistency_check) {
718     if (st == 0) {
719       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
720                             (pr->ordered ? ct_pdo_ordered : ct_pdo), loc);
721     }
722   }
723   // compute trip count
724   if (st == 1) { // most common case
725     if (ub >= lb) {
726       tc = ub - lb + 1;
727     } else { // ub < lb
728       tc = 0; // zero-trip
729     }
730   } else if (st < 0) {
731     if (lb >= ub) {
732       // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
733       // where the division needs to be unsigned regardless of the result type
734       tc = (UT)(lb - ub) / (-st) + 1;
735     } else { // lb < ub
736       tc = 0; // zero-trip
737     }
738   } else { // st > 0
739     if (ub >= lb) {
740       // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
741       // where the division needs to be unsigned regardless of the result type
742       tc = (UT)(ub - lb) / st + 1;
743     } else { // ub < lb
744       tc = 0; // zero-trip
745     }
746   }
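
  // For illustration only: lb = 0, ub = 10, st = 3 gives
  // tc = (10 - 0) / 3 + 1 = 4 iterations (i = 0, 3, 6, 9), while
  // lb = 5, ub = 1, st = -2 gives tc = (5 - 1) / 2 + 1 = 3 (i = 5, 3, 1).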
747 
748   // Any half-decent optimizer will remove this test when the blocks are empty
749   // since the macros expand to nothing when statistics are disabled.
750   if (schedule == __kmp_static) {
751     KMP_COUNT_BLOCK(OMP_FOR_static);
752     KMP_COUNT_VALUE(FOR_static_iterations, tc);
753   } else {
754     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
755     KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
756   }
757 
758   pr->u.p.lb = lb;
759   pr->u.p.ub = ub;
760   pr->u.p.st = st;
761   pr->u.p.tc = tc;
762 
763 #if KMP_OS_WINDOWS
764   pr->u.p.last_upper = ub + st;
765 #endif /* KMP_OS_WINDOWS */
766 
767   /* NOTE: only the active parallel region(s) has active ordered sections */
768 
769   if (active) {
770     if (pr->ordered == 0) {
771       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
772       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
773     } else {
774       pr->ordered_bumped = 0;
775 
776       pr->u.p.ordered_lower = 1;
777       pr->u.p.ordered_upper = 0;
778 
779       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
780       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
781     }
782   }
783 
784   if (__kmp_env_consistency_check) {
785     enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
786     if (push_ws) {
787       __kmp_push_workshare(gtid, ws, loc);
788       pr->pushed_ws = ws;
789     } else {
790       __kmp_check_workshare(gtid, ws, loc);
791       pr->pushed_ws = ct_none;
792     }
793   }
794 
795   switch (schedule) {
796 #if (KMP_STATIC_STEAL_ENABLED)
797   case kmp_sch_static_steal: {
798     T nproc = th->th.th_team_nproc;
799     T ntc, init;
800 
801     KD_TRACE(100,
802              ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid));
803 
804     ntc = (tc % chunk ? 1 : 0) + tc / chunk;
805     if (nproc > 1 && ntc >= nproc) {
806       KMP_COUNT_BLOCK(OMP_FOR_static_steal);
807       T id = __kmp_tid_from_gtid(gtid);
808       T small_chunk, extras;
809 
810       small_chunk = ntc / nproc;
811       extras = ntc % nproc;
812 
813       init = id * small_chunk + (id < extras ? id : extras);
814       pr->u.p.count = init;
815       pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
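      // For illustration only: tc = 1000, chunk = 7, nproc = 4 gives
      // ntc = 143 chunks with small_chunk = 35 and extras = 3, so the initial
      // chunk ranges are T0:[0,36), T1:[36,72), T2:[72,108), T3:[108,143);
      // threads that exhaust their own range try to steal chunks from a
      // neighbour in __kmp_dispatch_next.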
816 
817       pr->u.p.parm2 = lb;
818       // pr->pfields.parm3 = 0; // it's not used in static_steal
819       pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
820       pr->u.p.st = st;
821       if (traits_t<T>::type_size > 4) {
822         // AC: TODO: check if 16-byte CAS available and use it to
823         // improve performance (probably wait for explicit request
824         // before spending time on this).
825         // For now use dynamically allocated per-thread lock,
826         // free memory in __kmp_dispatch_next when status==0.
827         KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
828         th->th.th_dispatch->th_steal_lock =
829             (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
830         __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
831       }
832       break;
833     } else {
834       KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
835                      "kmp_sch_static_balanced\n",
836                      gtid));
837       schedule = kmp_sch_static_balanced;
838       /* too few iterations: fall-through to kmp_sch_static_balanced */
839     } // if
840     /* FALL-THROUGH to static balanced */
841   } // case
842 #endif
843   case kmp_sch_static_balanced: {
844     T nproc = th->th.th_team_nproc;
845     T init, limit;
846 
847     KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
848                    gtid));
849 
850     if (nproc > 1) {
851       T id = __kmp_tid_from_gtid(gtid);
852 
853       if (tc < nproc) {
854         if (id < tc) {
855           init = id;
856           limit = id;
857           pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
858         } else {
859           pr->u.p.count = 1; /* means no more chunks to execute */
860           pr->u.p.parm1 = FALSE;
861           break;
862         }
863       } else {
864         T small_chunk = tc / nproc;
865         T extras = tc % nproc;
866         init = id * small_chunk + (id < extras ? id : extras);
867         limit = init + small_chunk - (id < extras ? 0 : 1);
868         pr->u.p.parm1 = (id == nproc - 1);
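        // For illustration only: tc = 10, nproc = 4 gives small_chunk = 2 and
        // extras = 2, so the per-thread ranges are T0:[0,2], T1:[3,5],
        // T2:[6,7], T3:[8,9], and only the last thread gets parm1 == TRUE.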
869       }
870     } else {
871       if (tc > 0) {
872         init = 0;
873         limit = tc - 1;
874         pr->u.p.parm1 = TRUE;
875       } else { // zero trip count
876         pr->u.p.count = 1; /* means no more chunks to execute */
877         pr->u.p.parm1 = FALSE;
878         break;
879       }
880     }
881 #if USE_ITT_BUILD
882     // Calculate chunk for metadata report
883     if (itt_need_metadata_reporting)
884       cur_chunk = limit - init + 1;
885 #endif
886     if (st == 1) {
887       pr->u.p.lb = lb + init;
888       pr->u.p.ub = lb + limit;
889     } else {
      // ub_tmp is the calculated upper bound; "ub" is the user-defined bound
891       T ub_tmp = lb + limit * st;
892       pr->u.p.lb = lb + init * st;
893       // adjust upper bound to "ub" if needed, so that MS lastprivate will match
894       // it exactly
895       if (st > 0) {
896         pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
897       } else {
898         pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
899       }
900     }
901     if (pr->ordered) {
902       pr->u.p.ordered_lower = init;
903       pr->u.p.ordered_upper = limit;
904     }
905     break;
906   } // case
907   case kmp_sch_static_balanced_chunked: {
908     // similar to balanced, but chunk adjusted to multiple of simd width
909     T nth = th->th.th_team_nproc;
910     KD_TRACE(100, ("__kmp_dispatch_init: T#%d runtime(simd:static)"
911                    " -> falling-through to static_greedy\n",
912                    gtid));
913     schedule = kmp_sch_static_greedy;
914     if (nth > 1)
915       pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
916     else
917       pr->u.p.parm1 = tc;
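    // For illustration only: tc = 1000, nth = 4 and chunk (simd width) = 8
    // give ceil(1000 / 4) = 250, rounded up to a multiple of 8: parm1 = 256.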
918     break;
919   } // case
920   case kmp_sch_guided_iterative_chunked:
921   case kmp_sch_guided_simd: {
922     T nproc = th->th.th_team_nproc;
923     KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked"
924                    " case\n",
925                    gtid));
926 
927     if (nproc > 1) {
928       if ((2L * chunk + 1) * nproc >= tc) {
929         /* chunk size too large, switch to dynamic */
930         schedule = kmp_sch_dynamic_chunked;
931       } else {
        // when the remaining iterations drop below parm2, switch to dynamic
933         pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
934         *(double *)&pr->u.p.parm3 =
935             guided_flt_param / nproc; // may occupy parm3 and parm4
936       }
937     } else {
938       KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
939                      "kmp_sch_static_greedy\n",
940                      gtid));
941       schedule = kmp_sch_static_greedy;
942       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
943       KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",
944                      gtid));
945       pr->u.p.parm1 = tc;
946     } // if
947   } // case
948   break;
949   case kmp_sch_guided_analytical_chunked: {
950     T nproc = th->th.th_team_nproc;
951     KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked"
952                    " case\n",
953                    gtid));
954     if (nproc > 1) {
955       if ((2L * chunk + 1) * nproc >= tc) {
956         /* chunk size too large, switch to dynamic */
957         schedule = kmp_sch_dynamic_chunked;
958       } else {
959         /* commonly used term: (2 nproc - 1)/(2 nproc) */
960         DBL x;
961 
962 #if KMP_OS_WINDOWS && KMP_ARCH_X86
963         /* Linux* OS already has 64-bit computation by default for long double,
964            and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
965            Windows* OS on IA-32 architecture, we need to set precision to 64-bit
966            instead of the default 53-bit. Even though long double doesn't work
967            on Windows* OS on Intel(R) 64, the resulting lack of precision is not
968            expected to impact the correctness of the algorithm, but this has not
969            been mathematically proven. */
970         // save original FPCW and set precision to 64-bit, as
971         // Windows* OS on IA-32 architecture defaults to 53-bit
972         unsigned int oldFpcw = _control87(0, 0);
973         _control87(_PC_64, _MCW_PC); // 0,0x30000
974 #endif
975         /* value used for comparison in solver for cross-over point */
976         long double target = ((long double)chunk * 2 + 1) * nproc / tc;
977 
978         /* crossover point--chunk indexes equal to or greater than
979            this point switch to dynamic-style scheduling */
980         UT cross;
981 
982         /* commonly used term: (2 nproc - 1)/(2 nproc) */
983         x = (long double)1.0 - (long double)0.5 / nproc;
984 
985 #ifdef KMP_DEBUG
986         { // test natural alignment
987           struct _test_a {
988             char a;
989             union {
990               char b;
991               DBL d;
992             };
993           } t;
994           ptrdiff_t natural_alignment =
995               (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
996           //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
997           // long)natural_alignment );
998           KMP_DEBUG_ASSERT(
999               (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
1000         }
1001 #endif // KMP_DEBUG
1002 
1003         /* save the term in thread private dispatch structure */
1004         *(DBL *)&pr->u.p.parm3 = x;
1005 
1006         /* solve for the crossover point to the nearest integer i for which C_i
1007            <= chunk */
1008         {
1009           UT left, right, mid;
1010           long double p;
1011 
1012           /* estimate initial upper and lower bound */
1013 
1014           /* doesn't matter what value right is as long as it is positive, but
1015              it affects performance of the solver */
1016           right = 229;
1017           p = __kmp_pow<UT>(x, right);
1018           if (p > target) {
1019             do {
1020               p *= p;
1021               right <<= 1;
1022             } while (p > target && right < (1 << 27));
1023             /* lower bound is previous (failed) estimate of upper bound */
1024             left = right >> 1;
1025           } else {
1026             left = 0;
1027           }
1028 
1029           /* bisection root-finding method */
1030           while (left + 1 < right) {
1031             mid = (left + right) / 2;
1032             if (__kmp_pow<UT>(x, mid) > target) {
1033               left = mid;
1034             } else {
1035               right = mid;
1036             }
1037           } // while
1038           cross = right;
1039         }
1040         /* assert sanity of computed crossover point */
1041         KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
1042                    __kmp_pow<UT>(x, cross) <= target);
1043 
1044         /* save the crossover point in thread private dispatch structure */
1045         pr->u.p.parm2 = cross;
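        // For illustration only: nproc = 4, chunk = 7, tc = 1000 give
        // x = 0.875 and target = 15 * 4 / 1000 = 0.06; the bisection yields
        // cross = 22, since 0.875^21 ~= 0.0606 > 0.06 >= 0.875^22 ~= 0.0530.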
1046 
1047 // C75803
1048 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
1049 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
1050 #else
1051 #define GUIDED_ANALYTICAL_WORKAROUND (x)
1052 #endif
1053         /* dynamic-style scheduling offset */
1054         pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
1055                                  tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
1056                         cross * chunk;
1057 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1058         // restore FPCW
1059         _control87(oldFpcw, _MCW_PC);
1060 #endif
1061       } // if
1062     } else {
1063       KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
1064                      "kmp_sch_static_greedy\n",
1065                      gtid));
1066       schedule = kmp_sch_static_greedy;
1067       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1068       pr->u.p.parm1 = tc;
1069     } // if
1070   } // case
1071   break;
1072   case kmp_sch_static_greedy:
1073     KD_TRACE(100,
1074              ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n", gtid));
1075     pr->u.p.parm1 = (th->th.th_team_nproc > 1)
1076                         ? (tc + th->th.th_team_nproc - 1) / th->th.th_team_nproc
1077                         : tc;
1078     break;
1079   case kmp_sch_static_chunked:
1080   case kmp_sch_dynamic_chunked:
1081     if (pr->u.p.parm1 <= 0) {
1082       pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
1083     }
1084     KD_TRACE(100, ("__kmp_dispatch_init: T#%d "
1085                    "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
1086                    gtid));
1087     break;
1088   case kmp_sch_trapezoidal: {
1089     /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1090 
1091     T parm1, parm2, parm3, parm4;
1092     KD_TRACE(100,
1093              ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid));
1094 
1095     parm1 = chunk;
1096 
1097     /* F : size of the first cycle */
1098     parm2 = (tc / (2 * th->th.th_team_nproc));
1099 
1100     if (parm2 < 1) {
1101       parm2 = 1;
1102     }
1103 
1104     /* L : size of the last cycle.  Make sure the last cycle is not larger
1105        than the first cycle. */
1106     if (parm1 < 1) {
1107       parm1 = 1;
1108     } else if (parm1 > parm2) {
1109       parm1 = parm2;
1110     }
1111 
1112     /* N : number of cycles */
1113     parm3 = (parm2 + parm1);
1114     parm3 = (2 * tc + parm3 - 1) / parm3;
1115 
1116     if (parm3 < 2) {
1117       parm3 = 2;
1118     }
1119 
1120     /* sigma : decreasing incr of the trapezoid */
1121     parm4 = (parm3 - 1);
1122     parm4 = (parm2 - parm1) / parm4;
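
    // For illustration only: tc = 1000, nproc = 4, chunk = 10 give
    // parm2 (first chunk) = 1000 / 8 = 125, parm1 (last chunk) = 10,
    // parm3 (number of chunks) = (2000 + 134) / 135 = 15 and
    // parm4 (decrement) = (125 - 10) / 14 = 8, so chunk sizes shrink roughly
    // as 125, 117, 109, ... down towards the minimum chunk size.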
1123 
1124     // pointless check, because parm4 >= 0 always
1125     // if ( parm4 < 0 ) {
1126     //    parm4 = 0;
1127     //}
1128 
1129     pr->u.p.parm1 = parm1;
1130     pr->u.p.parm2 = parm2;
1131     pr->u.p.parm3 = parm3;
1132     pr->u.p.parm4 = parm4;
1133   } // case
1134   break;
1135 
1136   default: {
1137     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1138                 KMP_HNT(GetNewerLibrary), // Hint
1139                 __kmp_msg_null // Variadic argument list terminator
1140                 );
1141   } break;
1142   } // switch
1143   pr->schedule = schedule;
1144   if (active) {
    /* This buffer becomes free to use once sh->buffer_index reaches
     * my_buffer_index (see the wait below) */
1147 
1148     KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
1149                    "sh->buffer_index:%d\n",
1150                    gtid, my_buffer_index, sh->buffer_index));
1151     __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index,
1152                                  __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
1153     // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and
1154     // my_buffer_index are *always* 32-bit integers.
1155     KMP_MB(); /* is this necessary? */
1156     KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
1157                    "sh->buffer_index:%d\n",
1158                    gtid, my_buffer_index, sh->buffer_index));
1159 
1160     th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
1161     th->th.th_dispatch->th_dispatch_sh_current =
1162         CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
1163 #if USE_ITT_BUILD
1164     if (pr->ordered) {
1165       __kmp_itt_ordered_init(gtid);
1166     }
1167     // Report loop metadata
1168     if (itt_need_metadata_reporting) {
1169       // Only report metadata by master of active team at level 1
1170       kmp_uint64 schedtype = 0;
1171       switch (schedule) {
1172       case kmp_sch_static_chunked:
1173       case kmp_sch_static_balanced: // Chunk is calculated in the switch above
1174         break;
1175       case kmp_sch_static_greedy:
1176         cur_chunk = pr->u.p.parm1;
1177         break;
1178       case kmp_sch_dynamic_chunked:
1179         schedtype = 1;
1180         break;
1181       case kmp_sch_guided_iterative_chunked:
1182       case kmp_sch_guided_analytical_chunked:
1183       case kmp_sch_guided_simd:
1184         schedtype = 2;
1185         break;
1186       default:
1187         // Should we put this case under "static"?
1188         // case kmp_sch_static_steal:
1189         schedtype = 3;
1190         break;
1191       }
1192       __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1193     }
1194 #endif /* USE_ITT_BUILD */
1195   }
1196 
1197 #ifdef KMP_DEBUG
1198   {
1199     char *buff;
1200     // create format specifiers before the debug output
1201     buff = __kmp_str_format(
1202         "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
1203         "lb:%%%s ub:%%%s"
1204         " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
1205         " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1206         traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
1207         traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1208         traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
1209         traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
1210     KD_TRACE(10, (buff, gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1211                   pr->u.p.st, pr->u.p.tc, pr->u.p.count, pr->u.p.ordered_lower,
1212                   pr->u.p.ordered_upper, pr->u.p.parm1, pr->u.p.parm2,
1213                   pr->u.p.parm3, pr->u.p.parm4));
1214     __kmp_str_free(&buff);
1215   }
1216 #endif
1217 #if (KMP_STATIC_STEAL_ENABLED)
  // It cannot be guaranteed that after execution of a loop with some other
  // schedule kind all the parm3 variables will contain the same value. Even if
  // all parm3 values were the same, there would still be a bad case, e.g.
  // using 0 and 1 rather than a program-lifetime increment, so a dedicated
  // variable is required; 'static_steal_counter' serves that purpose.
1223   if (schedule == kmp_sch_static_steal) {
1224     // Other threads will inspect this variable when searching for a victim.
1225     // This is a flag showing that other threads may steal from this thread
1226     // since then.
1227     volatile T *p = &pr->u.p.static_steal_counter;
1228     *p = *p + 1;
1229   }
1230 #endif // ( KMP_STATIC_STEAL_ENABLED )
1231 
1232 #if OMPT_SUPPORT && OMPT_OPTIONAL
1233   if (ompt_enabled.ompt_callback_work) {
1234     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1235     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1236     kmp_info_t *thr = __kmp_threads[gtid];
1237     ompt_callbacks.ompt_callback(ompt_callback_work)(
1238         ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
1239         &(task_info->task_data), tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
1240   }
1241 #endif
1242 }
1243 
1244 /* For ordered loops, either __kmp_dispatch_finish() should be called after
1245  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1246  * every chunk of iterations.  If the ordered section(s) were not executed
1247  * for this iteration (or every iteration in this chunk), we need to set the
1248  * ordered iteration counters so that the next thread can proceed. */
1249 template <typename UT>
1250 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
1251   typedef typename traits_t<UT>::signed_t ST;
1252   kmp_info_t *th = __kmp_threads[gtid];
1253 
1254   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1255   if (!th->th.th_team->t.t_serialized) {
1256 
1257     dispatch_private_info_template<UT> *pr =
1258         reinterpret_cast<dispatch_private_info_template<UT> *>(
1259             th->th.th_dispatch->th_dispatch_pr_current);
1260     dispatch_shared_info_template<UT> volatile *sh =
1261         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1262             th->th.th_dispatch->th_dispatch_sh_current);
1263     KMP_DEBUG_ASSERT(pr);
1264     KMP_DEBUG_ASSERT(sh);
1265     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1266                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1267 
1268     if (pr->ordered_bumped) {
1269       KD_TRACE(
1270           1000,
1271           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1272            gtid));
1273       pr->ordered_bumped = 0;
1274     } else {
1275       UT lower = pr->u.p.ordered_lower;
1276 
1277 #ifdef KMP_DEBUG
1278       {
1279         char *buff;
1280         // create format specifiers before the debug output
1281         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1282                                 "ordered_iteration:%%%s lower:%%%s\n",
1283                                 traits_t<UT>::spec, traits_t<UT>::spec);
1284         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1285         __kmp_str_free(&buff);
1286       }
1287 #endif
1288 
1289       __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1290                            __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1291       KMP_MB(); /* is this necessary? */
1292 #ifdef KMP_DEBUG
1293       {
1294         char *buff;
1295         // create format specifiers before the debug output
1296         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1297                                 "ordered_iteration:%%%s lower:%%%s\n",
1298                                 traits_t<UT>::spec, traits_t<UT>::spec);
1299         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1300         __kmp_str_free(&buff);
1301       }
1302 #endif
1303 
1304       test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1305     } // if
1306   } // if
1307   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1308 }
1309 
1310 #ifdef KMP_GOMP_COMPAT
1311 
1312 template <typename UT>
1313 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1314   typedef typename traits_t<UT>::signed_t ST;
1315   kmp_info_t *th = __kmp_threads[gtid];
1316 
1317   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1318   if (!th->th.th_team->t.t_serialized) {
1319     //        int cid;
1320     dispatch_private_info_template<UT> *pr =
1321         reinterpret_cast<dispatch_private_info_template<UT> *>(
1322             th->th.th_dispatch->th_dispatch_pr_current);
1323     dispatch_shared_info_template<UT> volatile *sh =
1324         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1325             th->th.th_dispatch->th_dispatch_sh_current);
1326     KMP_DEBUG_ASSERT(pr);
1327     KMP_DEBUG_ASSERT(sh);
1328     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1329                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1330 
1331     //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1332     UT lower = pr->u.p.ordered_lower;
1333     UT upper = pr->u.p.ordered_upper;
1334     UT inc = upper - lower + 1;
1335 
1336     if (pr->ordered_bumped == inc) {
1337       KD_TRACE(
1338           1000,
1339           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1340            gtid));
1341       pr->ordered_bumped = 0;
1342     } else {
1343       inc -= pr->ordered_bumped;
1344 
1345 #ifdef KMP_DEBUG
1346       {
1347         char *buff;
1348         // create format specifiers before the debug output
1349         buff = __kmp_str_format(
1350             "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1351             "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1352             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1353         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1354         __kmp_str_free(&buff);
1355       }
1356 #endif
1357 
1358       __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1359                            __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1360 
1361       KMP_MB(); /* is this necessary? */
1362       KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1363                       "ordered_bumped to zero\n",
1364                       gtid));
1365       pr->ordered_bumped = 0;
1366 //!!!!! TODO check if the inc should be unsigned, or signed???
1367 #ifdef KMP_DEBUG
1368       {
1369         char *buff;
1370         // create format specifiers before the debug output
1371         buff = __kmp_str_format(
1372             "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1373             "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1374             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1375             traits_t<UT>::spec);
1376         KD_TRACE(1000,
1377                  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1378         __kmp_str_free(&buff);
1379       }
1380 #endif
1381 
1382       test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1383     }
1384     //        }
1385   }
1386   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1387 }
1388 
1389 #endif /* KMP_GOMP_COMPAT */
1390 
1391 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1392    work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1393    is not called. */
1394 #if OMPT_SUPPORT && OMPT_OPTIONAL
1395 #define OMPT_LOOP_END                                                          \
1396   if (status == 0) {                                                           \
1397     if (ompt_enabled.ompt_callback_work) {                                     \
1398       ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
1399       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
1400       ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
1401           ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
1402           &(task_info->task_data), 0, codeptr);                                \
1403     }                                                                          \
1404   }
1405 // TODO: implement count
1406 #else
1407 #define OMPT_LOOP_END // no-op
1408 #endif
1409 
1410 template <typename T>
1411 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1412                                T *p_lb, T *p_ub,
1413                                typename traits_t<T>::signed_t *p_st
1414 #if OMPT_SUPPORT && OMPT_OPTIONAL
1415                                ,
1416                                void *codeptr
1417 #endif
1418                                ) {
1419 
1420   typedef typename traits_t<T>::unsigned_t UT;
1421   typedef typename traits_t<T>::signed_t ST;
1422   typedef typename traits_t<T>::floating_t DBL;
1423 
  // This is potentially slightly misleading, schedule(runtime) will appear here
  // even if the actual runtime schedule is static. (Which points out a
  // disadvantage of schedule(runtime): even when static scheduling is used it
  // costs more than a compile-time choice to use static scheduling would.)
1428   KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling);
1429 
1430   int status;
1431   dispatch_private_info_template<T> *pr;
1432   kmp_info_t *th = __kmp_threads[gtid];
1433   kmp_team_t *team = th->th.th_team;
1434 
1435   KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1436 #ifdef KMP_DEBUG
1437   {
1438     char *buff;
1439     // create format specifiers before the debug output
1440     buff = __kmp_str_format("__kmp_dispatch_next: T#%%d called p_lb:%%%s "
1441                             "p_ub:%%%s p_st:%%%s p_last: %%p\n",
1442                             traits_t<T>::spec, traits_t<T>::spec,
1443                             traits_t<ST>::spec);
1444     KD_TRACE(1000, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last));
1445     __kmp_str_free(&buff);
1446   }
1447 #endif
1448 
1449   if (team->t.t_serialized) {
    /* NOTE: serialize this dispatch because we are not at the active level */
1451     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1452         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1453     KMP_DEBUG_ASSERT(pr);
1454 
1455     if ((status = (pr->u.p.tc != 0)) == 0) {
1456       *p_lb = 0;
1457       *p_ub = 0;
1458       //            if ( p_last != NULL )
1459       //                *p_last = 0;
1460       if (p_st != NULL)
1461         *p_st = 0;
1462       if (__kmp_env_consistency_check) {
1463         if (pr->pushed_ws != ct_none) {
1464           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1465         }
1466       }
1467     } else if (pr->nomerge) {
1468       kmp_int32 last;
1469       T start;
1470       UT limit, trip, init;
1471       ST incr;
1472       T chunk = pr->u.p.parm1;
1473 
1474       KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1475                      gtid));
1476 
1477       init = chunk * pr->u.p.count++;
1478       trip = pr->u.p.tc - 1;
1479 
1480       if ((status = (init <= trip)) == 0) {
1481         *p_lb = 0;
1482         *p_ub = 0;
1483         //                if ( p_last != NULL )
1484         //                    *p_last = 0;
1485         if (p_st != NULL)
1486           *p_st = 0;
1487         if (__kmp_env_consistency_check) {
1488           if (pr->pushed_ws != ct_none) {
1489             pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1490           }
1491         }
1492       } else {
1493         start = pr->u.p.lb;
1494         limit = chunk + init - 1;
1495         incr = pr->u.p.st;
1496 
1497         if ((last = (limit >= trip)) != 0) {
1498           limit = trip;
1499 #if KMP_OS_WINDOWS
1500           pr->u.p.last_upper = pr->u.p.ub;
1501 #endif /* KMP_OS_WINDOWS */
1502         }
1503         if (p_last != NULL)
1504           *p_last = last;
1505         if (p_st != NULL)
1506           *p_st = incr;
1507         if (incr == 1) {
1508           *p_lb = start + init;
1509           *p_ub = start + limit;
1510         } else {
1511           *p_lb = start + init * incr;
1512           *p_ub = start + limit * incr;
1513         }
1514 
1515         if (pr->ordered) {
1516           pr->u.p.ordered_lower = init;
1517           pr->u.p.ordered_upper = limit;
1518 #ifdef KMP_DEBUG
1519           {
1520             char *buff;
1521             // create format specifiers before the debug output
1522             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1523                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
1524                                     traits_t<UT>::spec, traits_t<UT>::spec);
1525             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1526                             pr->u.p.ordered_upper));
1527             __kmp_str_free(&buff);
1528           }
1529 #endif
1530         } // if
1531       } // if
1532     } else {
1533       pr->u.p.tc = 0;
1534       *p_lb = pr->u.p.lb;
1535       *p_ub = pr->u.p.ub;
1536 #if KMP_OS_WINDOWS
1537       pr->u.p.last_upper = *p_ub;
1538 #endif /* KMP_OS_WINDOWS */
1539       if (p_last != NULL)
1540         *p_last = TRUE;
1541       if (p_st != NULL)
1542         *p_st = pr->u.p.st;
1543     } // if
1544 #ifdef KMP_DEBUG
1545     {
1546       char *buff;
1547       // create format specifiers before the debug output
1548       buff = __kmp_str_format(
1549           "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
1550           "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
1551           traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1552       KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
1553       __kmp_str_free(&buff);
1554     }
1555 #endif
1556 #if INCLUDE_SSC_MARKS
1557     SSC_MARK_DISPATCH_NEXT();
1558 #endif
1559     OMPT_LOOP_END;
1560     return status;
1561   } else {
1562     kmp_int32 last = 0;
1563     dispatch_shared_info_template<UT> *sh;
1564     T start;
1565     ST incr;
1566     UT limit, trip, init;
1567 
1568     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1569                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1570 
1571     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1572         th->th.th_dispatch->th_dispatch_pr_current);
1573     KMP_DEBUG_ASSERT(pr);
1574     sh = reinterpret_cast<dispatch_shared_info_template<UT> *>(
1575         th->th.th_dispatch->th_dispatch_sh_current);
1576     KMP_DEBUG_ASSERT(sh);
1577 
1578     if (pr->u.p.tc == 0) {
1579       // zero trip count
1580       status = 0;
1581     } else {
1582       switch (pr->schedule) {
1583 #if (KMP_STATIC_STEAL_ENABLED)
1584       case kmp_sch_static_steal: {
1585         T chunk = pr->u.p.parm1;
1586         int nproc = th->th.th_team_nproc;
1587 
1588         KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n",
1589                        gtid));
1590 
1591         trip = pr->u.p.tc - 1;
1592 
1593         if (traits_t<T>::type_size > 4) {
          // use a lock for 8-byte induction variables and CAS for 4-byte
          // ones. TODO (optional): check and use 16-byte CAS
1596           kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
1597           KMP_DEBUG_ASSERT(lck != NULL);
1598           if (pr->u.p.count < (UT)pr->u.p.ub) {
1599             __kmp_acquire_lock(lck, gtid);
1600             // try to get own chunk of iterations
1601             init = (pr->u.p.count)++;
1602             status = (init < (UT)pr->u.p.ub);
1603             __kmp_release_lock(lck, gtid);
1604           } else {
1605             status = 0; // no own chunks
1606           }
1607           if (!status) { // try to steal
1608             kmp_info_t **other_threads = team->t.t_threads;
1609             int while_limit = nproc; // nproc attempts to find a victim
1610             int while_index = 0;
1611             // TODO: algorithm of searching for a victim
1612             // should be cleaned up and measured
1613             while ((!status) && (while_limit != ++while_index)) {
1614               T remaining;
1615               T victimIdx = pr->u.p.parm4;
1616               T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1617               dispatch_private_info_template<T> *victim =
1618                   reinterpret_cast<dispatch_private_info_template<T> *>(
1619                       other_threads[victimIdx]
1620                           ->th.th_dispatch->th_dispatch_pr_current);
1621               while ((victim == NULL || victim == pr ||
1622                       (*(volatile T *)&victim->u.p.static_steal_counter !=
1623                        *(volatile T *)&pr->u.p.static_steal_counter)) &&
1624                      oldVictimIdx != victimIdx) {
1625                 victimIdx = (victimIdx + 1) % nproc;
1626                 victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1627                     other_threads[victimIdx]
1628                         ->th.th_dispatch->th_dispatch_pr_current);
1629               }
1630               if (!victim ||
1631                   (*(volatile T *)&victim->u.p.static_steal_counter !=
1632                    *(volatile T *)&pr->u.p.static_steal_counter)) {
                // no victim is ready yet to participate in stealing because
                // all victims are still in kmp_init_dispatch
                continue; // try once more (nproc attempts in total)
1636               }
1637               if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1638                 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1639                 continue; // not enough chunks to steal, goto next victim
1640               }
1641 
1642               lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1643               KMP_ASSERT(lck != NULL);
1644               __kmp_acquire_lock(lck, gtid);
1645               limit = victim->u.p.ub; // keep initial ub
1646               if (victim->u.p.count >= limit ||
1647                   (remaining = limit - victim->u.p.count) < 2) {
1648                 __kmp_release_lock(lck, gtid);
1649                 pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1650                 continue; // not enough chunks to steal
1651               }
              // stealing succeeded; reduce victim's ub by 1/4 of the undone
              // chunks or by 1
1654               if (remaining > 3) {
1655                 KMP_COUNT_VALUE(FOR_static_steal_stolen, remaining >> 2);
1656                 init = (victim->u.p.ub -=
1657                         (remaining >> 2)); // steal 1/4 of remaining
1658               } else {
1659                 KMP_COUNT_VALUE(FOR_static_steal_stolen, 1);
1660                 init =
1661                     (victim->u.p.ub -= 1); // steal 1 chunk of 2 or 3 remaining
1662               }
1663               __kmp_release_lock(lck, gtid);
1664 
1665               KMP_DEBUG_ASSERT(init + 1 <= limit);
1666               pr->u.p.parm4 = victimIdx; // remember victim to steal from
1667               status = 1;
1668               while_index = 0;
              // now record the stolen range as our own; chunk 'init' is used
              // right away, so count starts at init + 1
1670               __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1671               pr->u.p.count = init + 1;
1672               pr->u.p.ub = limit;
1673               __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1674             } // while (search for victim)
1675           } // if (try to find victim and steal)
1676         } else {
1677           // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1678           typedef union {
1679             struct {
1680               UT count;
1681               T ub;
1682             } p;
1683             kmp_int64 b;
1684           } union_i4;
          // All updates to 'count' and 'ub' must be performed atomically as a
          // pair.
1687           {
1688             union_i4 vold, vnew;
1689             vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1690             vnew = vold;
1691             vnew.p.count++;
1692             while (!KMP_COMPARE_AND_STORE_ACQ64(
1693                 (volatile kmp_int64 *)&pr->u.p.count,
1694                 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1695                 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1696               KMP_CPU_PAUSE();
1697               vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1698               vnew = vold;
1699               vnew.p.count++;
1700             }
1701             vnew = vold;
1702             init = vnew.p.count;
1703             status = (init < (UT)vnew.p.ub);
1704           }
1705 
1706           if (!status) {
1707             kmp_info_t **other_threads = team->t.t_threads;
1708             int while_limit = nproc; // nproc attempts to find a victim
1709             int while_index = 0;
1710 
1711             // TODO: algorithm of searching for a victim
1712             // should be cleaned up and measured
1713             while ((!status) && (while_limit != ++while_index)) {
1714               union_i4 vold, vnew;
1715               kmp_int32 remaining;
1716               T victimIdx = pr->u.p.parm4;
1717               T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1718               dispatch_private_info_template<T> *victim =
1719                   reinterpret_cast<dispatch_private_info_template<T> *>(
1720                       other_threads[victimIdx]
1721                           ->th.th_dispatch->th_dispatch_pr_current);
1722               while ((victim == NULL || victim == pr ||
1723                       (*(volatile T *)&victim->u.p.static_steal_counter !=
1724                        *(volatile T *)&pr->u.p.static_steal_counter)) &&
1725                      oldVictimIdx != victimIdx) {
1726                 victimIdx = (victimIdx + 1) % nproc;
1727                 victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1728                     other_threads[victimIdx]
1729                         ->th.th_dispatch->th_dispatch_pr_current);
1730               }
1731               if (!victim ||
1732                   (*(volatile T *)&victim->u.p.static_steal_counter !=
1733                    *(volatile T *)&pr->u.p.static_steal_counter)) {
                // no victim is ready yet to participate in stealing because
                // all victims are still in kmp_init_dispatch
                continue; // try once more (nproc attempts in total)
1737               }
1738               pr->u.p.parm4 = victimIdx; // new victim found
1739               while (1) { // CAS loop if victim has enough chunks to steal
1740                 vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1741                 vnew = vold;
1742 
1743                 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1744                 if (vnew.p.count >= (UT)vnew.p.ub ||
1745                     (remaining = vnew.p.ub - vnew.p.count) < 2) {
1746                   pr->u.p.parm4 =
1747                       (victimIdx + 1) % nproc; // shift start victim id
1748                   break; // not enough chunks to steal, goto next victim
1749                 }
1750                 if (remaining > 3) {
1751                   vnew.p.ub -= (remaining >> 2); // try to steal 1/4 remaining
1752                 } else {
1753                   vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1754                 }
1755                 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1756                 // TODO: Should this be acquire or release?
1757                 if (KMP_COMPARE_AND_STORE_ACQ64(
1758                         (volatile kmp_int64 *)&victim->u.p.count,
1759                         *VOLATILE_CAST(kmp_int64 *) & vold.b,
1760                         *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1761                   // stealing succeeded
1762                   KMP_COUNT_VALUE(FOR_static_steal_stolen,
1763                                   vold.p.ub - vnew.p.ub);
1764                   status = 1;
1765                   while_index = 0;
1766                   // now update own count and ub
1767                   init = vnew.p.ub;
1768                   vold.p.count = init + 1;
1769 #if KMP_ARCH_X86
1770                   KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count),
1771                                    vold.b);
1772 #else
1773                   *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1774 #endif
1775                   break;
1776                 } // if (check CAS result)
1777                 KMP_CPU_PAUSE(); // CAS failed, repeat attempt
1778               } // while (try to steal from particular victim)
1779             } // while (search for victim)
1780           } // if (try to find victim and steal)
1781         } // if (4-byte induction variable)
1782         if (!status) {
1783           *p_lb = 0;
1784           *p_ub = 0;
1785           if (p_st != NULL)
1786             *p_st = 0;
1787         } else {
1788           start = pr->u.p.parm2;
1789           init *= chunk;
1790           limit = chunk + init - 1;
1791           incr = pr->u.p.st;
1792           KMP_COUNT_VALUE(FOR_static_steal_chunks, 1);
1793 
1794           KMP_DEBUG_ASSERT(init <= trip);
1795           if ((last = (limit >= trip)) != 0)
1796             limit = trip;
1797           if (p_st != NULL)
1798             *p_st = incr;
1799 
1800           if (incr == 1) {
1801             *p_lb = start + init;
1802             *p_ub = start + limit;
1803           } else {
1804             *p_lb = start + init * incr;
1805             *p_ub = start + limit * incr;
1806           }
1807 
1808           if (pr->ordered) {
1809             pr->u.p.ordered_lower = init;
1810             pr->u.p.ordered_upper = limit;
1811 #ifdef KMP_DEBUG
1812             {
1813               char *buff;
1814               // create format specifiers before the debug output
1815               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1816                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
1817                                       traits_t<UT>::spec, traits_t<UT>::spec);
1818               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1819                               pr->u.p.ordered_upper));
1820               __kmp_str_free(&buff);
1821             }
1822 #endif
1823           } // if
1824         } // if
1825         break;
1826       } // case
1827 #endif // ( KMP_STATIC_STEAL_ENABLED )
1828       case kmp_sch_static_balanced: {
1829         KD_TRACE(
1830             100,
1831             ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid));
1832         if ((status = !pr->u.p.count) !=
1833             0) { /* check if thread has any iteration to do */
1834           pr->u.p.count = 1;
1835           *p_lb = pr->u.p.lb;
1836           *p_ub = pr->u.p.ub;
1837           last = pr->u.p.parm1;
1838           if (p_st != NULL)
1839             *p_st = pr->u.p.st;
1840         } else { /* no iterations to do */
1841           pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1842         }
1843         if (pr->ordered) {
1844 #ifdef KMP_DEBUG
1845           {
1846             char *buff;
1847             // create format specifiers before the debug output
1848             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1849                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
1850                                     traits_t<UT>::spec, traits_t<UT>::spec);
1851             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1852                             pr->u.p.ordered_upper));
1853             __kmp_str_free(&buff);
1854           }
1855 #endif
1856         } // if
1857       } // case
1858       break;
1859       case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1860                                      merged here */
1861       case kmp_sch_static_chunked: {
1862         T parm1;
1863 
1864         KD_TRACE(100, ("__kmp_dispatch_next: T#%d "
1865                        "kmp_sch_static_[affinity|chunked] case\n",
1866                        gtid));
1867         parm1 = pr->u.p.parm1;
1868 
1869         trip = pr->u.p.tc - 1;
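        // each thread starts at its own tid-th chunk and, because count is
        // advanced by nproc below, each successive call takes every
        // nproc-th chunk thereafter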
1870         init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1871 
1872         if ((status = (init <= trip)) != 0) {
1873           start = pr->u.p.lb;
1874           incr = pr->u.p.st;
1875           limit = parm1 + init - 1;
1876 
1877           if ((last = (limit >= trip)) != 0)
1878             limit = trip;
1879 
1880           if (p_st != NULL)
1881             *p_st = incr;
1882 
1883           pr->u.p.count += th->th.th_team_nproc;
1884 
1885           if (incr == 1) {
1886             *p_lb = start + init;
1887             *p_ub = start + limit;
1888           } else {
1889             *p_lb = start + init * incr;
1890             *p_ub = start + limit * incr;
1891           }
1892 
1893           if (pr->ordered) {
1894             pr->u.p.ordered_lower = init;
1895             pr->u.p.ordered_upper = limit;
1896 #ifdef KMP_DEBUG
1897             {
1898               char *buff;
1899               // create format specifiers before the debug output
1900               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1901                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
1902                                       traits_t<UT>::spec, traits_t<UT>::spec);
1903               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1904                               pr->u.p.ordered_upper));
1905               __kmp_str_free(&buff);
1906             }
1907 #endif
1908           } // if
1909         } // if
1910       } // case
1911       break;
1912 
1913       case kmp_sch_dynamic_chunked: {
1914         T chunk = pr->u.p.parm1;
1915 
1916         KD_TRACE(
1917             100,
1918             ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid));
1919 
1920         init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1921         trip = pr->u.p.tc - 1;
1922 
1923         if ((status = (init <= trip)) == 0) {
1924           *p_lb = 0;
1925           *p_ub = 0;
1926           if (p_st != NULL)
1927             *p_st = 0;
1928         } else {
1929           start = pr->u.p.lb;
1930           limit = chunk + init - 1;
1931           incr = pr->u.p.st;
1932 
1933           if ((last = (limit >= trip)) != 0)
1934             limit = trip;
1935 
1936           if (p_st != NULL)
1937             *p_st = incr;
1938 
1939           if (incr == 1) {
1940             *p_lb = start + init;
1941             *p_ub = start + limit;
1942           } else {
1943             *p_lb = start + init * incr;
1944             *p_ub = start + limit * incr;
1945           }
1946 
1947           if (pr->ordered) {
1948             pr->u.p.ordered_lower = init;
1949             pr->u.p.ordered_upper = limit;
1950 #ifdef KMP_DEBUG
1951             {
1952               char *buff;
1953               // create format specifiers before the debug output
1954               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1955                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
1956                                       traits_t<UT>::spec, traits_t<UT>::spec);
1957               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1958                               pr->u.p.ordered_upper));
1959               __kmp_str_free(&buff);
1960             }
1961 #endif
1962           } // if
1963         } // if
1964       } // case
1965       break;
1966 
1967       case kmp_sch_guided_iterative_chunked: {
1968         T chunkspec = pr->u.p.parm1;
1969         KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked "
1970                        "iterative case\n",
1971                        gtid));
1972         trip = pr->u.p.tc;
1973         // Start atomic part of calculations
1974         while (1) {
1975           ST remaining; // signed, because can be < 0
1976           init = sh->u.s.iteration; // shared value
1977           remaining = trip - init;
1978           if (remaining <= 0) { // AC: need to compare with 0 first
1979             // nothing to do, don't try atomic op
1980             status = 0;
1981             break;
1982           }
1983           if ((T)remaining <
1984               pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
            // use dynamic-style schedule
            // atomically increment iterations, get old value
1987             init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1988                                      (ST)chunkspec);
1989             remaining = trip - init;
1990             if (remaining <= 0) {
1991               status = 0; // all iterations got by other threads
1992             } else { // got some iterations to work on
1993               status = 1;
1994               if ((T)remaining > chunkspec) {
1995                 limit = init + chunkspec - 1;
1996               } else {
1997                 last = 1; // the last chunk
1998                 limit = init + remaining - 1;
1999               } // if
2000             } // if
2001             break;
2002           } // if
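          // parm3 stores (bit-for-bit, as a double) a scale factor of roughly
          // 1/(K*nproc) computed at init time; scaling the remaining
          // iterations by it yields this thread's next guided chunk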
2003           limit = init + (UT)(remaining *
2004                               *(double *)&pr->u.p.parm3); // divide by K*nproc
2005           if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
2006                                    (ST)init, (ST)limit)) {
2007             // CAS was successful, chunk obtained
2008             status = 1;
2009             --limit;
2010             break;
2011           } // if
2012         } // while
2013         if (status != 0) {
2014           start = pr->u.p.lb;
2015           incr = pr->u.p.st;
2016           if (p_st != NULL)
2017             *p_st = incr;
2018           *p_lb = start + init * incr;
2019           *p_ub = start + limit * incr;
2020           if (pr->ordered) {
2021             pr->u.p.ordered_lower = init;
2022             pr->u.p.ordered_upper = limit;
2023 #ifdef KMP_DEBUG
2024             {
2025               char *buff;
2026               // create format specifiers before the debug output
2027               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2028                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
2029                                       traits_t<UT>::spec, traits_t<UT>::spec);
2030               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2031                               pr->u.p.ordered_upper));
2032               __kmp_str_free(&buff);
2033             }
2034 #endif
2035           } // if
2036         } else {
2037           *p_lb = 0;
2038           *p_ub = 0;
2039           if (p_st != NULL)
2040             *p_st = 0;
2041         } // if
2042       } // case
2043       break;
2044 
2045       case kmp_sch_guided_simd: {
        // same as the iterative variant, but the current chunk is adjusted to
        // be a multiple of the given chunk
2048         T chunk = pr->u.p.parm1;
2049         KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_simd case\n",
2050                        gtid));
2051         trip = pr->u.p.tc;
2052         // Start atomic part of calculations
2053         while (1) {
2054           ST remaining; // signed, because can be < 0
2055           init = sh->u.s.iteration; // shared value
2056           remaining = trip - init;
2057           if (remaining <= 0) { // AC: need to compare with 0 first
2058             status = 0; // nothing to do, don't try atomic op
2059             break;
2060           }
2061           KMP_DEBUG_ASSERT(init % chunk == 0);
2062           // compare with K*nproc*(chunk+1), K=2 by default
2063           if ((T)remaining < pr->u.p.parm2) {
            // use dynamic-style schedule
            // atomically increment iterations, get old value
2066             init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
2067                                      (ST)chunk);
2068             remaining = trip - init;
2069             if (remaining <= 0) {
2070               status = 0; // all iterations got by other threads
2071             } else {
2072               // got some iterations to work on
2073               status = 1;
2074               if ((T)remaining > chunk) {
2075                 limit = init + chunk - 1;
2076               } else {
2077                 last = 1; // the last chunk
2078                 limit = init + remaining - 1;
2079               } // if
2080             } // if
2081             break;
2082           } // if
2083           // divide by K*nproc
2084           UT span = remaining * (*(double *)&pr->u.p.parm3);
2085           UT rem = span % chunk;
2086           if (rem) // adjust so that span%chunk == 0
2087             span += chunk - rem;
2088           limit = init + span;
2089           if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
2090                                    (ST)init, (ST)limit)) {
2091             // CAS was successful, chunk obtained
2092             status = 1;
2093             --limit;
2094             break;
2095           } // if
2096         } // while
2097         if (status != 0) {
2098           start = pr->u.p.lb;
2099           incr = pr->u.p.st;
2100           if (p_st != NULL)
2101             *p_st = incr;
2102           *p_lb = start + init * incr;
2103           *p_ub = start + limit * incr;
2104           if (pr->ordered) {
2105             pr->u.p.ordered_lower = init;
2106             pr->u.p.ordered_upper = limit;
2107 #ifdef KMP_DEBUG
2108             {
2109               char *buff;
2110               // create format specifiers before the debug output
2111               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2112                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
2113                                       traits_t<UT>::spec, traits_t<UT>::spec);
2114               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2115                               pr->u.p.ordered_upper));
2116               __kmp_str_free(&buff);
2117             }
2118 #endif
2119           } // if
2120         } else {
2121           *p_lb = 0;
2122           *p_ub = 0;
2123           if (p_st != NULL)
2124             *p_st = 0;
2125         } // if
2126       } // case
2127       break;
2128 
2129       case kmp_sch_guided_analytical_chunked: {
2130         T chunkspec = pr->u.p.parm1;
2131         UT chunkIdx;
2132 #if KMP_OS_WINDOWS && KMP_ARCH_X86
        /* for storing the original FPCW value for Windows* OS on IA-32
           architecture; restored after the extended-precision computations */
2135         unsigned int oldFpcw;
2136         unsigned int fpcwSet = 0;
2137 #endif
2138         KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked "
2139                        "analytical case\n",
2140                        gtid));
2141 
2142         trip = pr->u.p.tc;
2143 
2144         KMP_DEBUG_ASSERT(th->th.th_team_nproc > 1);
2145         KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)th->th.th_team_nproc <
2146                          trip);
2147 
2148         while (1) { /* this while loop is a safeguard against unexpected zero
2149                        chunk sizes */
2150           chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
2151           if (chunkIdx >= (UT)pr->u.p.parm2) {
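            // parm2 (computed at init time) is the cross-over chunk index
            // past which guided chunks would shrink below chunkspec, so from
            // here on hand out plain dynamic-style chunks of size chunkspec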
2152             --trip;
2153             /* use dynamic-style scheduling */
2154             init = chunkIdx * chunkspec + pr->u.p.count;
2155             /* need to verify init > 0 in case of overflow in the above
2156              * calculation */
2157             if ((status = (init > 0 && init <= trip)) != 0) {
2158               limit = init + chunkspec - 1;
2159 
2160               if ((last = (limit >= trip)) != 0)
2161                 limit = trip;
2162             }
2163             break;
2164           } else {
2165 /* use exponential-style scheduling */
/* The following check works around the lack of long double precision on
   Windows* OS, which could otherwise cause init != 0 for chunkIdx == 0. */
2170 #if KMP_OS_WINDOWS && KMP_ARCH_X86
2171             /* If we haven't already done so, save original FPCW and set
2172                precision to 64-bit, as Windows* OS on IA-32 architecture
2173                defaults to 53-bit */
2174             if (!fpcwSet) {
2175               oldFpcw = _control87(0, 0);
2176               _control87(_PC_64, _MCW_PC);
2177               fpcwSet = 0x30000;
2178             }
2179 #endif
2180             if (chunkIdx) {
2181               init = __kmp_dispatch_guided_remaining<T>(
2182                   trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
2183               KMP_DEBUG_ASSERT(init);
2184               init = trip - init;
2185             } else
2186               init = 0;
2187             limit = trip - __kmp_dispatch_guided_remaining<T>(
2188                                trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
2189             KMP_ASSERT(init <= limit);
2190             if (init < limit) {
2191               KMP_DEBUG_ASSERT(limit <= trip);
2192               --limit;
2193               status = 1;
2194               break;
2195             } // if
2196           } // if
2197         } // while (1)
2198 #if KMP_OS_WINDOWS && KMP_ARCH_X86
2199         /* restore FPCW if necessary
2200            AC: check fpcwSet flag first because oldFpcw can be uninitialized
2201            here */
2202         if (fpcwSet && (oldFpcw & fpcwSet))
2203           _control87(oldFpcw, _MCW_PC);
2204 #endif
2205         if (status != 0) {
2206           start = pr->u.p.lb;
2207           incr = pr->u.p.st;
2208           if (p_st != NULL)
2209             *p_st = incr;
2210           *p_lb = start + init * incr;
2211           *p_ub = start + limit * incr;
2212           if (pr->ordered) {
2213             pr->u.p.ordered_lower = init;
2214             pr->u.p.ordered_upper = limit;
2215 #ifdef KMP_DEBUG
2216             {
2217               char *buff;
2218               // create format specifiers before the debug output
2219               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2220                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
2221                                       traits_t<UT>::spec, traits_t<UT>::spec);
2222               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2223                               pr->u.p.ordered_upper));
2224               __kmp_str_free(&buff);
2225             }
2226 #endif
2227           }
2228         } else {
2229           *p_lb = 0;
2230           *p_ub = 0;
2231           if (p_st != NULL)
2232             *p_st = 0;
2233         }
2234       } // case
2235       break;
2236 
2237       case kmp_sch_trapezoidal: {
2238         UT index;
2239         T parm2 = pr->u.p.parm2;
2240         T parm3 = pr->u.p.parm3;
2241         T parm4 = pr->u.p.parm4;
2242         KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2243                        gtid));
2244 
2245         index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
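        // chunk sizes form a decreasing arithmetic sequence: the first chunk
        // has parm2 iterations and each later chunk is parm4 smaller, so init
        // (the start of chunk #index) is the prefix sum
        //   index * (2*parm2 - (index-1)*parm4) / 2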
2246 
2247         init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
2248         trip = pr->u.p.tc - 1;
2249 
2250         if ((status = ((T)index < parm3 && init <= trip)) == 0) {
2251           *p_lb = 0;
2252           *p_ub = 0;
2253           if (p_st != NULL)
2254             *p_st = 0;
2255         } else {
2256           start = pr->u.p.lb;
2257           limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
2258           incr = pr->u.p.st;
2259 
2260           if ((last = (limit >= trip)) != 0)
2261             limit = trip;
2262 
2263           if (p_st != NULL)
2264             *p_st = incr;
2265 
2266           if (incr == 1) {
2267             *p_lb = start + init;
2268             *p_ub = start + limit;
2269           } else {
2270             *p_lb = start + init * incr;
2271             *p_ub = start + limit * incr;
2272           }
2273 
2274           if (pr->ordered) {
2275             pr->u.p.ordered_lower = init;
2276             pr->u.p.ordered_upper = limit;
2277 #ifdef KMP_DEBUG
2278             {
2279               char *buff;
2280               // create format specifiers before the debug output
2281               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2282                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
2283                                       traits_t<UT>::spec, traits_t<UT>::spec);
2284               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2285                               pr->u.p.ordered_upper));
2286               __kmp_str_free(&buff);
2287             }
2288 #endif
2289           } // if
2290         } // if
2291       } // case
2292       break;
2293       default: {
        status = 0; // to avoid complaints about uninitialized variable use
2295         __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
2296                     KMP_HNT(GetNewerLibrary), // Hint
2297                     __kmp_msg_null // Variadic argument list terminator
2298                     );
2299       } break;
2300       } // switch
    } // if (tc == 0)
2302 
2303     if (status == 0) {
2304       UT num_done;
2305 
2306       num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2307 #ifdef KMP_DEBUG
2308       {
2309         char *buff;
2310         // create format specifiers before the debug output
2311         buff = __kmp_str_format(
2312             "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2313             traits_t<UT>::spec);
2314         KD_TRACE(100, (buff, gtid, sh->u.s.num_done));
2315         __kmp_str_free(&buff);
2316       }
2317 #endif
2318 
2319       if ((ST)num_done == th->th.th_team_nproc - 1) {
2320 #if (KMP_STATIC_STEAL_ENABLED)
2321         if (pr->schedule == kmp_sch_static_steal &&
2322             traits_t<T>::type_size > 4) {
2323           int i;
2324           kmp_info_t **other_threads = team->t.t_threads;
2325           // loop complete, safe to destroy locks used for stealing
2326           for (i = 0; i < th->th.th_team_nproc; ++i) {
2327             kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2328             KMP_ASSERT(lck != NULL);
2329             __kmp_destroy_lock(lck);
2330             __kmp_free(lck);
2331             other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2332           }
2333         }
2334 #endif
2335         /* NOTE: release this buffer to be reused */
2336 
2337         KMP_MB(); /* Flush all pending memory write invalidates.  */
2338 
2339         sh->u.s.num_done = 0;
2340         sh->u.s.iteration = 0;
2341 
2342         /* TODO replace with general release procedure? */
2343         if (pr->ordered) {
2344           sh->u.s.ordered_iteration = 0;
2345         }
2346 
2347         KMP_MB(); /* Flush all pending memory write invalidates.  */
2348 
2349         sh->buffer_index += __kmp_dispatch_num_buffers;
2350         KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2351                        gtid, sh->buffer_index));
2352 
2353         KMP_MB(); /* Flush all pending memory write invalidates.  */
2354 
2355       } // if
2356       if (__kmp_env_consistency_check) {
2357         if (pr->pushed_ws != ct_none) {
2358           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2359         }
2360       }
2361 
2362       th->th.th_dispatch->th_deo_fcn = NULL;
2363       th->th.th_dispatch->th_dxo_fcn = NULL;
2364       th->th.th_dispatch->th_dispatch_sh_current = NULL;
2365       th->th.th_dispatch->th_dispatch_pr_current = NULL;
2366     } // if (status == 0)
2367 #if KMP_OS_WINDOWS
2368     else if (last) {
2369       pr->u.p.last_upper = pr->u.p.ub;
2370     }
2371 #endif /* KMP_OS_WINDOWS */
2372     if (p_last != NULL && status != 0)
2373       *p_last = last;
2374   } // if
2375 
2376 #ifdef KMP_DEBUG
2377   {
2378     char *buff;
2379     // create format specifiers before the debug output
2380     buff = __kmp_str_format(
2381         "__kmp_dispatch_next: T#%%d normal case: "
2382         "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p  returning:%%d\n",
2383         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2384     KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status));
2385     __kmp_str_free(&buff);
2386   }
2387 #endif
2388 #if INCLUDE_SSC_MARKS
2389   SSC_MARK_DISPATCH_NEXT();
2390 #endif
2391   OMPT_LOOP_END;
2392   return status;
2393 }
2394 
2395 template <typename T>
2396 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2397                                   kmp_int32 *plastiter, T *plower, T *pupper,
2398                                   typename traits_t<T>::signed_t incr) {
2399   typedef typename traits_t<T>::unsigned_t UT;
2400   typedef typename traits_t<T>::signed_t ST;
2401   kmp_uint32 team_id;
2402   kmp_uint32 nteams;
2403   UT trip_count;
2404   kmp_team_t *team;
2405   kmp_info_t *th;
2406 
2407   KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2408   KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2409 #ifdef KMP_DEBUG
2410   {
2411     char *buff;
2412     // create format specifiers before the debug output
2413     buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2414                             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2415                             traits_t<T>::spec, traits_t<T>::spec,
2416                             traits_t<ST>::spec, traits_t<T>::spec);
2417     KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2418     __kmp_str_free(&buff);
2419   }
2420 #endif
2421 
2422   if (__kmp_env_consistency_check) {
2423     if (incr == 0) {
2424       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2425                             loc);
2426     }
2427     if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2428       // The loop is illegal.
      // Some zero-trip loops are maintained by the compiler, e.g.:
2430       //   for(i=10;i<0;++i) // lower >= upper - run-time check
2431       //   for(i=0;i>10;--i) // lower <= upper - run-time check
2432       //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2433       //   for(i=10;i<0;--i) // incr < 0       - compile-time check
      // The compiler does not check the following illegal loops:
2435       //   for(i=0;i<10;i+=incr) // where incr<0
2436       //   for(i=10;i>0;i-=incr) // where incr<0
2437       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2438     }
2439   }
2440   th = __kmp_threads[gtid];
2441   team = th->th.th_team;
2442 #if OMP_40_ENABLED
2443   KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2444   nteams = th->th.th_teams_size.nteams;
2445 #endif
2446   team_id = team->t.t_master_tid;
2447   KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2448 
2449   // compute global trip count
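  // (e.g. lower=0, upper=9, incr=2 gives (9 - 0) / 2 + 1 = 5 iterations)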
2450   if (incr == 1) {
2451     trip_count = *pupper - *plower + 1;
2452   } else if (incr == -1) {
2453     trip_count = *plower - *pupper + 1;
2454   } else if (incr > 0) {
2455     // upper-lower can exceed the limit of signed type
2456     trip_count = (UT)(*pupper - *plower) / incr + 1;
2457   } else {
2458     trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2459   }
2460 
2461   if (trip_count <= nteams) {
2462     KMP_DEBUG_ASSERT(
2463         __kmp_static == kmp_sch_static_greedy ||
2464         __kmp_static ==
2465             kmp_sch_static_balanced); // Unknown static scheduling type.
2466     // only some teams get single iteration, others get nothing
2467     if (team_id < trip_count) {
2468       *pupper = *plower = *plower + team_id * incr;
2469     } else {
2470       *plower = *pupper + incr; // zero-trip loop
2471     }
2472     if (plastiter != NULL)
2473       *plastiter = (team_id == trip_count - 1);
2474   } else {
2475     if (__kmp_static == kmp_sch_static_balanced) {
2476       UT chunk = trip_count / nteams;
2477       UT extras = trip_count % nteams;
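      // e.g. trip_count=10, nteams=3, incr=1: chunk=3, extras=1, so the teams
      // receive 4, 3 and 3 iterations respectively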
2478       *plower +=
2479           incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2480       *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2481       if (plastiter != NULL)
2482         *plastiter = (team_id == nteams - 1);
2483     } else {
2484       T chunk_inc_count =
2485           (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2486       T upper = *pupper;
2487       KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2488       // Unknown static scheduling type.
2489       *plower += team_id * chunk_inc_count;
2490       *pupper = *plower + chunk_inc_count - incr;
2491       // Check/correct bounds if needed
2492       if (incr > 0) {
2493         if (*pupper < *plower)
2494           *pupper = traits_t<T>::max_value;
2495         if (plastiter != NULL)
2496           *plastiter = *plower <= upper && *pupper > upper - incr;
2497         if (*pupper > upper)
2498           *pupper = upper; // tracker C73258
2499       } else {
2500         if (*pupper > *plower)
2501           *pupper = traits_t<T>::min_value;
2502         if (plastiter != NULL)
2503           *plastiter = *plower >= upper && *pupper < upper - incr;
2504         if (*pupper < upper)
2505           *pupper = upper; // tracker C73258
2506       }
2507     }
2508   }
2509 }
2510 
2511 //-----------------------------------------------------------------------------
2512 // Dispatch routines
2513 //    Transfer call to template< type T >
2514 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2515 //                         T lb, T ub, ST st, ST chunk )
2516 extern "C" {
2517 
2518 /*!
2519 @ingroup WORK_SHARING
2520 @{
2521 @param loc Source location
2522 @param gtid Global thread id
2523 @param schedule Schedule type
2524 @param lb  Lower bound
2525 @param ub  Upper bound
2526 @param st  Step (or increment if you prefer)
2527 @param chunk The chunk size to block with
2528 
2529 This function prepares the runtime to start a dynamically scheduled for loop,
2530 saving the loop arguments.
2531 These functions are all identical apart from the types of the arguments.
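
As an illustrative sketch (not tied to any particular compiler), a loop such as
@code
    #pragma omp for schedule(dynamic, 4)
    for (kmp_int32 i = 0; i < n; ++i)
      body(i);
@endcode
might be lowered to
@code
    // lb = 0, ub = n - 1 (inclusive), st = 1, chunk = 4
    __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, 0, n - 1, 1, 4);
@endcode
followed by the chunk-retrieval loop shown at @ref __kmpc_dispatch_next_4.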
2532 */
2533 
2534 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2535                             enum sched_type schedule, kmp_int32 lb,
2536                             kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2537   KMP_DEBUG_ASSERT(__kmp_init_serial);
2538 #if OMPT_SUPPORT && OMPT_OPTIONAL
2539   OMPT_STORE_RETURN_ADDRESS(gtid);
2540 #endif
2541   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2542 }
2543 /*!
2544 See @ref __kmpc_dispatch_init_4
2545 */
2546 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2547                              enum sched_type schedule, kmp_uint32 lb,
2548                              kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2549   KMP_DEBUG_ASSERT(__kmp_init_serial);
2550 #if OMPT_SUPPORT && OMPT_OPTIONAL
2551   OMPT_STORE_RETURN_ADDRESS(gtid);
2552 #endif
2553   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2554 }
2555 
2556 /*!
2557 See @ref __kmpc_dispatch_init_4
2558 */
2559 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2560                             enum sched_type schedule, kmp_int64 lb,
2561                             kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2562   KMP_DEBUG_ASSERT(__kmp_init_serial);
2563 #if OMPT_SUPPORT && OMPT_OPTIONAL
2564   OMPT_STORE_RETURN_ADDRESS(gtid);
2565 #endif
2566   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2567 }
2568 
2569 /*!
2570 See @ref __kmpc_dispatch_init_4
2571 */
2572 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2573                              enum sched_type schedule, kmp_uint64 lb,
2574                              kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2575   KMP_DEBUG_ASSERT(__kmp_init_serial);
2576 #if OMPT_SUPPORT && OMPT_OPTIONAL
2577   OMPT_STORE_RETURN_ADDRESS(gtid);
2578 #endif
2579   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2580 }
2581 
2582 /*!
2583 See @ref __kmpc_dispatch_init_4
2584 
These differ from the __kmpc_dispatch_init set of functions in that they are
called for the composite distribute parallel for construct. Thus, before
dispatching the regular iterations, we need to calculate the per-team
iteration space.
2588 
2589 These functions are all identical apart from the types of the arguments.
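
For example (illustrative only), for a composite construct such as
@code
    #pragma omp teams distribute parallel for schedule(dynamic, 4)
@endcode
each team first narrows [lb, ub] to its own slice via __kmp_dist_get_bounds
and then performs the usual dynamic dispatch within that slice.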
2590 */
2591 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2592                                  enum sched_type schedule, kmp_int32 *p_last,
2593                                  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2594                                  kmp_int32 chunk) {
2595   KMP_DEBUG_ASSERT(__kmp_init_serial);
2596 #if OMPT_SUPPORT && OMPT_OPTIONAL
2597   OMPT_STORE_RETURN_ADDRESS(gtid);
2598 #endif
2599   __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2600   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2601 }
2602 
2603 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2604                                   enum sched_type schedule, kmp_int32 *p_last,
2605                                   kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2606                                   kmp_int32 chunk) {
2607   KMP_DEBUG_ASSERT(__kmp_init_serial);
2608 #if OMPT_SUPPORT && OMPT_OPTIONAL
2609   OMPT_STORE_RETURN_ADDRESS(gtid);
2610 #endif
2611   __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2612   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2613 }
2614 
2615 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2616                                  enum sched_type schedule, kmp_int32 *p_last,
2617                                  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2618                                  kmp_int64 chunk) {
2619   KMP_DEBUG_ASSERT(__kmp_init_serial);
2620 #if OMPT_SUPPORT && OMPT_OPTIONAL
2621   OMPT_STORE_RETURN_ADDRESS(gtid);
2622 #endif
2623   __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2624   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2625 }
2626 
2627 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2628                                   enum sched_type schedule, kmp_int32 *p_last,
2629                                   kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2630                                   kmp_int64 chunk) {
2631   KMP_DEBUG_ASSERT(__kmp_init_serial);
2632 #if OMPT_SUPPORT && OMPT_OPTIONAL
2633   OMPT_STORE_RETURN_ADDRESS(gtid);
2634 #endif
2635   __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2636   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2637 }
2638 
2639 /*!
2640 @param loc Source code location
2641 @param gtid Global thread id
2642 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2643 otherwise
2644 @param p_lb   Pointer to the lower bound for the next chunk of work
2645 @param p_ub   Pointer to the upper bound for the next chunk of work
2646 @param p_st   Pointer to the stride for the next chunk of work
2647 @return one if there is work to be done, zero otherwise
2648 
2649 Get the next dynamically allocated chunk of work for this thread.
2650 If there is no more work, then the lb,ub and stride need not be modified.
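
A minimal sketch (illustrative only) of the loop a compiler might emit around
this call, assuming the init call shown at @ref __kmpc_dispatch_init_4:
@code
    kmp_int32 lb, ub, st, last;
    while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
      for (kmp_int32 i = lb; i <= ub; i += st)
        body(i);
    }
@endcode
Once zero is returned the runtime has finished with the dispatch buffer for
this loop.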
2651 */
2652 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2653                            kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2654 #if OMPT_SUPPORT && OMPT_OPTIONAL
2655   OMPT_STORE_RETURN_ADDRESS(gtid);
2656 #endif
2657   return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2658 #if OMPT_SUPPORT && OMPT_OPTIONAL
2659                                         ,
2660                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2661 #endif
2662                                             );
2663 }
2664 
2665 /*!
2666 See @ref __kmpc_dispatch_next_4
2667 */
2668 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2669                             kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2670                             kmp_int32 *p_st) {
2671 #if OMPT_SUPPORT && OMPT_OPTIONAL
2672   OMPT_STORE_RETURN_ADDRESS(gtid);
2673 #endif
2674   return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2675 #if OMPT_SUPPORT && OMPT_OPTIONAL
2676                                          ,
2677                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2678 #endif
2679                                              );
2680 }
2681 
2682 /*!
2683 See @ref __kmpc_dispatch_next_4
2684 */
2685 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2686                            kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2687 #if OMPT_SUPPORT && OMPT_OPTIONAL
2688   OMPT_STORE_RETURN_ADDRESS(gtid);
2689 #endif
2690   return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2691 #if OMPT_SUPPORT && OMPT_OPTIONAL
2692                                         ,
2693                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2694 #endif
2695                                             );
2696 }
2697 
2698 /*!
2699 See @ref __kmpc_dispatch_next_4
2700 */
2701 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2702                             kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2703                             kmp_int64 *p_st) {
2704 #if OMPT_SUPPORT && OMPT_OPTIONAL
2705   OMPT_STORE_RETURN_ADDRESS(gtid);
2706 #endif
2707   return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2708 #if OMPT_SUPPORT && OMPT_OPTIONAL
2709                                          ,
2710                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2711 #endif
2712                                              );
2713 }
2714 
2715 /*!
2716 @param loc Source code location
2717 @param gtid Global thread id
2718 
2719 Mark the end of a dynamic loop.
2720 */
2721 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2722   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2723 }
2724 
2725 /*!
2726 See @ref __kmpc_dispatch_fini_4
2727 */
2728 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2729   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2730 }
2731 
2732 /*!
2733 See @ref __kmpc_dispatch_fini_4
2734 */
2735 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2736   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2737 }
2738 
2739 /*!
2740 See @ref __kmpc_dispatch_fini_4
2741 */
2742 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2743   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2744 }
2745 /*! @} */
2746 
2747 //-----------------------------------------------------------------------------
2748 // Non-template routines from kmp_dispatch.cpp used in other sources
2749 
2750 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2751   return value == checker;
2752 }
2753 
2754 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2755   return value != checker;
2756 }
2757 
2758 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2759   return value < checker;
2760 }
2761 
2762 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2763   return value >= checker;
2764 }
2765 
2766 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2767   return value <= checker;
2768 }
2769 
2770 kmp_uint32
2771 __kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2772                    kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2773                    void *obj // Higher-level synchronization object, or NULL.
2774                    ) {
2775   // note: we may not belong to a team at this point
2776   volatile kmp_uint32 *spin = spinner;
2777   kmp_uint32 check = checker;
2778   kmp_uint32 spins;
2779   kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2780   kmp_uint32 r;
2781 
2782   KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2783   KMP_INIT_YIELD(spins);
2784   // main wait spin loop
2785   while (!f(r = TCR_4(*spin), check)) {
2786     KMP_FSYNC_SPIN_PREPARE(obj);
2787     /* GEH - remove this since it was accidentally introduced when kmp_wait was
2788        split. It causes problems with infinite recursion because of exit lock */
2789     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2790         __kmp_abort_thread(); */
2791 
2792     /* if we have waited a bit, or are oversubscribed, yield */
2793     /* pause is in the following code */
2794     KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2795     KMP_YIELD_SPIN(spins);
2796   }
2797   KMP_FSYNC_SPIN_ACQUIRED(obj);
2798   return r;
2799 }
2800 
2801 void __kmp_wait_yield_4_ptr(
2802     void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32),
2803     void *obj // Higher-level synchronization object, or NULL.
2804     ) {
2805   // note: we may not belong to a team at this point
2806   void *spin = spinner;
2807   kmp_uint32 check = checker;
2808   kmp_uint32 spins;
2809   kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2810 
2811   KMP_FSYNC_SPIN_INIT(obj, spin);
2812   KMP_INIT_YIELD(spins);
2813   // main wait spin loop
2814   while (!f(spin, check)) {
2815     KMP_FSYNC_SPIN_PREPARE(obj);
2816     /* if we have waited a bit, or are oversubscribed, yield */
2817     /* pause is in the following code */
2818     KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2819     KMP_YIELD_SPIN(spins);
2820   }
2821   KMP_FSYNC_SPIN_ACQUIRED(obj);
2822 }
2823 
2824 } // extern "C"
2825 
2826 #ifdef KMP_GOMP_COMPAT
2827 
2828 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2829                                enum sched_type schedule, kmp_int32 lb,
2830                                kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2831                                int push_ws) {
2832   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2833                                  push_ws);
2834 }
2835 
2836 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2837                                 enum sched_type schedule, kmp_uint32 lb,
2838                                 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2839                                 int push_ws) {
2840   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2841                                   push_ws);
2842 }
2843 
2844 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2845                                enum sched_type schedule, kmp_int64 lb,
2846                                kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2847                                int push_ws) {
2848   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2849                                  push_ws);
2850 }
2851 
2852 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2853                                 enum sched_type schedule, kmp_uint64 lb,
2854                                 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2855                                 int push_ws) {
2856   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2857                                   push_ws);
2858 }
2859 
2860 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2861   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2862 }
2863 
2864 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2865   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2866 }
2867 
2868 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2869   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2870 }
2871 
2872 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2873   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2874 }
2875 
2876 #endif /* KMP_GOMP_COMPAT */
2877 
2878 /* ------------------------------------------------------------------------ */
2879