1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 //                     The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 /* Dynamic scheduling initialization and dispatch.
15  *
 * NOTE: __kmp_nth is a constant inside any dispatch loop, but it may
 *       change between parallel regions.  __kmp_max_nth is the largest
 *       value __kmp_nth may take, 1 is the smallest.
19  */
20 
21 // Need to raise Win version from XP to Vista here for support of
22 // InterlockedExchange64
23 #if defined(_WIN32_WINNT) && defined(_M_IX86)
24 #undef _WIN32_WINNT
25 #define _WIN32_WINNT 0x0502
26 #endif
27 
28 #include "kmp.h"
29 #include "kmp_error.h"
30 #include "kmp_i18n.h"
31 #include "kmp_itt.h"
32 #include "kmp_stats.h"
33 #include "kmp_str.h"
34 #if KMP_OS_WINDOWS && KMP_ARCH_X86
35 #include <float.h>
36 #endif
37 
38 #if OMPT_SUPPORT
39 #include "ompt-specific.h"
40 #endif
41 
42 /* ------------------------------------------------------------------------ */
43 
44 #if KMP_STATIC_STEAL_ENABLED
45 
46 // replaces dispatch_private_info{32,64} structures and
47 // dispatch_private_info{32,64}_t types
48 template <typename T> struct dispatch_private_infoXX_template {
49   typedef typename traits_t<T>::unsigned_t UT;
50   typedef typename traits_t<T>::signed_t ST;
51   UT count; // unsigned
52   T ub;
53   /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
54   T lb;
55   ST st; // signed
56   UT tc; // unsigned
57   T static_steal_counter; // for static_steal only; maybe better to put after ub
58 
59   /* parm[1-4] are used in different ways by different scheduling algorithms */
60 
61   // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
62   //    a) parm3 is properly aligned and
63   //    b) all parm1-4 are in the same cache line.
  // Because parm1-4 are used together, performance seems to be better
  // if they are in the same cache line (not measured, though).
66 
67   struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
68     T parm1;
69     T parm2;
70     T parm3;
71     T parm4;
72   };
73 
74   UT ordered_lower; // unsigned
75   UT ordered_upper; // unsigned
76 #if KMP_OS_WINDOWS
77   T last_upper;
78 #endif /* KMP_OS_WINDOWS */
79 };
80 
81 #else /* KMP_STATIC_STEAL_ENABLED */
82 
83 // replaces dispatch_private_info{32,64} structures and
84 // dispatch_private_info{32,64}_t types
85 template <typename T> struct dispatch_private_infoXX_template {
86   typedef typename traits_t<T>::unsigned_t UT;
87   typedef typename traits_t<T>::signed_t ST;
88   T lb;
89   T ub;
90   ST st; // signed
91   UT tc; // unsigned
92 
93   T parm1;
94   T parm2;
95   T parm3;
96   T parm4;
97 
98   UT count; // unsigned
99 
100   UT ordered_lower; // unsigned
101   UT ordered_upper; // unsigned
102 #if KMP_OS_WINDOWS
103   T last_upper;
104 #endif /* KMP_OS_WINDOWS */
105 };
106 
107 #endif /* KMP_STATIC_STEAL_ENABLED */
108 
109 // replaces dispatch_private_info structure and dispatch_private_info_t type
110 template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
111   // duplicate alignment here, otherwise size of structure is not correct in our
112   // compiler
113   union KMP_ALIGN_CACHE private_info_tmpl {
114     dispatch_private_infoXX_template<T> p;
115     dispatch_private_info64_t p64;
116   } u;
117   enum sched_type schedule; /* scheduling algorithm */
118   kmp_uint32 ordered; /* ordered clause specified */
119   kmp_uint32 ordered_bumped;
120   // To retain the structure size after making ordered_iteration scalar
121   kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
122   dispatch_private_info *next; /* stack of buffers for nest of serial regions */
123   kmp_uint32 nomerge; /* don't merge iters if serialized */
124   kmp_uint32 type_size;
125   enum cons_type pushed_ws;
126 };
127 
128 // replaces dispatch_shared_info{32,64} structures and
129 // dispatch_shared_info{32,64}_t types
130 template <typename UT> struct dispatch_shared_infoXX_template {
131   /* chunk index under dynamic, number of idle threads under static-steal;
132      iteration index otherwise */
133   volatile UT iteration;
134   volatile UT num_done;
135   volatile UT ordered_iteration;
  // to retain the structure size after making ordered_iteration scalar
137   UT ordered_dummy[KMP_MAX_ORDERED - 3];
138 };
139 
140 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
141 template <typename UT> struct dispatch_shared_info_template {
  // we need a union here to keep the structure size
143   union shared_info_tmpl {
144     dispatch_shared_infoXX_template<UT> s;
145     dispatch_shared_info64_t s64;
146   } u;
147   volatile kmp_uint32 buffer_index;
148 #if OMP_45_ENABLED
149   volatile kmp_int32 doacross_buf_idx; // teamwise index
150   kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
151   kmp_int32 doacross_num_done; // count finished threads
152 #endif
153 #if KMP_USE_HWLOC
  // When linking with libhwloc, the ORDERED EPCC test slows down on big
  // machines (> 48 cores). Performance analysis showed that cache thrashing
  // was occurring and this padding helps alleviate the problem.
157   char padding[64];
158 #endif
159 };
160 
161 /* ------------------------------------------------------------------------ */
162 
163 #undef USE_TEST_LOCKS
164 
165 // test_then_add template (general template should NOT be used)
166 template <typename T> static __forceinline T test_then_add(volatile T *p, T d);
167 
168 template <>
169 __forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
170                                                  kmp_int32 d) {
171   kmp_int32 r;
172   r = KMP_TEST_THEN_ADD32(p, d);
173   return r;
174 }
175 
176 template <>
177 __forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
178                                                  kmp_int64 d) {
179   kmp_int64 r;
180   r = KMP_TEST_THEN_ADD64(p, d);
181   return r;
182 }
183 
184 // test_then_inc_acq template (general template should NOT be used)
185 template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);
186 
187 template <>
188 __forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
189   kmp_int32 r;
190   r = KMP_TEST_THEN_INC_ACQ32(p);
191   return r;
192 }
193 
194 template <>
195 __forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
196   kmp_int64 r;
197   r = KMP_TEST_THEN_INC_ACQ64(p);
198   return r;
199 }
200 
201 // test_then_inc template (general template should NOT be used)
202 template <typename T> static __forceinline T test_then_inc(volatile T *p);
203 
204 template <>
205 __forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
206   kmp_int32 r;
207   r = KMP_TEST_THEN_INC32(p);
208   return r;
209 }
210 
211 template <>
212 __forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
213   kmp_int64 r;
214   r = KMP_TEST_THEN_INC64(p);
215   return r;
216 }
217 
218 // compare_and_swap template (general template should NOT be used)
219 template <typename T>
220 static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);
221 
222 template <>
223 __forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
224                                                     kmp_int32 c, kmp_int32 s) {
225   return KMP_COMPARE_AND_STORE_REL32(p, c, s);
226 }
227 
228 template <>
229 __forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
230                                                     kmp_int64 c, kmp_int64 s) {
231   return KMP_COMPARE_AND_STORE_REL64(p, c, s);
232 }
233 
234 /* Spin wait loop that first does pause, then yield.
    Waits until the predicate returns non-zero when called with *spinner and
    checker.
    Does NOT put threads to sleep.
    Arguments:
        UT is unsigned 4- or 8-byte type
        spinner - memory location whose value is checked
        checker - value against which *spinner is compared (>, <, ==, etc.)
        pred - predicate function that performs the binary comparison
#if USE_ITT_BUILD
        obj - higher-level synchronization object to report to ittnotify.
        It is used to report locks consistently. For example, if a lock is
        acquired immediately, its address is reported to ittnotify via
        KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
        immediately and the lock routine calls KMP_WAIT_YIELD(), the latter
        should report the same address, not the address of the low-level
        spinner.
249 #endif // USE_ITT_BUILD
250     TODO: make inline function (move to header file for icl)
251 */
252 template <typename UT>
253 static UT __kmp_wait_yield(volatile UT *spinner, UT checker,
254                            kmp_uint32 (*pred)(UT, UT)
255                                USE_ITT_BUILD_ARG(void *obj)) {
256   // note: we may not belong to a team at this point
257   volatile UT *spin = spinner;
258   UT check = checker;
259   kmp_uint32 spins;
260   kmp_uint32 (*f)(UT, UT) = pred;
261   UT r;
262 
263   KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
264   KMP_INIT_YIELD(spins);
265   // main wait spin loop
266   while (!f(r = *spin, check)) {
267     KMP_FSYNC_SPIN_PREPARE(obj);
268     /* GEH - remove this since it was accidentally introduced when kmp_wait was
269        split. It causes problems with infinite recursion because of exit lock */
270     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
271         __kmp_abort_thread(); */
272 
    // If we are oversubscribed, or have waited a bit (and
    // KMP_LIBRARY=throughput), then yield. Pause is in the following code.
275     KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
276     KMP_YIELD_SPIN(spins);
277   }
278   KMP_FSYNC_SPIN_ACQUIRED(obj);
279   return r;
280 }
281 
282 template <typename UT> static kmp_uint32 __kmp_eq(UT value, UT checker) {
283   return value == checker;
284 }
285 
286 template <typename UT> static kmp_uint32 __kmp_neq(UT value, UT checker) {
287   return value != checker;
288 }
289 
290 template <typename UT> static kmp_uint32 __kmp_lt(UT value, UT checker) {
291   return value < checker;
292 }
293 
294 template <typename UT> static kmp_uint32 __kmp_ge(UT value, UT checker) {
295   return value >= checker;
296 }
297 
298 template <typename UT> static kmp_uint32 __kmp_le(UT value, UT checker) {
299   return value <= checker;
300 }
301 
302 /* ------------------------------------------------------------------------ */
303 
304 static void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref,
305                                      ident_t *loc_ref) {
306   kmp_info_t *th;
307 
308   KMP_DEBUG_ASSERT(gtid_ref);
309 
310   if (__kmp_env_consistency_check) {
311     th = __kmp_threads[*gtid_ref];
312     if (th->th.th_root->r.r_active &&
313         (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
314 #if KMP_USE_DYNAMIC_LOCK
315       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
316 #else
317       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
318 #endif
319     }
320   }
321 }
322 
323 template <typename UT>
324 static void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
325   typedef typename traits_t<UT>::signed_t ST;
326   dispatch_private_info_template<UT> *pr;
327 
328   int gtid = *gtid_ref;
329   //    int  cid = *cid_ref;
330   kmp_info_t *th = __kmp_threads[gtid];
331   KMP_DEBUG_ASSERT(th->th.th_dispatch);
332 
333   KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
334   if (__kmp_env_consistency_check) {
335     pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
336         th->th.th_dispatch->th_dispatch_pr_current);
337     if (pr->pushed_ws != ct_none) {
338 #if KMP_USE_DYNAMIC_LOCK
339       __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
340 #else
341       __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
342 #endif
343     }
344   }
345 
346   if (!th->th.th_team->t.t_serialized) {
347     dispatch_shared_info_template<UT> *sh =
348         reinterpret_cast<dispatch_shared_info_template<UT> *>(
349             th->th.th_dispatch->th_dispatch_sh_current);
350     UT lower;
351 
352     if (!__kmp_env_consistency_check) {
353       pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
354           th->th.th_dispatch->th_dispatch_pr_current);
355     }
356     lower = pr->u.p.ordered_lower;
357 
358 #if !defined(KMP_GOMP_COMPAT)
359     if (__kmp_env_consistency_check) {
360       if (pr->ordered_bumped) {
361         struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
362         __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
363                                ct_ordered_in_pdo, loc_ref,
364                                &p->stack_data[p->w_top]);
365       }
366     }
367 #endif /* !defined(KMP_GOMP_COMPAT) */
368 
369     KMP_MB();
370 #ifdef KMP_DEBUG
371     {
372       char *buff;
373       // create format specifiers before the debug output
374       buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
375                               "ordered_iter:%%%s lower:%%%s\n",
376                               traits_t<UT>::spec, traits_t<UT>::spec);
377       KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
378       __kmp_str_free(&buff);
379     }
380 #endif
381 
382     __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
383                          __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
384     KMP_MB(); /* is this necessary? */
385 #ifdef KMP_DEBUG
386     {
387       char *buff;
388       // create format specifiers before the debug output
389       buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
390                               "ordered_iter:%%%s lower:%%%s\n",
391                               traits_t<UT>::spec, traits_t<UT>::spec);
392       KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
393       __kmp_str_free(&buff);
394     }
395 #endif
396   }
397   KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
398 }
399 
400 static void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref,
401                                      ident_t *loc_ref) {
402   kmp_info_t *th;
403 
404   if (__kmp_env_consistency_check) {
405     th = __kmp_threads[*gtid_ref];
406     if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
407       __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
408     }
409   }
410 }
411 
412 template <typename UT>
413 static void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
414   typedef typename traits_t<UT>::signed_t ST;
415   dispatch_private_info_template<UT> *pr;
416 
417   int gtid = *gtid_ref;
418   //    int  cid = *cid_ref;
419   kmp_info_t *th = __kmp_threads[gtid];
420   KMP_DEBUG_ASSERT(th->th.th_dispatch);
421 
422   KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
423   if (__kmp_env_consistency_check) {
424     pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
425         th->th.th_dispatch->th_dispatch_pr_current);
426     if (pr->pushed_ws != ct_none) {
427       __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
428     }
429   }
430 
431   if (!th->th.th_team->t.t_serialized) {
432     dispatch_shared_info_template<UT> *sh =
433         reinterpret_cast<dispatch_shared_info_template<UT> *>(
434             th->th.th_dispatch->th_dispatch_sh_current);
435 
436     if (!__kmp_env_consistency_check) {
437       pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
438           th->th.th_dispatch->th_dispatch_pr_current);
439     }
440 
441     KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
442 #if !defined(KMP_GOMP_COMPAT)
443     if (__kmp_env_consistency_check) {
444       if (pr->ordered_bumped != 0) {
445         struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
446         /* How to test it? - OM */
447         __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
448                                ct_ordered_in_pdo, loc_ref,
449                                &p->stack_data[p->w_top]);
450       }
451     }
452 #endif /* !defined(KMP_GOMP_COMPAT) */
453 
454     KMP_MB(); /* Flush all pending memory write invalidates.  */
455 
456     pr->ordered_bumped += 1;
457 
458     KD_TRACE(1000,
459              ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
460               gtid, pr->ordered_bumped));
461 
462     KMP_MB(); /* Flush all pending memory write invalidates.  */
463 
464     /* TODO use general release procedure? */
465     test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
466 
467     KMP_MB(); /* Flush all pending memory write invalidates.  */
468   }
469   KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
470 }
471 
// Computes and returns x to the power of y, where y must be a non-negative
// integer (uses exponentiation by squaring)
473 template <typename UT>
474 static __forceinline long double __kmp_pow(long double x, UT y) {
475   long double s = 1.0L;
476 
477   KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
478   // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
479   while (y) {
480     if (y & 1)
481       s *= x;
482     x *= x;
483     y >>= 1;
484   }
485   return s;
486 }
487 
/* Computes and returns the number of unassigned iterations after idx chunks
   have been assigned (the total number of unassigned iterations in chunks with
   index greater than or equal to idx). __forceinline seems to be broken here:
   if this function is __forceinline'd, the behavior is wrong (one of the unit
   tests, sch_guided_analytical_basic.cpp, fails). */
493 template <typename T>
494 static __inline typename traits_t<T>::unsigned_t
495 __kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
496                                 typename traits_t<T>::unsigned_t idx) {
497   /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at least for
498      ICL 8.1, long double arithmetic may not really have long double precision,
499      even with /Qlong_double.  Currently, we workaround that in the caller code,
500      by manipulating the FPCW for Windows* OS on IA-32 architecture.  The lack
501      of precision is not expected to be a correctness issue, though. */
502   typedef typename traits_t<T>::unsigned_t UT;
503 
504   long double x = tc * __kmp_pow<UT>(base, idx);
505   UT r = (UT)x;
506   if (x == r)
507     return r;
508   return r + 1;
509 }
510 
// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
//   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// By default n = 2. For example, with n = 3 the chunk distribution will be
// flatter.
// With n = 1 the first chunk is the same as for a static schedule, i.e.
// trip / nproc.
517 static int guided_int_param = 2;
518 static double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
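
// Illustrative values (assuming the defaults above): for nproc = 8 and
// chunk = 7, the guided-iterative init code below sets
//   parm2 = 2 * 8 * (7 + 1) = 128  (switch to dynamic when fewer than 128
//                                   iterations remain)
//   parm3 = 0.5 / 8 = 0.0625       (each grab takes roughly 6.25% of the
//                                   remaining iterations)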
519 
520 // UT - unsigned flavor of T, ST - signed flavor of T,
521 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
522 template <typename T>
523 static void
524 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
525                     T ub, typename traits_t<T>::signed_t st,
526                     typename traits_t<T>::signed_t chunk, int push_ws) {
527   typedef typename traits_t<T>::unsigned_t UT;
528   typedef typename traits_t<T>::signed_t ST;
529   typedef typename traits_t<T>::floating_t DBL;
530 
531   int active;
532   T tc;
533   kmp_info_t *th;
534   kmp_team_t *team;
535   kmp_uint32 my_buffer_index;
536   dispatch_private_info_template<T> *pr;
537   dispatch_shared_info_template<UT> volatile *sh;
538 
539   KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
540                    sizeof(dispatch_private_info));
541   KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
542                    sizeof(dispatch_shared_info));
543 
544   if (!TCR_4(__kmp_init_parallel))
545     __kmp_parallel_initialize();
546 
547 #if INCLUDE_SSC_MARKS
548   SSC_MARK_DISPATCH_INIT();
549 #endif
550 #ifdef KMP_DEBUG
551   {
552     char *buff;
553     // create format specifiers before the debug output
554     buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
555                             "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
556                             traits_t<ST>::spec, traits_t<T>::spec,
557                             traits_t<T>::spec, traits_t<ST>::spec);
558     KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
559     __kmp_str_free(&buff);
560   }
561 #endif
562   /* setup data */
563   th = __kmp_threads[gtid];
564   team = th->th.th_team;
565   active = !team->t.t_serialized;
566   th->th.th_ident = loc;
567 
568 #if USE_ITT_BUILD
569   kmp_uint64 cur_chunk = chunk;
570   int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
571                                     __kmp_forkjoin_frames_mode == 3 &&
572                                     KMP_MASTER_GTID(gtid) &&
573 #if OMP_40_ENABLED
574                                     th->th.th_teams_microtask == NULL &&
575 #endif
576                                     team->t.t_active_level == 1;
577 #endif
578   if (!active) {
579     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
580         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
581   } else {
582     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
583                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
584 
585     my_buffer_index = th->th.th_dispatch->th_disp_index++;
586 
587     /* What happens when number of threads changes, need to resize buffer? */
588     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
589         &th->th.th_dispatch
590              ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
591     sh = reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
592         &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
593   }
594 
595 #if (KMP_STATIC_STEAL_ENABLED)
596   if (SCHEDULE_HAS_NONMONOTONIC(schedule))
597     // AC: we now have only one implementation of stealing, so use it
598     schedule = kmp_sch_static_steal;
599   else
600 #endif
601     schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
602 
603   /* Pick up the nomerge/ordered bits from the scheduling type */
604   if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
605     pr->nomerge = TRUE;
606     schedule =
607         (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
608   } else {
609     pr->nomerge = FALSE;
610   }
611   pr->type_size = traits_t<T>::type_size; // remember the size of variables
612   if (kmp_ord_lower & schedule) {
613     pr->ordered = TRUE;
614     schedule =
615         (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
616   } else {
617     pr->ordered = FALSE;
618   }
619 
620   if (schedule == kmp_sch_static) {
621     schedule = __kmp_static;
622   } else {
623     if (schedule == kmp_sch_runtime) {
624       // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
625       // not specified)
626       schedule = team->t.t_sched.r_sched_type;
627       // Detail the schedule if needed (global controls are differentiated
628       // appropriately)
629       if (schedule == kmp_sch_guided_chunked) {
630         schedule = __kmp_guided;
631       } else if (schedule == kmp_sch_static) {
632         schedule = __kmp_static;
633       }
634       // Use the chunk size specified by OMP_SCHEDULE (or default if not
635       // specified)
636       chunk = team->t.t_sched.chunk;
637 #if USE_ITT_BUILD
638       cur_chunk = chunk;
639 #endif
640 #ifdef KMP_DEBUG
641       {
642         char *buff;
643         // create format specifiers before the debug output
644         buff = __kmp_str_format(
645             "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
646             traits_t<ST>::spec);
647         KD_TRACE(10, (buff, gtid, schedule, chunk));
648         __kmp_str_free(&buff);
649       }
650 #endif
651     } else {
652       if (schedule == kmp_sch_guided_chunked) {
653         schedule = __kmp_guided;
654       }
655       if (chunk <= 0) {
656         chunk = KMP_DEFAULT_CHUNK;
657       }
658     }
659 
660     if (schedule == kmp_sch_auto) {
661       // mapping and differentiation: in the __kmp_do_serial_initialize()
662       schedule = __kmp_auto;
663 #ifdef KMP_DEBUG
664       {
665         char *buff;
666         // create format specifiers before the debug output
667         buff = __kmp_str_format("__kmp_dispatch_init: kmp_sch_auto: T#%%d new: "
668                                 "schedule:%%d chunk:%%%s\n",
669                                 traits_t<ST>::spec);
670         KD_TRACE(10, (buff, gtid, schedule, chunk));
671         __kmp_str_free(&buff);
672       }
673 #endif
674     }
675 
676     /* guided analytical not safe for too many threads */
677     if (schedule == kmp_sch_guided_analytical_chunked &&
678         th->th.th_team_nproc > 1 << 20) {
679       schedule = kmp_sch_guided_iterative_chunked;
680       KMP_WARNING(DispatchManyThreads);
681     }
682     if (schedule == kmp_sch_runtime_simd) {
683       // compiler provides simd_width in the chunk parameter
684       schedule = team->t.t_sched.r_sched_type;
685       // Detail the schedule if needed (global controls are differentiated
686       // appropriately)
687       if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
688           schedule == __kmp_static) {
689         schedule = kmp_sch_static_balanced_chunked;
690       } else {
691         if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
692           schedule = kmp_sch_guided_simd;
693         }
694         chunk = team->t.t_sched.chunk * chunk;
695       }
696 #if USE_ITT_BUILD
697       cur_chunk = chunk;
698 #endif
699 #ifdef KMP_DEBUG
700       {
701         char *buff;
702         // create format specifiers before the debug output
703         buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d"
704                                 " chunk:%%%s\n",
705                                 traits_t<ST>::spec);
706         KD_TRACE(10, (buff, gtid, schedule, chunk));
707         __kmp_str_free(&buff);
708       }
709 #endif
710     }
711     pr->u.p.parm1 = chunk;
712   }
713   KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
714               "unknown scheduling type");
715 
716   pr->u.p.count = 0;
717 
718   if (__kmp_env_consistency_check) {
719     if (st == 0) {
720       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
721                             (pr->ordered ? ct_pdo_ordered : ct_pdo), loc);
722     }
723   }
724   // compute trip count
725   if (st == 1) { // most common case
726     if (ub >= lb) {
727       tc = ub - lb + 1;
728     } else { // ub < lb
729       tc = 0; // zero-trip
730     }
731   } else if (st < 0) {
732     if (lb >= ub) {
733       // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
734       // where the division needs to be unsigned regardless of the result type
735       tc = (UT)(lb - ub) / (-st) + 1;
736     } else { // lb < ub
737       tc = 0; // zero-trip
738     }
739   } else { // st > 0
740     if (ub >= lb) {
741       // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
742       // where the division needs to be unsigned regardless of the result type
743       tc = (UT)(ub - lb) / st + 1;
744     } else { // ub < lb
745       tc = 0; // zero-trip
746     }
747   }
748 
749   // Any half-decent optimizer will remove this test when the blocks are empty
750   // since the macros expand to nothing when statistics are disabled.
751   if (schedule == __kmp_static) {
752     KMP_COUNT_BLOCK(OMP_FOR_static);
753     KMP_COUNT_VALUE(FOR_static_iterations, tc);
754   } else {
755     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
756     KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
757   }
758 
759   pr->u.p.lb = lb;
760   pr->u.p.ub = ub;
761   pr->u.p.st = st;
762   pr->u.p.tc = tc;
763 
764 #if KMP_OS_WINDOWS
765   pr->u.p.last_upper = ub + st;
766 #endif /* KMP_OS_WINDOWS */
767 
768   /* NOTE: only the active parallel region(s) has active ordered sections */
769 
770   if (active) {
771     if (pr->ordered == 0) {
772       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
773       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
774     } else {
775       pr->ordered_bumped = 0;
776 
777       pr->u.p.ordered_lower = 1;
778       pr->u.p.ordered_upper = 0;
779 
780       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
781       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
782     }
783   }
784 
785   if (__kmp_env_consistency_check) {
786     enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
787     if (push_ws) {
788       __kmp_push_workshare(gtid, ws, loc);
789       pr->pushed_ws = ws;
790     } else {
791       __kmp_check_workshare(gtid, ws, loc);
792       pr->pushed_ws = ct_none;
793     }
794   }
795 
796   switch (schedule) {
797 #if (KMP_STATIC_STEAL_ENABLED)
798   case kmp_sch_static_steal: {
799     T nproc = th->th.th_team_nproc;
800     T ntc, init;
801 
802     KD_TRACE(100,
803              ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid));
804 
805     ntc = (tc % chunk ? 1 : 0) + tc / chunk;
806     if (nproc > 1 && ntc >= nproc) {
807       KMP_COUNT_BLOCK(OMP_FOR_static_steal);
808       T id = __kmp_tid_from_gtid(gtid);
809       T small_chunk, extras;
810 
811       small_chunk = ntc / nproc;
812       extras = ntc % nproc;
813 
814       init = id * small_chunk + (id < extras ? id : extras);
815       pr->u.p.count = init;
816       pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
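
      // Illustrative partition (example numbers only): with ntc = 10 chunks
      // and nproc = 4, small_chunk = 2 and extras = 2, so the initial chunk
      // index ranges [count, ub) per thread are
      //   id 0: [0,3)   id 1: [3,6)   id 2: [6,8)   id 3: [8,10)
      // i.e. the first 'extras' threads get one extra chunk each.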
817 
818       pr->u.p.parm2 = lb;
819       // pr->pfields.parm3 = 0; // it's not used in static_steal
820       pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
821       pr->u.p.st = st;
822       if (traits_t<T>::type_size > 4) {
823         // AC: TODO: check if 16-byte CAS available and use it to
824         // improve performance (probably wait for explicit request
825         // before spending time on this).
826         // For now use dynamically allocated per-thread lock,
827         // free memory in __kmp_dispatch_next when status==0.
828         KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
829         th->th.th_dispatch->th_steal_lock =
830             (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
831         __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
832       }
833       break;
834     } else {
835       KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
836                      "kmp_sch_static_balanced\n",
837                      gtid));
838       schedule = kmp_sch_static_balanced;
839       /* too few iterations: fall-through to kmp_sch_static_balanced */
840     } // if
841     /* FALL-THROUGH to static balanced */
842   } // case
843 #endif
844   case kmp_sch_static_balanced: {
845     T nproc = th->th.th_team_nproc;
846     T init, limit;
847 
848     KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
849                    gtid));
850 
851     if (nproc > 1) {
852       T id = __kmp_tid_from_gtid(gtid);
853 
854       if (tc < nproc) {
855         if (id < tc) {
856           init = id;
857           limit = id;
858           pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
859         } else {
860           pr->u.p.count = 1; /* means no more chunks to execute */
861           pr->u.p.parm1 = FALSE;
862           break;
863         }
864       } else {
865         T small_chunk = tc / nproc;
866         T extras = tc % nproc;
867         init = id * small_chunk + (id < extras ? id : extras);
868         limit = init + small_chunk - (id < extras ? 0 : 1);
869         pr->u.p.parm1 = (id == nproc - 1);
870       }
871     } else {
872       if (tc > 0) {
873         init = 0;
874         limit = tc - 1;
875         pr->u.p.parm1 = TRUE;
876       } else { // zero trip count
877         pr->u.p.count = 1; /* means no more chunks to execute */
878         pr->u.p.parm1 = FALSE;
879         break;
880       }
881     }
882 #if USE_ITT_BUILD
883     // Calculate chunk for metadata report
884     if (itt_need_metadata_reporting)
885       cur_chunk = limit - init + 1;
886 #endif
887     if (st == 1) {
888       pr->u.p.lb = lb + init;
889       pr->u.p.ub = lb + limit;
890     } else {
891       // calculated upper bound, "ub" is user-defined upper bound
892       T ub_tmp = lb + limit * st;
893       pr->u.p.lb = lb + init * st;
894       // adjust upper bound to "ub" if needed, so that MS lastprivate will match
895       // it exactly
896       if (st > 0) {
897         pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
898       } else {
899         pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
900       }
901     }
902     if (pr->ordered) {
903       pr->u.p.ordered_lower = init;
904       pr->u.p.ordered_upper = limit;
905     }
906     break;
907   } // case
908   case kmp_sch_static_balanced_chunked: {
909     // similar to balanced, but chunk adjusted to multiple of simd width
910     T nth = th->th.th_team_nproc;
911     KD_TRACE(100, ("__kmp_dispatch_init: T#%d runtime(simd:static)"
912                    " -> falling-through to static_greedy\n",
913                    gtid));
914     schedule = kmp_sch_static_greedy;
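    // Illustrative example (numbers for exposition; chunk is the simd width
    // provided by the compiler and is assumed to be a power of two):
    // tc = 100, nth = 8, chunk = 8 -> ceil(100/8) = 13, rounded up to a
    // multiple of 8 gives parm1 = 16.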
915     if (nth > 1)
916       pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
917     else
918       pr->u.p.parm1 = tc;
919     break;
920   } // case
921   case kmp_sch_guided_iterative_chunked:
922   case kmp_sch_guided_simd: {
923     T nproc = th->th.th_team_nproc;
924     KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked"
925                    " case\n",
926                    gtid));
927 
928     if (nproc > 1) {
929       if ((2L * chunk + 1) * nproc >= tc) {
930         /* chunk size too large, switch to dynamic */
931         schedule = kmp_sch_dynamic_chunked;
932       } else {
933         // when remaining iters become less than parm2 - switch to dynamic
934         pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
935         *(double *)&pr->u.p.parm3 =
936             guided_flt_param / nproc; // may occupy parm3 and parm4
937       }
938     } else {
939       KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
940                      "kmp_sch_static_greedy\n",
941                      gtid));
942       schedule = kmp_sch_static_greedy;
943       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
944       KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",
945                      gtid));
946       pr->u.p.parm1 = tc;
947     } // if
948   } // case
949   break;
950   case kmp_sch_guided_analytical_chunked: {
951     T nproc = th->th.th_team_nproc;
952     KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked"
953                    " case\n",
954                    gtid));
955     if (nproc > 1) {
956       if ((2L * chunk + 1) * nproc >= tc) {
957         /* chunk size too large, switch to dynamic */
958         schedule = kmp_sch_dynamic_chunked;
959       } else {
960         /* commonly used term: (2 nproc - 1)/(2 nproc) */
961         DBL x;
962 
963 #if KMP_OS_WINDOWS && KMP_ARCH_X86
964         /* Linux* OS already has 64-bit computation by default for long double,
965            and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
966            Windows* OS on IA-32 architecture, we need to set precision to 64-bit
967            instead of the default 53-bit. Even though long double doesn't work
968            on Windows* OS on Intel(R) 64, the resulting lack of precision is not
969            expected to impact the correctness of the algorithm, but this has not
970            been mathematically proven. */
971         // save original FPCW and set precision to 64-bit, as
972         // Windows* OS on IA-32 architecture defaults to 53-bit
973         unsigned int oldFpcw = _control87(0, 0);
974         _control87(_PC_64, _MCW_PC); // 0,0x30000
975 #endif
976         /* value used for comparison in solver for cross-over point */
977         long double target = ((long double)chunk * 2 + 1) * nproc / tc;
978 
979         /* crossover point--chunk indexes equal to or greater than
980            this point switch to dynamic-style scheduling */
981         UT cross;
982 
983         /* commonly used term: (2 nproc - 1)/(2 nproc) */
984         x = (long double)1.0 - (long double)0.5 / nproc;
985 
986 #ifdef KMP_DEBUG
987         { // test natural alignment
988           struct _test_a {
989             char a;
990             union {
991               char b;
992               DBL d;
993             };
994           } t;
995           ptrdiff_t natural_alignment =
996               (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
997           //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
998           // long)natural_alignment );
999           KMP_DEBUG_ASSERT(
1000               (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
1001         }
1002 #endif // KMP_DEBUG
1003 
1004         /* save the term in thread private dispatch structure */
1005         *(DBL *)&pr->u.p.parm3 = x;
1006 
1007         /* solve for the crossover point to the nearest integer i for which C_i
1008            <= chunk */
1009         {
1010           UT left, right, mid;
1011           long double p;
1012 
1013           /* estimate initial upper and lower bound */
1014 
1015           /* doesn't matter what value right is as long as it is positive, but
1016              it affects performance of the solver */
1017           right = 229;
1018           p = __kmp_pow<UT>(x, right);
1019           if (p > target) {
1020             do {
1021               p *= p;
1022               right <<= 1;
1023             } while (p > target && right < (1 << 27));
1024             /* lower bound is previous (failed) estimate of upper bound */
1025             left = right >> 1;
1026           } else {
1027             left = 0;
1028           }
1029 
1030           /* bisection root-finding method */
1031           while (left + 1 < right) {
1032             mid = (left + right) / 2;
1033             if (__kmp_pow<UT>(x, mid) > target) {
1034               left = mid;
1035             } else {
1036               right = mid;
1037             }
1038           } // while
1039           cross = right;
1040         }
1041         /* assert sanity of computed crossover point */
1042         KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
1043                    __kmp_pow<UT>(x, cross) <= target);
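
        // Illustrative example (numbers for exposition only): with nproc = 4,
        // chunk = 7 and tc = 1000, x = 1 - 0.5/4 = 0.875 and
        // target = (2*7 + 1) * 4 / 1000 = 0.06; the smallest i with
        // 0.875^i <= 0.06 is 22, so cross = 22 and chunk indexes >= 22 are
        // handed out dynamic-style.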
1044 
1045         /* save the crossover point in thread private dispatch structure */
1046         pr->u.p.parm2 = cross;
1047 
1048 // C75803
1049 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
1050 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
1051 #else
1052 #define GUIDED_ANALYTICAL_WORKAROUND (x)
1053 #endif
1054         /* dynamic-style scheduling offset */
1055         pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
1056                                  tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
1057                         cross * chunk;
1058 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1059         // restore FPCW
1060         _control87(oldFpcw, _MCW_PC);
1061 #endif
1062       } // if
1063     } else {
1064       KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
1065                      "kmp_sch_static_greedy\n",
1066                      gtid));
1067       schedule = kmp_sch_static_greedy;
1068       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1069       pr->u.p.parm1 = tc;
1070     } // if
1071   } // case
1072   break;
1073   case kmp_sch_static_greedy:
1074     KD_TRACE(100,
1075              ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n", gtid));
1076     pr->u.p.parm1 = (th->th.th_team_nproc > 1)
1077                         ? (tc + th->th.th_team_nproc - 1) / th->th.th_team_nproc
1078                         : tc;
1079     break;
1080   case kmp_sch_static_chunked:
1081   case kmp_sch_dynamic_chunked:
1082     if (pr->u.p.parm1 <= 0) {
1083       pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
1084     }
1085     KD_TRACE(100, ("__kmp_dispatch_init: T#%d "
1086                    "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
1087                    gtid));
1088     break;
1089   case kmp_sch_trapezoidal: {
1090     /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1091 
1092     T parm1, parm2, parm3, parm4;
1093     KD_TRACE(100,
1094              ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid));
1095 
1096     parm1 = chunk;
1097 
1098     /* F : size of the first cycle */
1099     parm2 = (tc / (2 * th->th.th_team_nproc));
1100 
1101     if (parm2 < 1) {
1102       parm2 = 1;
1103     }
1104 
1105     /* L : size of the last cycle.  Make sure the last cycle is not larger
1106        than the first cycle. */
1107     if (parm1 < 1) {
1108       parm1 = 1;
1109     } else if (parm1 > parm2) {
1110       parm1 = parm2;
1111     }
1112 
1113     /* N : number of cycles */
1114     parm3 = (parm2 + parm1);
1115     parm3 = (2 * tc + parm3 - 1) / parm3;
1116 
1117     if (parm3 < 2) {
1118       parm3 = 2;
1119     }
1120 
1121     /* sigma : decreasing incr of the trapezoid */
1122     parm4 = (parm3 - 1);
1123     parm4 = (parm2 - parm1) / parm4;
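
    // Illustrative example (numbers for exposition only): tc = 1000,
    // nproc = 4, chunk = 1 -> parm2 = 1000/8 = 125 (first chunk),
    // parm1 = 1 (minimum/last chunk), parm3 = (2*1000 + 126 - 1)/126 = 16
    // chunks, parm4 = (125 - 1)/15 = 8, so chunk sizes decrease
    // 125, 117, 109, ... (decrement of 8 per chunk).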
1124 
1125     // pointless check, because parm4 >= 0 always
1126     // if ( parm4 < 0 ) {
1127     //    parm4 = 0;
1128     //}
1129 
1130     pr->u.p.parm1 = parm1;
1131     pr->u.p.parm2 = parm2;
1132     pr->u.p.parm3 = parm3;
1133     pr->u.p.parm4 = parm4;
1134   } // case
1135   break;
1136 
1137   default: {
1138     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1139                 KMP_HNT(GetNewerLibrary), // Hint
1140                 __kmp_msg_null // Variadic argument list terminator
1141                 );
1142   } break;
1143   } // switch
1144   pr->schedule = schedule;
1145   if (active) {
    /* Wait until the shared buffer's index equals my_buffer_index, i.e. until
     * the buffer is free to use */
1148 
1149     KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
1150                    "sh->buffer_index:%d\n",
1151                    gtid, my_buffer_index, sh->buffer_index));
1152     __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index,
1153                                  __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
1154     // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and
1155     // my_buffer_index are *always* 32-bit integers.
1156     KMP_MB(); /* is this necessary? */
1157     KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
1158                    "sh->buffer_index:%d\n",
1159                    gtid, my_buffer_index, sh->buffer_index));
1160 
1161     th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
1162     th->th.th_dispatch->th_dispatch_sh_current =
1163         CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
1164 #if USE_ITT_BUILD
1165     if (pr->ordered) {
1166       __kmp_itt_ordered_init(gtid);
1167     }
1168     // Report loop metadata
1169     if (itt_need_metadata_reporting) {
1170       // Only report metadata by master of active team at level 1
1171       kmp_uint64 schedtype = 0;
1172       switch (schedule) {
1173       case kmp_sch_static_chunked:
1174       case kmp_sch_static_balanced: // Chunk is calculated in the switch above
1175         break;
1176       case kmp_sch_static_greedy:
1177         cur_chunk = pr->u.p.parm1;
1178         break;
1179       case kmp_sch_dynamic_chunked:
1180         schedtype = 1;
1181         break;
1182       case kmp_sch_guided_iterative_chunked:
1183       case kmp_sch_guided_analytical_chunked:
1184       case kmp_sch_guided_simd:
1185         schedtype = 2;
1186         break;
1187       default:
1188         // Should we put this case under "static"?
1189         // case kmp_sch_static_steal:
1190         schedtype = 3;
1191         break;
1192       }
1193       __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1194     }
1195 #endif /* USE_ITT_BUILD */
1196   }
1197 
1198 #ifdef KMP_DEBUG
1199   {
1200     char *buff;
1201     // create format specifiers before the debug output
1202     buff = __kmp_str_format(
1203         "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
1204         "lb:%%%s ub:%%%s"
1205         " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
1206         " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1207         traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
1208         traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1209         traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
1210         traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
1211     KD_TRACE(10, (buff, gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1212                   pr->u.p.st, pr->u.p.tc, pr->u.p.count, pr->u.p.ordered_lower,
1213                   pr->u.p.ordered_upper, pr->u.p.parm1, pr->u.p.parm2,
1214                   pr->u.p.parm3, pr->u.p.parm4));
1215     __kmp_str_free(&buff);
1216   }
1217 #endif
1218 #if (KMP_STATIC_STEAL_ENABLED)
  // It cannot be guaranteed that after execution of a loop with some other
  // schedule kind all the parm3 variables will contain the same value. Even if
  // they did, there would still be a bad case, such as toggling between 0 and
  // 1 rather than incrementing over the program's lifetime. So a dedicated
  // variable is required; 'static_steal_counter' is used.
1224   if (schedule == kmp_sch_static_steal) {
1225     // Other threads will inspect this variable when searching for a victim.
    // This is a flag showing that other threads may steal from this thread
    // from now on.
1228     volatile T *p = &pr->u.p.static_steal_counter;
1229     *p = *p + 1;
1230   }
1231 #endif // ( KMP_STATIC_STEAL_ENABLED )
1232 
1233 #if OMPT_SUPPORT && OMPT_OPTIONAL
1234   if (ompt_enabled.ompt_callback_work) {
1235     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1236     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1237     kmp_info_t *thr = __kmp_threads[gtid];
1238     ompt_callbacks.ompt_callback(ompt_callback_work)(
1239         ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
1240         &(task_info->task_data), tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
1241   }
1242 #endif
1243 }
1244 
1245 /* For ordered loops, either __kmp_dispatch_finish() should be called after
1246  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1247  * every chunk of iterations.  If the ordered section(s) were not executed
1248  * for this iteration (or every iteration in this chunk), we need to set the
1249  * ordered iteration counters so that the next thread can proceed. */
1250 template <typename UT>
1251 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
1252   typedef typename traits_t<UT>::signed_t ST;
1253   kmp_info_t *th = __kmp_threads[gtid];
1254 
1255   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1256   if (!th->th.th_team->t.t_serialized) {
1257 
1258     dispatch_private_info_template<UT> *pr =
1259         reinterpret_cast<dispatch_private_info_template<UT> *>(
1260             th->th.th_dispatch->th_dispatch_pr_current);
1261     dispatch_shared_info_template<UT> volatile *sh =
1262         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1263             th->th.th_dispatch->th_dispatch_sh_current);
1264     KMP_DEBUG_ASSERT(pr);
1265     KMP_DEBUG_ASSERT(sh);
1266     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1267                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1268 
1269     if (pr->ordered_bumped) {
1270       KD_TRACE(
1271           1000,
1272           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1273            gtid));
1274       pr->ordered_bumped = 0;
1275     } else {
1276       UT lower = pr->u.p.ordered_lower;
1277 
1278 #ifdef KMP_DEBUG
1279       {
1280         char *buff;
1281         // create format specifiers before the debug output
1282         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1283                                 "ordered_iteration:%%%s lower:%%%s\n",
1284                                 traits_t<UT>::spec, traits_t<UT>::spec);
1285         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1286         __kmp_str_free(&buff);
1287       }
1288 #endif
1289 
1290       __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1291                            __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1292       KMP_MB(); /* is this necessary? */
1293 #ifdef KMP_DEBUG
1294       {
1295         char *buff;
1296         // create format specifiers before the debug output
1297         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1298                                 "ordered_iteration:%%%s lower:%%%s\n",
1299                                 traits_t<UT>::spec, traits_t<UT>::spec);
1300         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1301         __kmp_str_free(&buff);
1302       }
1303 #endif
1304 
1305       test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1306     } // if
1307   } // if
1308   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1309 }
1310 
1311 #ifdef KMP_GOMP_COMPAT
1312 
1313 template <typename UT>
1314 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1315   typedef typename traits_t<UT>::signed_t ST;
1316   kmp_info_t *th = __kmp_threads[gtid];
1317 
1318   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1319   if (!th->th.th_team->t.t_serialized) {
1320     //        int cid;
1321     dispatch_private_info_template<UT> *pr =
1322         reinterpret_cast<dispatch_private_info_template<UT> *>(
1323             th->th.th_dispatch->th_dispatch_pr_current);
1324     dispatch_shared_info_template<UT> volatile *sh =
1325         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1326             th->th.th_dispatch->th_dispatch_sh_current);
1327     KMP_DEBUG_ASSERT(pr);
1328     KMP_DEBUG_ASSERT(sh);
1329     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1330                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1331 
1332     //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1333     UT lower = pr->u.p.ordered_lower;
1334     UT upper = pr->u.p.ordered_upper;
1335     UT inc = upper - lower + 1;
1336 
1337     if (pr->ordered_bumped == inc) {
1338       KD_TRACE(
1339           1000,
1340           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1341            gtid));
1342       pr->ordered_bumped = 0;
1343     } else {
1344       inc -= pr->ordered_bumped;
1345 
1346 #ifdef KMP_DEBUG
1347       {
1348         char *buff;
1349         // create format specifiers before the debug output
1350         buff = __kmp_str_format(
1351             "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1352             "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1353             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1354         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1355         __kmp_str_free(&buff);
1356       }
1357 #endif
1358 
1359       __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1360                            __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1361 
1362       KMP_MB(); /* is this necessary? */
1363       KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1364                       "ordered_bumped to zero\n",
1365                       gtid));
1366       pr->ordered_bumped = 0;
1367 //!!!!! TODO check if the inc should be unsigned, or signed???
1368 #ifdef KMP_DEBUG
1369       {
1370         char *buff;
1371         // create format specifiers before the debug output
1372         buff = __kmp_str_format(
1373             "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1374             "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1375             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1376             traits_t<UT>::spec);
1377         KD_TRACE(1000,
1378                  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1379         __kmp_str_free(&buff);
1380       }
1381 #endif
1382 
1383       test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1384     }
1385     //        }
1386   }
1387   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1388 }
1389 
1390 #endif /* KMP_GOMP_COMPAT */
1391 
1392 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1393    work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1394    is not called. */
1395 #if OMPT_SUPPORT && OMPT_OPTIONAL
1396 #define OMPT_LOOP_END                                                          \
1397   if (status == 0) {                                                           \
1398     if (ompt_enabled.ompt_callback_work) {                                     \
1399       ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
1400       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
1401       ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
1402           ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
1403           &(task_info->task_data), 0, codeptr);                                \
1404     }                                                                          \
1405   }
1406 // TODO: implement count
1407 #else
1408 #define OMPT_LOOP_END // no-op
1409 #endif
1410 
1411 template <typename T>
1412 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1413                                T *p_lb, T *p_ub,
1414                                typename traits_t<T>::signed_t *p_st
1415 #if OMPT_SUPPORT && OMPT_OPTIONAL
1416                                ,
1417                                void *codeptr
1418 #endif
1419                                ) {
1420 
1421   typedef typename traits_t<T>::unsigned_t UT;
1422   typedef typename traits_t<T>::signed_t ST;
1423   typedef typename traits_t<T>::floating_t DBL;
1424 
  // This is potentially slightly misleading, schedule(runtime) will appear here
  // even if the actual runtime schedule is static. (Which points out a
  // disadvantage of schedule(runtime): even when static scheduling is used it
  // costs more than a compile-time choice to use static scheduling would.)
1429   KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling);
1430 
1431   int status;
1432   dispatch_private_info_template<T> *pr;
1433   kmp_info_t *th = __kmp_threads[gtid];
1434   kmp_team_t *team = th->th.th_team;
1435 
1436   KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1437 #ifdef KMP_DEBUG
1438   {
1439     char *buff;
1440     // create format specifiers before the debug output
1441     buff = __kmp_str_format("__kmp_dispatch_next: T#%%d called p_lb:%%%s "
1442                             "p_ub:%%%s p_st:%%%s p_last: %%p\n",
1443                             traits_t<T>::spec, traits_t<T>::spec,
1444                             traits_t<ST>::spec);
1445     KD_TRACE(1000, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last));
1446     __kmp_str_free(&buff);
1447   }
1448 #endif
1449 
1450   if (team->t.t_serialized) {
    /* NOTE: serialize this dispatch because we are not at the active level */
1452     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1453         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1454     KMP_DEBUG_ASSERT(pr);
1455 
1456     if ((status = (pr->u.p.tc != 0)) == 0) {
1457       *p_lb = 0;
1458       *p_ub = 0;
1459       //            if ( p_last != NULL )
1460       //                *p_last = 0;
1461       if (p_st != NULL)
1462         *p_st = 0;
1463       if (__kmp_env_consistency_check) {
1464         if (pr->pushed_ws != ct_none) {
1465           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1466         }
1467       }
1468     } else if (pr->nomerge) {
1469       kmp_int32 last;
1470       T start;
1471       UT limit, trip, init;
1472       ST incr;
1473       T chunk = pr->u.p.parm1;
1474 
1475       KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1476                      gtid));
1477 
1478       init = chunk * pr->u.p.count++;
1479       trip = pr->u.p.tc - 1;
1480 
1481       if ((status = (init <= trip)) == 0) {
1482         *p_lb = 0;
1483         *p_ub = 0;
1484         //                if ( p_last != NULL )
1485         //                    *p_last = 0;
1486         if (p_st != NULL)
1487           *p_st = 0;
1488         if (__kmp_env_consistency_check) {
1489           if (pr->pushed_ws != ct_none) {
1490             pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1491           }
1492         }
1493       } else {
1494         start = pr->u.p.lb;
1495         limit = chunk + init - 1;
1496         incr = pr->u.p.st;
1497 
1498         if ((last = (limit >= trip)) != 0) {
1499           limit = trip;
1500 #if KMP_OS_WINDOWS
1501           pr->u.p.last_upper = pr->u.p.ub;
1502 #endif /* KMP_OS_WINDOWS */
1503         }
1504         if (p_last != NULL)
1505           *p_last = last;
1506         if (p_st != NULL)
1507           *p_st = incr;
1508         if (incr == 1) {
1509           *p_lb = start + init;
1510           *p_ub = start + limit;
1511         } else {
1512           *p_lb = start + init * incr;
1513           *p_ub = start + limit * incr;
1514         }
1515 
1516         if (pr->ordered) {
1517           pr->u.p.ordered_lower = init;
1518           pr->u.p.ordered_upper = limit;
1519 #ifdef KMP_DEBUG
1520           {
1521             char *buff;
1522             // create format specifiers before the debug output
1523             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1524                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
1525                                     traits_t<UT>::spec, traits_t<UT>::spec);
1526             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1527                             pr->u.p.ordered_upper));
1528             __kmp_str_free(&buff);
1529           }
1530 #endif
1531         } // if
1532       } // if
1533     } else {
1534       pr->u.p.tc = 0;
1535       *p_lb = pr->u.p.lb;
1536       *p_ub = pr->u.p.ub;
1537 #if KMP_OS_WINDOWS
1538       pr->u.p.last_upper = *p_ub;
1539 #endif /* KMP_OS_WINDOWS */
1540       if (p_last != NULL)
1541         *p_last = TRUE;
1542       if (p_st != NULL)
1543         *p_st = pr->u.p.st;
1544     } // if
1545 #ifdef KMP_DEBUG
1546     {
1547       char *buff;
1548       // create format specifiers before the debug output
1549       buff = __kmp_str_format(
1550           "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
1551           "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
1552           traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1553       KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
1554       __kmp_str_free(&buff);
1555     }
1556 #endif
1557 #if INCLUDE_SSC_MARKS
1558     SSC_MARK_DISPATCH_NEXT();
1559 #endif
1560     OMPT_LOOP_END;
1561     return status;
1562   } else {
1563     kmp_int32 last = 0;
1564     dispatch_shared_info_template<UT> *sh;
1565     T start;
1566     ST incr;
1567     UT limit, trip, init;
1568 
1569     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1570                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1571 
1572     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1573         th->th.th_dispatch->th_dispatch_pr_current);
1574     KMP_DEBUG_ASSERT(pr);
1575     sh = reinterpret_cast<dispatch_shared_info_template<UT> *>(
1576         th->th.th_dispatch->th_dispatch_sh_current);
1577     KMP_DEBUG_ASSERT(sh);
1578 
1579     if (pr->u.p.tc == 0) {
1580       // zero trip count
1581       status = 0;
1582     } else {
1583       switch (pr->schedule) {
1584 #if (KMP_STATIC_STEAL_ENABLED)
1585       case kmp_sch_static_steal: {
1586         T chunk = pr->u.p.parm1;
1587         int nproc = th->th.th_team_nproc;
1588 
1589         KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n",
1590                        gtid));
1591 
1592         trip = pr->u.p.tc - 1;
1593 
1594         if (traits_t<T>::type_size > 4) {
1595           // use lock for 8-byte and CAS for 4-byte induction
1596           // variable. TODO (optional): check and use 16-byte CAS
1597           kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
1598           KMP_DEBUG_ASSERT(lck != NULL);
1599           if (pr->u.p.count < (UT)pr->u.p.ub) {
1600             __kmp_acquire_lock(lck, gtid);
1601             // try to get own chunk of iterations
1602             init = (pr->u.p.count)++;
1603             status = (init < (UT)pr->u.p.ub);
1604             __kmp_release_lock(lck, gtid);
1605           } else {
1606             status = 0; // no own chunks
1607           }
1608           if (!status) { // try to steal
1609             kmp_info_t **other_threads = team->t.t_threads;
1610             int while_limit = nproc; // nproc attempts to find a victim
1611             int while_index = 0;
1612             // TODO: algorithm of searching for a victim
1613             // should be cleaned up and measured
1614             while ((!status) && (while_limit != ++while_index)) {
1615               T remaining;
1616               T victimIdx = pr->u.p.parm4;
1617               T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1618               dispatch_private_info_template<T> *victim =
1619                   reinterpret_cast<dispatch_private_info_template<T> *>(
1620                       other_threads[victimIdx]
1621                           ->th.th_dispatch->th_dispatch_pr_current);
1622               while ((victim == NULL || victim == pr ||
1623                       (*(volatile T *)&victim->u.p.static_steal_counter !=
1624                        *(volatile T *)&pr->u.p.static_steal_counter)) &&
1625                      oldVictimIdx != victimIdx) {
1626                 victimIdx = (victimIdx + 1) % nproc;
1627                 victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1628                     other_threads[victimIdx]
1629                         ->th.th_dispatch->th_dispatch_pr_current);
1630               }
1631               if (!victim ||
1632                   (*(volatile T *)&victim->u.p.static_steal_counter !=
1633                    *(volatile T *)&pr->u.p.static_steal_counter)) {
                // no victim is ready yet to participate in stealing because
                // all victims are still in kmp_init_dispatch
                continue; // try once more (nproc attempts in total)
1637               }
1638               if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1639                 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1640                 continue; // not enough chunks to steal, goto next victim
1641               }
1642 
1643               lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1644               KMP_ASSERT(lck != NULL);
1645               __kmp_acquire_lock(lck, gtid);
1646               limit = victim->u.p.ub; // keep initial ub
1647               if (victim->u.p.count >= limit ||
1648                   (remaining = limit - victim->u.p.count) < 2) {
1649                 __kmp_release_lock(lck, gtid);
1650                 pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1651                 continue; // not enough chunks to steal
1652               }
              // stealing succeeded: reduce the victim's ub by 1/4 of the
              // undone chunks, or by 1
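              // Example (hypothetical values): if the victim's count is 10 and
              // its ub is 30, remaining = 20 chunks are still undone; the
              // thief lowers victim->u.p.ub to 25 and claims chunk indices
              // 25..29 for itself (init = 25 here, limit = 30 saved above).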
1655               if (remaining > 3) {
1656                 KMP_COUNT_VALUE(FOR_static_steal_stolen, remaining >> 2);
1657                 init = (victim->u.p.ub -=
1658                         (remaining >> 2)); // steal 1/4 of remaining
1659               } else {
1660                 KMP_COUNT_VALUE(FOR_static_steal_stolen, 1);
1661                 init =
1662                     (victim->u.p.ub -= 1); // steal 1 chunk of 2 or 3 remaining
1663               }
1664               __kmp_release_lock(lck, gtid);
1665 
1666               KMP_DEBUG_ASSERT(init + 1 <= limit);
1667               pr->u.p.parm4 = victimIdx; // remember victim to steal from
1668               status = 1;
1669               while_index = 0;
              // now update own count and ub with the stolen range; chunk
              // 'init' itself is consumed by this call
1671               __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1672               pr->u.p.count = init + 1;
1673               pr->u.p.ub = limit;
1674               __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1675             } // while (search for victim)
1676           } // if (try to find victim and steal)
1677         } else {
1678           // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1679           typedef union {
1680             struct {
1681               UT count;
1682               T ub;
1683             } p;
1684             kmp_int64 b;
1685           } union_i4;
1686           // All operations on 'count' or 'ub' must be combined atomically
1687           // together.
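          // Packing (count, ub) into one 64-bit word lets a single
          // KMP_COMPARE_AND_STORE_ACQ64 bump 'count' while simultaneously
          // verifying that 'ub' was not lowered by a thief in the meantime;
          // a failed CAS simply re-reads the pair and retries.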
1688           {
1689             union_i4 vold, vnew;
1690             vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1691             vnew = vold;
1692             vnew.p.count++;
1693             while (!KMP_COMPARE_AND_STORE_ACQ64(
1694                 (volatile kmp_int64 *)&pr->u.p.count,
1695                 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1696                 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1697               KMP_CPU_PAUSE();
1698               vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1699               vnew = vold;
1700               vnew.p.count++;
1701             }
1702             vnew = vold;
1703             init = vnew.p.count;
1704             status = (init < (UT)vnew.p.ub);
1705           }
1706 
1707           if (!status) {
1708             kmp_info_t **other_threads = team->t.t_threads;
1709             int while_limit = nproc; // nproc attempts to find a victim
1710             int while_index = 0;
1711 
1712             // TODO: algorithm of searching for a victim
1713             // should be cleaned up and measured
1714             while ((!status) && (while_limit != ++while_index)) {
1715               union_i4 vold, vnew;
1716               kmp_int32 remaining;
1717               T victimIdx = pr->u.p.parm4;
1718               T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1719               dispatch_private_info_template<T> *victim =
1720                   reinterpret_cast<dispatch_private_info_template<T> *>(
1721                       other_threads[victimIdx]
1722                           ->th.th_dispatch->th_dispatch_pr_current);
1723               while ((victim == NULL || victim == pr ||
1724                       (*(volatile T *)&victim->u.p.static_steal_counter !=
1725                        *(volatile T *)&pr->u.p.static_steal_counter)) &&
1726                      oldVictimIdx != victimIdx) {
1727                 victimIdx = (victimIdx + 1) % nproc;
1728                 victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1729                     other_threads[victimIdx]
1730                         ->th.th_dispatch->th_dispatch_pr_current);
1731               }
1732               if (!victim ||
1733                   (*(volatile T *)&victim->u.p.static_steal_counter !=
1734                    *(volatile T *)&pr->u.p.static_steal_counter)) {
                // no victim is ready yet to participate in stealing because
                // all victims are still in kmp_init_dispatch
                continue; // try once more (nproc attempts in total)
1738               }
1739               pr->u.p.parm4 = victimIdx; // new victim found
1740               while (1) { // CAS loop if victim has enough chunks to steal
1741                 vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1742                 vnew = vold;
1743 
1744                 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1745                 if (vnew.p.count >= (UT)vnew.p.ub ||
1746                     (remaining = vnew.p.ub - vnew.p.count) < 2) {
1747                   pr->u.p.parm4 =
1748                       (victimIdx + 1) % nproc; // shift start victim id
1749                   break; // not enough chunks to steal, goto next victim
1750                 }
1751                 if (remaining > 3) {
1752                   vnew.p.ub -= (remaining >> 2); // try to steal 1/4 remaining
1753                 } else {
1754                   vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1755                 }
1756                 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1757                 // TODO: Should this be acquire or release?
1758                 if (KMP_COMPARE_AND_STORE_ACQ64(
1759                         (volatile kmp_int64 *)&victim->u.p.count,
1760                         *VOLATILE_CAST(kmp_int64 *) & vold.b,
1761                         *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1762                   // stealing succeeded
1763                   KMP_COUNT_VALUE(FOR_static_steal_stolen,
1764                                   vold.p.ub - vnew.p.ub);
1765                   status = 1;
1766                   while_index = 0;
1767                   // now update own count and ub
1768                   init = vnew.p.ub;
1769                   vold.p.count = init + 1;
1770 #if KMP_ARCH_X86
1771                   KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count),
1772                                    vold.b);
1773 #else
1774                   *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1775 #endif
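                  // (KMP_XCHG_FIXED64 is used on 32-bit x86, where a plain
                  // 8-byte store is not guaranteed to be atomic.)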
1776                   break;
1777                 } // if (check CAS result)
1778                 KMP_CPU_PAUSE(); // CAS failed, repeat attempt
1779               } // while (try to steal from particular victim)
1780             } // while (search for victim)
1781           } // if (try to find victim and steal)
1782         } // if (4-byte induction variable)
1783         if (!status) {
1784           *p_lb = 0;
1785           *p_ub = 0;
1786           if (p_st != NULL)
1787             *p_st = 0;
1788         } else {
1789           start = pr->u.p.parm2;
1790           init *= chunk;
1791           limit = chunk + init - 1;
1792           incr = pr->u.p.st;
1793           KMP_COUNT_VALUE(FOR_static_steal_chunks, 1);
1794 
1795           KMP_DEBUG_ASSERT(init <= trip);
1796           if ((last = (limit >= trip)) != 0)
1797             limit = trip;
1798           if (p_st != NULL)
1799             *p_st = incr;
1800 
1801           if (incr == 1) {
1802             *p_lb = start + init;
1803             *p_ub = start + limit;
1804           } else {
1805             *p_lb = start + init * incr;
1806             *p_ub = start + limit * incr;
1807           }
1808 
1809           if (pr->ordered) {
1810             pr->u.p.ordered_lower = init;
1811             pr->u.p.ordered_upper = limit;
1812 #ifdef KMP_DEBUG
1813             {
1814               char *buff;
1815               // create format specifiers before the debug output
1816               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1817                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
1818                                       traits_t<UT>::spec, traits_t<UT>::spec);
1819               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1820                               pr->u.p.ordered_upper));
1821               __kmp_str_free(&buff);
1822             }
1823 #endif
1824           } // if
1825         } // if
1826         break;
1827       } // case
1828 #endif // ( KMP_STATIC_STEAL_ENABLED )
1829       case kmp_sch_static_balanced: {
1830         KD_TRACE(
1831             100,
1832             ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid));
1833         if ((status = !pr->u.p.count) !=
1834             0) { /* check if thread has any iteration to do */
1835           pr->u.p.count = 1;
1836           *p_lb = pr->u.p.lb;
1837           *p_ub = pr->u.p.ub;
1838           last = pr->u.p.parm1;
1839           if (p_st != NULL)
1840             *p_st = pr->u.p.st;
1841         } else { /* no iterations to do */
1842           pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1843         }
1844         if (pr->ordered) {
1845 #ifdef KMP_DEBUG
1846           {
1847             char *buff;
1848             // create format specifiers before the debug output
1849             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1850                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
1851                                     traits_t<UT>::spec, traits_t<UT>::spec);
1852             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1853                             pr->u.p.ordered_upper));
1854             __kmp_str_free(&buff);
1855           }
1856 #endif
1857         } // if
1858       } // case
1859       break;
1860       case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1861                                      merged here */
1862       case kmp_sch_static_chunked: {
1863         T parm1;
1864 
1865         KD_TRACE(100, ("__kmp_dispatch_next: T#%d "
1866                        "kmp_sch_static_[affinity|chunked] case\n",
1867                        gtid));
1868         parm1 = pr->u.p.parm1;
1869 
1870         trip = pr->u.p.tc - 1;
1871         init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
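        // Example (hypothetical values): with parm1 (chunk) = 2, nproc = 4 and
        // tid = 1, successive calls claim chunk indices 1, 5, 9, ..., i.e.
        // iterations [2,3], [10,11], [18,19], ... until init exceeds trip.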
1872 
1873         if ((status = (init <= trip)) != 0) {
1874           start = pr->u.p.lb;
1875           incr = pr->u.p.st;
1876           limit = parm1 + init - 1;
1877 
1878           if ((last = (limit >= trip)) != 0)
1879             limit = trip;
1880 
1881           if (p_st != NULL)
1882             *p_st = incr;
1883 
1884           pr->u.p.count += th->th.th_team_nproc;
1885 
1886           if (incr == 1) {
1887             *p_lb = start + init;
1888             *p_ub = start + limit;
1889           } else {
1890             *p_lb = start + init * incr;
1891             *p_ub = start + limit * incr;
1892           }
1893 
1894           if (pr->ordered) {
1895             pr->u.p.ordered_lower = init;
1896             pr->u.p.ordered_upper = limit;
1897 #ifdef KMP_DEBUG
1898             {
1899               char *buff;
1900               // create format specifiers before the debug output
1901               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1902                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
1903                                       traits_t<UT>::spec, traits_t<UT>::spec);
1904               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1905                               pr->u.p.ordered_upper));
1906               __kmp_str_free(&buff);
1907             }
1908 #endif
1909           } // if
1910         } // if
1911       } // case
1912       break;
1913 
1914       case kmp_sch_dynamic_chunked: {
1915         T chunk = pr->u.p.parm1;
1916 
1917         KD_TRACE(
1918             100,
1919             ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid));
1920 
1921         init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1922         trip = pr->u.p.tc - 1;
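        // Each call atomically claims the next global chunk index from
        // sh->u.s.iteration. Example (hypothetical values): with chunk = 4 and
        // tc = 10, claimed indices 0, 1, 2 map to iterations [0,3], [4,7] and
        // [8,9]; index 3 gives init = 12 > trip = 9, i.e. no work remains.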
1923 
1924         if ((status = (init <= trip)) == 0) {
1925           *p_lb = 0;
1926           *p_ub = 0;
1927           if (p_st != NULL)
1928             *p_st = 0;
1929         } else {
1930           start = pr->u.p.lb;
1931           limit = chunk + init - 1;
1932           incr = pr->u.p.st;
1933 
1934           if ((last = (limit >= trip)) != 0)
1935             limit = trip;
1936 
1937           if (p_st != NULL)
1938             *p_st = incr;
1939 
1940           if (incr == 1) {
1941             *p_lb = start + init;
1942             *p_ub = start + limit;
1943           } else {
1944             *p_lb = start + init * incr;
1945             *p_ub = start + limit * incr;
1946           }
1947 
1948           if (pr->ordered) {
1949             pr->u.p.ordered_lower = init;
1950             pr->u.p.ordered_upper = limit;
1951 #ifdef KMP_DEBUG
1952             {
1953               char *buff;
1954               // create format specifiers before the debug output
1955               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1956                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
1957                                       traits_t<UT>::spec, traits_t<UT>::spec);
1958               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1959                               pr->u.p.ordered_upper));
1960               __kmp_str_free(&buff);
1961             }
1962 #endif
1963           } // if
1964         } // if
1965       } // case
1966       break;
1967 
1968       case kmp_sch_guided_iterative_chunked: {
1969         T chunkspec = pr->u.p.parm1;
1970         KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked "
1971                        "iterative case\n",
1972                        gtid));
1973         trip = pr->u.p.tc;
1974         // Start atomic part of calculations
1975         while (1) {
1976           ST remaining; // signed, because can be < 0
1977           init = sh->u.s.iteration; // shared value
1978           remaining = trip - init;
1979           if (remaining <= 0) { // AC: need to compare with 0 first
1980             // nothing to do, don't try atomic op
1981             status = 0;
1982             break;
1983           }
1984           if ((T)remaining <
1985               pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
            // use dynamic-style schedule
            // atomically increment iterations, get old value
1988             init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1989                                      (ST)chunkspec);
1990             remaining = trip - init;
1991             if (remaining <= 0) {
1992               status = 0; // all iterations got by other threads
1993             } else { // got some iterations to work on
1994               status = 1;
1995               if ((T)remaining > chunkspec) {
1996                 limit = init + chunkspec - 1;
1997               } else {
1998                 last = 1; // the last chunk
1999                 limit = init + remaining - 1;
2000               } // if
2001             } // if
2002             break;
2003           } // if
2004           limit = init + (UT)(remaining *
2005                               *(double *)&pr->u.p.parm3); // divide by K*nproc
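          // parm3 caches a double scaling factor (roughly 1/(K*nproc), cf. the
          // parm2 comment above), so the claimed range shrinks in proportion
          // to the iterations still remaining; the CAS below publishes the new
          // sh->u.s.iteration only if no other thread raced ahead of us.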
2006           if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
2007                                    (ST)init, (ST)limit)) {
2008             // CAS was successful, chunk obtained
2009             status = 1;
2010             --limit;
2011             break;
2012           } // if
2013         } // while
2014         if (status != 0) {
2015           start = pr->u.p.lb;
2016           incr = pr->u.p.st;
2017           if (p_st != NULL)
2018             *p_st = incr;
2019           *p_lb = start + init * incr;
2020           *p_ub = start + limit * incr;
2021           if (pr->ordered) {
2022             pr->u.p.ordered_lower = init;
2023             pr->u.p.ordered_upper = limit;
2024 #ifdef KMP_DEBUG
2025             {
2026               char *buff;
2027               // create format specifiers before the debug output
2028               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2029                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
2030                                       traits_t<UT>::spec, traits_t<UT>::spec);
2031               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2032                               pr->u.p.ordered_upper));
2033               __kmp_str_free(&buff);
2034             }
2035 #endif
2036           } // if
2037         } else {
2038           *p_lb = 0;
2039           *p_ub = 0;
2040           if (p_st != NULL)
2041             *p_st = 0;
2042         } // if
2043       } // case
2044       break;
2045 
2046       case kmp_sch_guided_simd: {
        // same as the iterative variant, but the current chunk is adjusted to
        // be a multiple of the given chunk
2049         T chunk = pr->u.p.parm1;
2050         KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_simd case\n",
2051                        gtid));
2052         trip = pr->u.p.tc;
2053         // Start atomic part of calculations
2054         while (1) {
2055           ST remaining; // signed, because can be < 0
2056           init = sh->u.s.iteration; // shared value
2057           remaining = trip - init;
2058           if (remaining <= 0) { // AC: need to compare with 0 first
2059             status = 0; // nothing to do, don't try atomic op
2060             break;
2061           }
2062           KMP_DEBUG_ASSERT(init % chunk == 0);
2063           // compare with K*nproc*(chunk+1), K=2 by default
2064           if ((T)remaining < pr->u.p.parm2) {
            // use dynamic-style schedule
            // atomically increment iterations, get old value
2067             init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
2068                                      (ST)chunk);
2069             remaining = trip - init;
2070             if (remaining <= 0) {
2071               status = 0; // all iterations got by other threads
2072             } else {
2073               // got some iterations to work on
2074               status = 1;
2075               if ((T)remaining > chunk) {
2076                 limit = init + chunk - 1;
2077               } else {
2078                 last = 1; // the last chunk
2079                 limit = init + remaining - 1;
2080               } // if
2081             } // if
2082             break;
2083           } // if
2084           // divide by K*nproc
2085           UT span = remaining * (*(double *)&pr->u.p.parm3);
2086           UT rem = span % chunk;
2087           if (rem) // adjust so that span%chunk == 0
2088             span += chunk - rem;
2089           limit = init + span;
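          // Example (hypothetical values): with chunk = 8 and a computed span
          // of 13, rem = 5 and span is rounded up to 16 so that every claimed
          // range stays a multiple of the simd chunk.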
2090           if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
2091                                    (ST)init, (ST)limit)) {
2092             // CAS was successful, chunk obtained
2093             status = 1;
2094             --limit;
2095             break;
2096           } // if
2097         } // while
2098         if (status != 0) {
2099           start = pr->u.p.lb;
2100           incr = pr->u.p.st;
2101           if (p_st != NULL)
2102             *p_st = incr;
2103           *p_lb = start + init * incr;
2104           *p_ub = start + limit * incr;
2105           if (pr->ordered) {
2106             pr->u.p.ordered_lower = init;
2107             pr->u.p.ordered_upper = limit;
2108 #ifdef KMP_DEBUG
2109             {
2110               char *buff;
2111               // create format specifiers before the debug output
2112               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2113                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
2114                                       traits_t<UT>::spec, traits_t<UT>::spec);
2115               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2116                               pr->u.p.ordered_upper));
2117               __kmp_str_free(&buff);
2118             }
2119 #endif
2120           } // if
2121         } else {
2122           *p_lb = 0;
2123           *p_ub = 0;
2124           if (p_st != NULL)
2125             *p_st = 0;
2126         } // if
2127       } // case
2128       break;
2129 
2130       case kmp_sch_guided_analytical_chunked: {
2131         T chunkspec = pr->u.p.parm1;
2132         UT chunkIdx;
2133 #if KMP_OS_WINDOWS && KMP_ARCH_X86
2134         /* for storing original FPCW value for Windows* OS on
2135            IA-32 architecture 8-byte version */
2136         unsigned int oldFpcw;
2137         unsigned int fpcwSet = 0;
2138 #endif
2139         KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked "
2140                        "analytical case\n",
2141                        gtid));
2142 
2143         trip = pr->u.p.tc;
2144 
2145         KMP_DEBUG_ASSERT(th->th.th_team_nproc > 1);
2146         KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)th->th.th_team_nproc <
2147                          trip);
2148 
2149         while (1) { /* this while loop is a safeguard against unexpected zero
2150                        chunk sizes */
2151           chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
2152           if (chunkIdx >= (UT)pr->u.p.parm2) {
2153             --trip;
2154             /* use dynamic-style scheduling */
2155             init = chunkIdx * chunkspec + pr->u.p.count;
2156             /* need to verify init > 0 in case of overflow in the above
2157              * calculation */
2158             if ((status = (init > 0 && init <= trip)) != 0) {
2159               limit = init + chunkspec - 1;
2160 
2161               if ((last = (limit >= trip)) != 0)
2162                 limit = trip;
2163             }
2164             break;
2165           } else {
2166 /* use exponential-style scheduling */
/* The following check works around the lack of long double precision on
   Windows* OS, which could otherwise cause init != 0 for chunkIdx == 0. */
2171 #if KMP_OS_WINDOWS && KMP_ARCH_X86
2172             /* If we haven't already done so, save original FPCW and set
2173                precision to 64-bit, as Windows* OS on IA-32 architecture
2174                defaults to 53-bit */
2175             if (!fpcwSet) {
2176               oldFpcw = _control87(0, 0);
2177               _control87(_PC_64, _MCW_PC);
2178               fpcwSet = 0x30000;
2179             }
2180 #endif
2181             if (chunkIdx) {
2182               init = __kmp_dispatch_guided_remaining<T>(
2183                   trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
2184               KMP_DEBUG_ASSERT(init);
2185               init = trip - init;
2186             } else
2187               init = 0;
2188             limit = trip - __kmp_dispatch_guided_remaining<T>(
2189                                trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
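            // init and limit bracket chunk number chunkIdx: both come from the
            // analytically computed count of iterations remaining before
            // chunks chunkIdx and chunkIdx+1, so the chunk spans iterations
            // init .. limit-1 (limit is decremented below to make it
            // inclusive).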
2190             KMP_ASSERT(init <= limit);
2191             if (init < limit) {
2192               KMP_DEBUG_ASSERT(limit <= trip);
2193               --limit;
2194               status = 1;
2195               break;
2196             } // if
2197           } // if
2198         } // while (1)
2199 #if KMP_OS_WINDOWS && KMP_ARCH_X86
2200         /* restore FPCW if necessary
2201            AC: check fpcwSet flag first because oldFpcw can be uninitialized
2202            here */
2203         if (fpcwSet && (oldFpcw & fpcwSet))
2204           _control87(oldFpcw, _MCW_PC);
2205 #endif
2206         if (status != 0) {
2207           start = pr->u.p.lb;
2208           incr = pr->u.p.st;
2209           if (p_st != NULL)
2210             *p_st = incr;
2211           *p_lb = start + init * incr;
2212           *p_ub = start + limit * incr;
2213           if (pr->ordered) {
2214             pr->u.p.ordered_lower = init;
2215             pr->u.p.ordered_upper = limit;
2216 #ifdef KMP_DEBUG
2217             {
2218               char *buff;
2219               // create format specifiers before the debug output
2220               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2221                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
2222                                       traits_t<UT>::spec, traits_t<UT>::spec);
2223               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2224                               pr->u.p.ordered_upper));
2225               __kmp_str_free(&buff);
2226             }
2227 #endif
2228           }
2229         } else {
2230           *p_lb = 0;
2231           *p_ub = 0;
2232           if (p_st != NULL)
2233             *p_st = 0;
2234         }
2235       } // case
2236       break;
2237 
2238       case kmp_sch_trapezoidal: {
2239         UT index;
2240         T parm2 = pr->u.p.parm2;
2241         T parm3 = pr->u.p.parm3;
2242         T parm4 = pr->u.p.parm4;
2243         KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2244                        gtid));
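        // From the formulas below: the i-th chunk has size parm2 - i * parm4,
        // i.e. parm2 is the first (largest) chunk size, parm4 is the amount by
        // which each successive chunk shrinks, and parm3 bounds the number of
        // chunks; init is the running sum of all earlier chunk sizes.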
2245 
2246         index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
2247 
2248         init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
2249         trip = pr->u.p.tc - 1;
2250 
2251         if ((status = ((T)index < parm3 && init <= trip)) == 0) {
2252           *p_lb = 0;
2253           *p_ub = 0;
2254           if (p_st != NULL)
2255             *p_st = 0;
2256         } else {
2257           start = pr->u.p.lb;
2258           limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
2259           incr = pr->u.p.st;
2260 
2261           if ((last = (limit >= trip)) != 0)
2262             limit = trip;
2263 
2264           if (p_st != NULL)
2265             *p_st = incr;
2266 
2267           if (incr == 1) {
2268             *p_lb = start + init;
2269             *p_ub = start + limit;
2270           } else {
2271             *p_lb = start + init * incr;
2272             *p_ub = start + limit * incr;
2273           }
2274 
2275           if (pr->ordered) {
2276             pr->u.p.ordered_lower = init;
2277             pr->u.p.ordered_upper = limit;
2278 #ifdef KMP_DEBUG
2279             {
2280               char *buff;
2281               // create format specifiers before the debug output
2282               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2283                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
2284                                       traits_t<UT>::spec, traits_t<UT>::spec);
2285               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2286                               pr->u.p.ordered_upper));
2287               __kmp_str_free(&buff);
2288             }
2289 #endif
2290           } // if
2291         } // if
2292       } // case
2293       break;
2294       default: {
2295         status = 0; // to avoid complaints on uninitialized variable use
2296         __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
2297                     KMP_HNT(GetNewerLibrary), // Hint
2298                     __kmp_msg_null // Variadic argument list terminator
2299                     );
2300       } break;
2301       } // switch
2302     } // if tc == 0;
2303 
2304     if (status == 0) {
2305       UT num_done;
2306 
2307       num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2308 #ifdef KMP_DEBUG
2309       {
2310         char *buff;
2311         // create format specifiers before the debug output
2312         buff = __kmp_str_format(
2313             "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2314             traits_t<UT>::spec);
2315         KD_TRACE(100, (buff, gtid, sh->u.s.num_done));
2316         __kmp_str_free(&buff);
2317       }
2318 #endif
2319 
2320       if ((ST)num_done == th->th.th_team_nproc - 1) {
2321 #if (KMP_STATIC_STEAL_ENABLED)
2322         if (pr->schedule == kmp_sch_static_steal &&
2323             traits_t<T>::type_size > 4) {
2324           int i;
2325           kmp_info_t **other_threads = team->t.t_threads;
2326           // loop complete, safe to destroy locks used for stealing
2327           for (i = 0; i < th->th.th_team_nproc; ++i) {
2328             kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2329             KMP_ASSERT(lck != NULL);
2330             __kmp_destroy_lock(lck);
2331             __kmp_free(lck);
2332             other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2333           }
2334         }
2335 #endif
2336         /* NOTE: release this buffer to be reused */
2337 
2338         KMP_MB(); /* Flush all pending memory write invalidates.  */
2339 
2340         sh->u.s.num_done = 0;
2341         sh->u.s.iteration = 0;
2342 
2343         /* TODO replace with general release procedure? */
2344         if (pr->ordered) {
2345           sh->u.s.ordered_iteration = 0;
2346         }
2347 
2348         KMP_MB(); /* Flush all pending memory write invalidates.  */
2349 
2350         sh->buffer_index += __kmp_dispatch_num_buffers;
2351         KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2352                        gtid, sh->buffer_index));
2353 
2354         KMP_MB(); /* Flush all pending memory write invalidates.  */
2355 
2356       } // if
2357       if (__kmp_env_consistency_check) {
2358         if (pr->pushed_ws != ct_none) {
2359           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2360         }
2361       }
2362 
2363       th->th.th_dispatch->th_deo_fcn = NULL;
2364       th->th.th_dispatch->th_dxo_fcn = NULL;
2365       th->th.th_dispatch->th_dispatch_sh_current = NULL;
2366       th->th.th_dispatch->th_dispatch_pr_current = NULL;
2367     } // if (status == 0)
2368 #if KMP_OS_WINDOWS
2369     else if (last) {
2370       pr->u.p.last_upper = pr->u.p.ub;
2371     }
2372 #endif /* KMP_OS_WINDOWS */
2373     if (p_last != NULL && status != 0)
2374       *p_last = last;
2375   } // if
2376 
2377 #ifdef KMP_DEBUG
2378   {
2379     char *buff;
2380     // create format specifiers before the debug output
2381     buff = __kmp_str_format(
2382         "__kmp_dispatch_next: T#%%d normal case: "
2383         "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p  returning:%%d\n",
2384         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2385     KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status));
2386     __kmp_str_free(&buff);
2387   }
2388 #endif
2389 #if INCLUDE_SSC_MARKS
2390   SSC_MARK_DISPATCH_NEXT();
2391 #endif
2392   OMPT_LOOP_END;
2393   return status;
2394 }
2395 
2396 template <typename T>
2397 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2398                                   kmp_int32 *plastiter, T *plower, T *pupper,
2399                                   typename traits_t<T>::signed_t incr) {
2400   typedef typename traits_t<T>::unsigned_t UT;
2401   typedef typename traits_t<T>::signed_t ST;
2402   kmp_uint32 team_id;
2403   kmp_uint32 nteams;
2404   UT trip_count;
2405   kmp_team_t *team;
2406   kmp_info_t *th;
2407 
2408   KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2409   KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2410 #ifdef KMP_DEBUG
2411   {
2412     char *buff;
2413     // create format specifiers before the debug output
2414     buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2415                             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2416                             traits_t<T>::spec, traits_t<T>::spec,
2417                             traits_t<ST>::spec, traits_t<T>::spec);
2418     KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2419     __kmp_str_free(&buff);
2420   }
2421 #endif
2422 
2423   if (__kmp_env_consistency_check) {
2424     if (incr == 0) {
2425       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2426                             loc);
2427     }
2428     if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2429       // The loop is illegal.
      // Some zero-trip loops are kept by the compiler, e.g.:
2431       //   for(i=10;i<0;++i) // lower >= upper - run-time check
2432       //   for(i=0;i>10;--i) // lower <= upper - run-time check
2433       //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2434       //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2435       // Compiler does not check the following illegal loops:
2436       //   for(i=0;i<10;i+=incr) // where incr<0
2437       //   for(i=10;i>0;i-=incr) // where incr<0
2438       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2439     }
2440   }
2441   th = __kmp_threads[gtid];
2442   team = th->th.th_team;
2443 #if OMP_40_ENABLED
2444   KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2445   nteams = th->th.th_teams_size.nteams;
2446 #endif
2447   team_id = team->t.t_master_tid;
2448   KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2449 
2450   // compute global trip count
2451   if (incr == 1) {
2452     trip_count = *pupper - *plower + 1;
2453   } else if (incr == -1) {
2454     trip_count = *plower - *pupper + 1;
2455   } else if (incr > 0) {
2456     // upper-lower can exceed the limit of signed type
2457     trip_count = (UT)(*pupper - *plower) / incr + 1;
2458   } else {
2459     trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2460   }
2461 
2462   if (trip_count <= nteams) {
2463     KMP_DEBUG_ASSERT(
2464         __kmp_static == kmp_sch_static_greedy ||
2465         __kmp_static ==
2466             kmp_sch_static_balanced); // Unknown static scheduling type.
2467     // only some teams get single iteration, others get nothing
2468     if (team_id < trip_count) {
2469       *pupper = *plower = *plower + team_id * incr;
2470     } else {
2471       *plower = *pupper + incr; // zero-trip loop
2472     }
2473     if (plastiter != NULL)
2474       *plastiter = (team_id == trip_count - 1);
2475   } else {
2476     if (__kmp_static == kmp_sch_static_balanced) {
2477       UT chunk = trip_count / nteams;
2478       UT extras = trip_count % nteams;
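      // Example (hypothetical values): trip_count = 10, nteams = 3 gives
      // chunk = 3, extras = 1; team 0 gets 4 iterations while teams 1 and 2
      // get 3 each, the first 'extras' teams absorbing the remainder.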
2479       *plower +=
2480           incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2481       *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2482       if (plastiter != NULL)
2483         *plastiter = (team_id == nteams - 1);
2484     } else {
2485       T chunk_inc_count =
2486           (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2487       T upper = *pupper;
2488       KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2489       // Unknown static scheduling type.
2490       *plower += team_id * chunk_inc_count;
2491       *pupper = *plower + chunk_inc_count - incr;
2492       // Check/correct bounds if needed
2493       if (incr > 0) {
2494         if (*pupper < *plower)
2495           *pupper = traits_t<T>::max_value;
2496         if (plastiter != NULL)
2497           *plastiter = *plower <= upper && *pupper > upper - incr;
2498         if (*pupper > upper)
2499           *pupper = upper; // tracker C73258
2500       } else {
2501         if (*pupper > *plower)
2502           *pupper = traits_t<T>::min_value;
2503         if (plastiter != NULL)
2504           *plastiter = *plower >= upper && *pupper < upper - incr;
2505         if (*pupper < upper)
2506           *pupper = upper; // tracker C73258
2507       }
2508     }
2509   }
2510 }
2511 
2512 //-----------------------------------------------------------------------------
2513 // Dispatch routines
2514 //    Transfer call to template< type T >
2515 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2516 //                         T lb, T ub, ST st, ST chunk )
2517 extern "C" {
2518 
2519 /*!
2520 @ingroup WORK_SHARING
2521 @{
2522 @param loc Source location
2523 @param gtid Global thread id
2524 @param schedule Schedule type
2525 @param lb  Lower bound
2526 @param ub  Upper bound
2527 @param st  Step (or increment if you prefer)
2528 @param chunk The chunk size to block with
2529 
2530 This function prepares the runtime to start a dynamically scheduled for loop,
2531 saving the loop arguments.
2532 These functions are all identical apart from the types of the arguments.
2533 */
2534 
2535 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2536                             enum sched_type schedule, kmp_int32 lb,
2537                             kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2538   KMP_DEBUG_ASSERT(__kmp_init_serial);
2539 #if OMPT_SUPPORT && OMPT_OPTIONAL
2540   OMPT_STORE_RETURN_ADDRESS(gtid);
2541 #endif
2542   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2543 }
2544 /*!
2545 See @ref __kmpc_dispatch_init_4
2546 */
2547 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2548                              enum sched_type schedule, kmp_uint32 lb,
2549                              kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2550   KMP_DEBUG_ASSERT(__kmp_init_serial);
2551 #if OMPT_SUPPORT && OMPT_OPTIONAL
2552   OMPT_STORE_RETURN_ADDRESS(gtid);
2553 #endif
2554   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2555 }
2556 
2557 /*!
2558 See @ref __kmpc_dispatch_init_4
2559 */
2560 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2561                             enum sched_type schedule, kmp_int64 lb,
2562                             kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2563   KMP_DEBUG_ASSERT(__kmp_init_serial);
2564 #if OMPT_SUPPORT && OMPT_OPTIONAL
2565   OMPT_STORE_RETURN_ADDRESS(gtid);
2566 #endif
2567   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2568 }
2569 
2570 /*!
2571 See @ref __kmpc_dispatch_init_4
2572 */
2573 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2574                              enum sched_type schedule, kmp_uint64 lb,
2575                              kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2576   KMP_DEBUG_ASSERT(__kmp_init_serial);
2577 #if OMPT_SUPPORT && OMPT_OPTIONAL
2578   OMPT_STORE_RETURN_ADDRESS(gtid);
2579 #endif
2580   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2581 }
2582 
2583 /*!
2584 See @ref __kmpc_dispatch_init_4
2585 
These differ from the __kmpc_dispatch_init set of functions in that they are
called for the composite 'distribute parallel for' construct. Thus, before
dispatching the regular iterations, the per-team iteration space must be
computed.
2589 
2590 These functions are all identical apart from the types of the arguments.
2591 */
2592 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2593                                  enum sched_type schedule, kmp_int32 *p_last,
2594                                  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2595                                  kmp_int32 chunk) {
2596   KMP_DEBUG_ASSERT(__kmp_init_serial);
2597 #if OMPT_SUPPORT && OMPT_OPTIONAL
2598   OMPT_STORE_RETURN_ADDRESS(gtid);
2599 #endif
2600   __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2601   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2602 }
2603 
2604 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2605                                   enum sched_type schedule, kmp_int32 *p_last,
2606                                   kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2607                                   kmp_int32 chunk) {
2608   KMP_DEBUG_ASSERT(__kmp_init_serial);
2609 #if OMPT_SUPPORT && OMPT_OPTIONAL
2610   OMPT_STORE_RETURN_ADDRESS(gtid);
2611 #endif
2612   __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2613   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2614 }
2615 
2616 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2617                                  enum sched_type schedule, kmp_int32 *p_last,
2618                                  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2619                                  kmp_int64 chunk) {
2620   KMP_DEBUG_ASSERT(__kmp_init_serial);
2621 #if OMPT_SUPPORT && OMPT_OPTIONAL
2622   OMPT_STORE_RETURN_ADDRESS(gtid);
2623 #endif
2624   __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2625   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2626 }
2627 
2628 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2629                                   enum sched_type schedule, kmp_int32 *p_last,
2630                                   kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2631                                   kmp_int64 chunk) {
2632   KMP_DEBUG_ASSERT(__kmp_init_serial);
2633 #if OMPT_SUPPORT && OMPT_OPTIONAL
2634   OMPT_STORE_RETURN_ADDRESS(gtid);
2635 #endif
2636   __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2637   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2638 }
2639 
2640 /*!
2641 @param loc Source code location
2642 @param gtid Global thread id
2643 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2644 otherwise
2645 @param p_lb   Pointer to the lower bound for the next chunk of work
2646 @param p_ub   Pointer to the upper bound for the next chunk of work
2647 @param p_st   Pointer to the stride for the next chunk of work
2648 @return one if there is work to be done, zero otherwise
2649 
2650 Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then lb, ub and stride need not be modified.
2652 */
2653 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2654                            kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2655 #if OMPT_SUPPORT && OMPT_OPTIONAL
2656   OMPT_STORE_RETURN_ADDRESS(gtid);
2657 #endif
2658   return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2659 #if OMPT_SUPPORT && OMPT_OPTIONAL
2660                                         ,
2661                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2662 #endif
2663                                             );
2664 }
2665 
2666 /*!
2667 See @ref __kmpc_dispatch_next_4
2668 */
2669 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2670                             kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2671                             kmp_int32 *p_st) {
2672 #if OMPT_SUPPORT && OMPT_OPTIONAL
2673   OMPT_STORE_RETURN_ADDRESS(gtid);
2674 #endif
2675   return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2676 #if OMPT_SUPPORT && OMPT_OPTIONAL
2677                                          ,
2678                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2679 #endif
2680                                              );
2681 }
2682 
2683 /*!
2684 See @ref __kmpc_dispatch_next_4
2685 */
2686 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2687                            kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2688 #if OMPT_SUPPORT && OMPT_OPTIONAL
2689   OMPT_STORE_RETURN_ADDRESS(gtid);
2690 #endif
2691   return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2692 #if OMPT_SUPPORT && OMPT_OPTIONAL
2693                                         ,
2694                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2695 #endif
2696                                             );
2697 }
2698 
2699 /*!
2700 See @ref __kmpc_dispatch_next_4
2701 */
2702 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2703                             kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2704                             kmp_int64 *p_st) {
2705 #if OMPT_SUPPORT && OMPT_OPTIONAL
2706   OMPT_STORE_RETURN_ADDRESS(gtid);
2707 #endif
2708   return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2709 #if OMPT_SUPPORT && OMPT_OPTIONAL
2710                                          ,
2711                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2712 #endif
2713                                              );
2714 }
2715 
2716 /*!
2717 @param loc Source code location
2718 @param gtid Global thread id
2719 
2720 Mark the end of a dynamic loop.
2721 */
2722 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2723   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2724 }
2725 
2726 /*!
2727 See @ref __kmpc_dispatch_fini_4
2728 */
2729 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2730   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2731 }
2732 
2733 /*!
2734 See @ref __kmpc_dispatch_fini_4
2735 */
2736 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2737   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2738 }
2739 
2740 /*!
2741 See @ref __kmpc_dispatch_fini_4
2742 */
2743 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2744   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2745 }
2746 /*! @} */
2747 
2748 //-----------------------------------------------------------------------------
2749 // Non-template routines from kmp_dispatch.cpp used in other sources
2750 
2751 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2752   return value == checker;
2753 }
2754 
2755 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2756   return value != checker;
2757 }
2758 
2759 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2760   return value < checker;
2761 }
2762 
2763 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2764   return value >= checker;
2765 }
2766 
2767 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2768   return value <= checker;
2769 }
2770 
2771 kmp_uint32
2772 __kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2773                    kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2774                    void *obj // Higher-level synchronization object, or NULL.
2775                    ) {
2776   // note: we may not belong to a team at this point
2777   volatile kmp_uint32 *spin = spinner;
2778   kmp_uint32 check = checker;
2779   kmp_uint32 spins;
2780   kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2781   kmp_uint32 r;
2782 
2783   KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2784   KMP_INIT_YIELD(spins);
2785   // main wait spin loop
2786   while (!f(r = TCR_4(*spin), check)) {
2787     KMP_FSYNC_SPIN_PREPARE(obj);
2788     /* GEH - remove this since it was accidentally introduced when kmp_wait was
2789        split. It causes problems with infinite recursion because of exit lock */
2790     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2791         __kmp_abort_thread(); */
2792 
2793     /* if we have waited a bit, or are oversubscribed, yield */
2794     /* pause is in the following code */
2795     KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2796     KMP_YIELD_SPIN(spins);
2797   }
2798   KMP_FSYNC_SPIN_ACQUIRED(obj);
2799   return r;
2800 }
2801 
2802 void __kmp_wait_yield_4_ptr(
2803     void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32),
2804     void *obj // Higher-level synchronization object, or NULL.
2805     ) {
2806   // note: we may not belong to a team at this point
2807   void *spin = spinner;
2808   kmp_uint32 check = checker;
2809   kmp_uint32 spins;
2810   kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2811 
2812   KMP_FSYNC_SPIN_INIT(obj, spin);
2813   KMP_INIT_YIELD(spins);
2814   // main wait spin loop
2815   while (!f(spin, check)) {
2816     KMP_FSYNC_SPIN_PREPARE(obj);
2817     /* if we have waited a bit, or are oversubscribed, yield */
2818     /* pause is in the following code */
2819     KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2820     KMP_YIELD_SPIN(spins);
2821   }
2822   KMP_FSYNC_SPIN_ACQUIRED(obj);
2823 }
2824 
2825 } // extern "C"
2826 
2827 #ifdef KMP_GOMP_COMPAT
2828 
2829 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2830                                enum sched_type schedule, kmp_int32 lb,
2831                                kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2832                                int push_ws) {
2833   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2834                                  push_ws);
2835 }
2836 
2837 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2838                                 enum sched_type schedule, kmp_uint32 lb,
2839                                 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2840                                 int push_ws) {
2841   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2842                                   push_ws);
2843 }
2844 
2845 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2846                                enum sched_type schedule, kmp_int64 lb,
2847                                kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2848                                int push_ws) {
2849   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2850                                  push_ws);
2851 }
2852 
2853 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2854                                 enum sched_type schedule, kmp_uint64 lb,
2855                                 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2856                                 int push_ws) {
2857   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2858                                   push_ws);
2859 }
2860 
2861 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2862   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2863 }
2864 
2865 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2866   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2867 }
2868 
2869 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2870   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2871 }
2872 
2873 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2874   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2875 }
2876 
2877 #endif /* KMP_GOMP_COMPAT */
2878 
2879 /* ------------------------------------------------------------------------ */
2880