/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 */


//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


/* Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
 *       it may change values between parallel regions.  __kmp_max_nth
 *       is the largest value __kmp_nth may take, 1 is the smallest.
 */

// Need to raise Win version from XP to Vista here for support of
// InterlockedExchange64
#if defined(_WIN32_WINNT) && defined(_M_IX86)
#undef _WIN32_WINNT
#define _WIN32_WINNT 0x0502
#endif

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
#include <float.h>
#endif

#if OMPT_SUPPORT
#include "ompt-internal.h"
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */

#if KMP_STATIC_STEAL_ENABLED

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  UT count; // unsigned
  T ub;
  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
  T lb;
  ST st; // signed
  UT tc; // unsigned
  T static_steal_counter; // for static_steal only; maybe better to put after ub

  /* parm[1-4] are used in different ways by different scheduling algorithms */

  // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
  //    a) parm3 is properly aligned and
  //    b) all parm1-4 are in the same cache line.
  // Because parm1-4 are used together, performance seems better
  // when they are in the same cache line (not measured, though).

  struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
    T parm1;
    T parm2;
    T parm3;
    T parm4;
  };

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};

#else /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  T lb;
  T ub;
  ST st; // signed
  UT tc; // unsigned

  T parm1;
  T parm2;
  T parm3;
  T parm4;

  UT count; // unsigned

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};

#endif /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info structure and dispatch_private_info_t type
template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
  // duplicate alignment here, otherwise size of structure is not correct in our
  // compiler
  union KMP_ALIGN_CACHE private_info_tmpl {
    dispatch_private_infoXX_template<T> p;
    dispatch_private_info64_t p64;
  } u;
  enum sched_type schedule; /* scheduling algorithm */
  kmp_uint32 ordered; /* ordered clause specified */
  kmp_uint32 ordered_bumped;
  // To retain the structure size after making ordered_iteration scalar
  kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
  dispatch_private_info *next; /* stack of buffers for nest of serial regions */
  kmp_uint32 nomerge; /* don't merge iters if serialized */
  kmp_uint32 type_size;
  enum cons_type pushed_ws;
};

// replaces dispatch_shared_info{32,64} structures and
// dispatch_shared_info{32,64}_t types
template <typename UT> struct dispatch_shared_infoXX_template {
  /* chunk index under dynamic, number of idle threads under static-steal;
     iteration index otherwise */
  volatile UT iteration;
  volatile UT num_done;
  volatile UT ordered_iteration;
  // to retain the structure size after making ordered_iteration scalar
  UT ordered_dummy[KMP_MAX_ORDERED - 3];
};

// replaces dispatch_shared_info structure and dispatch_shared_info_t type
template <typename UT> struct dispatch_shared_info_template {
  // we need union here to keep the structure size
  union shared_info_tmpl {
    dispatch_shared_infoXX_template<UT> s;
    dispatch_shared_info64_t s64;
  } u;
  volatile kmp_uint32 buffer_index;
#if OMP_45_ENABLED
  volatile kmp_int32 doacross_buf_idx; // teamwise index
  kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
  kmp_int32 doacross_num_done; // count finished threads
#endif
#if KMP_USE_HWLOC
  // When linking with libhwloc, the ORDERED EPCC test slows down on big
  // machines (> 48 cores). Performance analysis showed that a cache thrash
  // was occurring and this padding helps alleviate the problem.
  char padding[64];
#endif
};

/* ------------------------------------------------------------------------ */

#undef USE_TEST_LOCKS

// test_then_add template (general template should NOT be used)
template <typename T> static __forceinline T test_then_add(volatile T *p, T d);

template <>
__forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
                                                 kmp_int32 d) {
  kmp_int32 r;
  r = KMP_TEST_THEN_ADD32(p, d);
  return r;
}

template <>
__forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
                                                 kmp_int64 d) {
  kmp_int64 r;
  r = KMP_TEST_THEN_ADD64(p, d);
  return r;
}

// test_then_inc_acq template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC_ACQ32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC_ACQ64(p);
  return r;
}

// test_then_inc template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC64(p);
  return r;
}

// compare_and_swap template (general template should NOT be used)
template <typename T>
static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
                                                    kmp_int32 c, kmp_int32 s) {
  return KMP_COMPARE_AND_STORE_REL32(p, c, s);
}

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
                                                    kmp_int64 c, kmp_int64 s) {
  return KMP_COMPARE_AND_STORE_REL64(p, c, s);
}

/* Spin wait loop that first does pause, then yield.
    Waits until function returns non-zero when called with *spinner and check.
    Does NOT put threads to sleep.
#if USE_ITT_BUILD
    Arguments:
        obj -- the higher-level synchronization object to report to ittnotify.
        It is used to report locks consistently. For example, if a lock is
        acquired immediately, its address is reported to ittnotify via
        KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
        immediately and the lock routine calls KMP_WAIT_YIELD(), the latter
        should report the same address, not the address of the low-level
        spinner.
#endif // USE_ITT_BUILD
*/
template <typename UT>
// ToDo: make inline function (move to header file for icl)
static UT // unsigned 4- or 8-byte type
    __kmp_wait_yield(
        volatile UT *spinner, UT checker,
        kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(
            void *obj) // Higher-level synchronization object, or NULL.
        ) {
  // note: we may not belong to a team at this point
  register volatile UT *spin = spinner;
  register UT check = checker;
  register kmp_uint32 spins;
  register kmp_uint32 (*f)(UT, UT) = pred;
  register UT r;

  KMP_FSYNC_SPIN_INIT(obj, (void *)spin);
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(r = *spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* GEH - remove this since it was accidentally introduced when kmp_wait was
       split. It causes problems with infinite recursion because of exit lock */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
        __kmp_abort_thread(); */

    // If we are oversubscribed, or have waited a bit (and
    // KMP_LIBRARY=throughput), then yield. Pause is in the following code.
    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
    KMP_YIELD_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}

template <typename UT> static kmp_uint32 __kmp_eq(UT value, UT checker) {
  return value == checker;
}

template <typename UT> static kmp_uint32 __kmp_neq(UT value, UT checker) {
  return value != checker;
}

template <typename UT> static kmp_uint32 __kmp_lt(UT value, UT checker) {
  return value < checker;
}

template <typename UT> static kmp_uint32 __kmp_ge(UT value, UT checker) {
  return value >= checker;
}

template <typename UT> static kmp_uint32 __kmp_le(UT value, UT checker) {
  return value <= checker;
}

/* ------------------------------------------------------------------------ */

static void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref,
                                     ident_t *loc_ref) {
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(gtid_ref);

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_root->r.r_active &&
        (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }
}

template <typename UT>
static void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  typedef typename traits_t<UT>::signed_t ST;
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  //    int  cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    UT lower;

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }
    lower = pr->u.p.ordered_lower;

#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB();
#ifdef KMP_DEBUG
    {
      const char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif

    __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
                         __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
    KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
    {
      const char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
  }
  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
}

static void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref,
                                     ident_t *loc_ref) {
  kmp_info_t *th;

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
      __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
    }
  }
}

template <typename UT>
static void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  typedef typename traits_t<UT>::signed_t ST;
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  //    int  cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
      __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }

    KMP_FSYNC_RELEASING(&sh->u.s.ordered_iteration);
#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped != 0) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        /* How to test it? - OM */
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB(); /* Flush all pending memory write invalidates.  */

    pr->ordered_bumped += 1;

    KD_TRACE(1000,
             ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
              gtid, pr->ordered_bumped));

    KMP_MB(); /* Flush all pending memory write invalidates.  */

    /* TODO use general release procedure? */
    test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);

    KMP_MB(); /* Flush all pending memory write invalidates.  */
  }
  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
}

// Computes and returns x to the power of y, where y must be a non-negative
// integer
template <typename UT>
static __forceinline long double __kmp_pow(long double x, UT y) {
  long double s = 1.0L;

  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
  // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
  while (y) {
    if (y & 1)
      s *= x;
    x *= x;
    y >>= 1;
  }
  return s;
}
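// Worked example (illustrative only): binary exponentiation for x = 0.75,
// y = 5 (binary 101):
//   bit 0 set:   s = 0.75;                         then x = x*x = 0.5625
//   bit 1 clear:                                   then x = x*x = 0.31640625
//   bit 2 set:   s = 0.75 * 0.31640625 = 0.2373046875 = 0.75^5
// i.e. O(log y) multiplications instead of y - 1.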

/* Computes and returns the number of unassigned iterations after idx chunks
   have been assigned (the total number of unassigned iterations in chunks with
   index greater than or equal to idx). __forceinline seems to be broken here:
   if this function is __forceinline'd, the behavior is wrong (one of the unit
   tests, sch_guided_analytical_basic.cpp, fails), so plain __inline is used
   instead. */
template <typename T>
static __inline typename traits_t<T>::unsigned_t
__kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
                                typename traits_t<T>::unsigned_t idx) {
  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at least for
     ICL 8.1, long double arithmetic may not really have long double precision,
     even with /Qlong_double.  Currently, we workaround that in the caller code,
     by manipulating the FPCW for Windows* OS on IA-32 architecture.  The lack
     of precision is not expected to be a correctness issue, though. */
  typedef typename traits_t<T>::unsigned_t UT;

  long double x = tc * __kmp_pow<UT>(base, idx);
  UT r = (UT)x;
  if (x == r)
    return r;
  return r + 1;
}
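// Worked example (illustrative): with tc = 1000, base = 0.875 (nproc = 4, so
// base = 1 - 0.5/4) and idx = 3, x = 1000 * 0.875^3 = 669.921875 and the
// function returns ceil(x) = 670: the iterations still unassigned once three
// chunks have been handed out.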

// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
//   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// By default n = 2. For example, with n = 3 the chunk distribution will be
// flatter.
// With n = 1 the first chunk is the same as for a static schedule, i.e.
// trip / nproc.
static int guided_int_param = 2;
static double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
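// Worked example (illustrative, default n = 2): with nproc = 4 and chunk = 7,
//   p2 = 2 * 4 * (7 + 1) = 64    // switch to dynamic when <= 64 iters remain
//   p3 = 0.5 / 4        = 0.125  // each grab takes 12.5% of remaining iters
// (see the kmp_sch_guided_iterative_chunked case in __kmp_dispatch_init below)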

// UT - unsigned flavor of T, ST - signed flavor of T,
// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
template <typename T>
static void
__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
                    T ub, typename traits_t<T>::signed_t st,
                    typename traits_t<T>::signed_t chunk, int push_ws) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::floating_t DBL;

  int active;
  T tc;
  kmp_info_t *th;
  kmp_team_t *team;
  kmp_uint32 my_buffer_index;
  dispatch_private_info_template<T> *pr;
  dispatch_shared_info_template<UT> volatile *sh;

  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
                   sizeof(dispatch_private_info));
  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
                   sizeof(dispatch_shared_info));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_INIT();
#endif
#ifdef KMP_DEBUG
  {
    const char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
                            "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
                            traits_t<ST>::spec, traits_t<T>::spec,
                            traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;
  th->th.th_ident = loc;

#if USE_ITT_BUILD
  kmp_uint64 cur_chunk = chunk;
  int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
                                    __kmp_forkjoin_frames_mode == 3 &&
                                    KMP_MASTER_GTID(gtid) &&
#if OMP_40_ENABLED
                                    th->th.th_teams_microtask == NULL &&
#endif
                                    team->t.t_active_level == 1;
#endif
  if (!active) {
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
  } else {
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    my_buffer_index = th->th.th_dispatch->th_disp_index++;

    /* What happens when the number of threads changes? Do we need to resize
       the buffer? */
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        &th->th.th_dispatch
             ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    sh = reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  }
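  // Note (assumption, stated for illustration): __kmp_dispatch_num_buffers
  // defaults to 7 in this runtime, so e.g. my_buffer_index = 9 reuses slot
  // 9 % 7 = 2; the __kmp_wait_yield() on sh->buffer_index near the end of
  // this routine keeps a slot from being recycled before everyone is done
  // with it.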

#if (KMP_STATIC_STEAL_ENABLED)
  if (SCHEDULE_HAS_NONMONOTONIC(schedule))
    // AC: we now have only one implementation of stealing, so use it
    schedule = kmp_sch_static_steal;
  else
#endif
    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

  /* Pick up the nomerge/ordered bits from the scheduling type */
  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
    pr->nomerge = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
  } else {
    pr->nomerge = FALSE;
  }
  pr->type_size = traits_t<T>::type_size; // remember the size of variables
  if (kmp_ord_lower & schedule) {
    pr->ordered = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
  } else {
    pr->ordered = FALSE;
  }

  if (schedule == kmp_sch_static) {
    schedule = __kmp_static;
  } else {
    if (schedule == kmp_sch_runtime) {
      // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
      // not specified)
      schedule = team->t.t_sched.r_sched_type;
      // Detail the schedule if needed (global controls are differentiated
      // appropriately)
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      } else if (schedule == kmp_sch_static) {
        schedule = __kmp_static;
      }
      // Use the chunk size specified by OMP_SCHEDULE (or default if not
      // specified)
      chunk = team->t.t_sched.chunk;
#if USE_ITT_BUILD
      cur_chunk = chunk;
#endif
#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    } else {
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      }
      if (chunk <= 0) {
        chunk = KMP_DEFAULT_CHUNK;
      }
    }

    if (schedule == kmp_sch_auto) {
      // mapping and differentiation: in the __kmp_do_serial_initialize()
      schedule = __kmp_auto;
#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_init: kmp_sch_auto: T#%%d new: "
                                "schedule:%%d chunk:%%%s\n",
                                traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }

    /* guided analytical not safe for too many threads */
    if (schedule == kmp_sch_guided_analytical_chunked &&
        th->th.th_team_nproc > 1 << 20) {
      schedule = kmp_sch_guided_iterative_chunked;
      KMP_WARNING(DispatchManyThreads);
    }
    pr->u.p.parm1 = chunk;
  }
  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
              "unknown scheduling type");

  pr->u.p.count = 0;

  if (__kmp_env_consistency_check) {
    if (st == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
                            (pr->ordered ? ct_pdo_ordered : ct_pdo), loc);
    }
  }
  // compute trip count
  if (st == 1) { // most common case
    if (ub >= lb) {
      tc = ub - lb + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  } else if (st < 0) {
    if (lb >= ub) {
      // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(lb - ub) / (-st) + 1;
    } else { // lb < ub
      tc = 0; // zero-trip
    }
  } else { // st > 0
    if (ub >= lb) {
      // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(ub - lb) / st + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  }
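  // Worked example (illustrative): for a loop with lb = 0, ub = 10, st = 3,
  // the iterations are 0, 3, 6, 9, and indeed tc = (10 - 0) / 3 + 1 = 4
  // (integer division discards the remainder of 1).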

  // Any half-decent optimizer will remove this test when the blocks are empty
  // since the macros expand to nothing when statistics are disabled.
  if (schedule == __kmp_static) {
    KMP_COUNT_BLOCK(OMP_FOR_static);
    KMP_COUNT_VALUE(FOR_static_iterations, tc);
  } else {
    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
    KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
  }

  pr->u.p.lb = lb;
  pr->u.p.ub = ub;
  pr->u.p.st = st;
  pr->u.p.tc = tc;

#if KMP_OS_WINDOWS
  pr->u.p.last_upper = ub + st;
#endif /* KMP_OS_WINDOWS */

  /* NOTE: only the active parallel region(s) have active ordered sections */

  if (active) {
    if (pr->ordered == 0) {
      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
    } else {
      pr->ordered_bumped = 0;

      pr->u.p.ordered_lower = 1;
      pr->u.p.ordered_upper = 0;

      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
    }
  }

  if (__kmp_env_consistency_check) {
    enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
    if (push_ws) {
      __kmp_push_workshare(gtid, ws, loc);
      pr->pushed_ws = ws;
    } else {
      __kmp_check_workshare(gtid, ws, loc);
      pr->pushed_ws = ct_none;
    }
  }

  switch (schedule) {
#if (KMP_STATIC_STEAL_ENABLED)
  case kmp_sch_static_steal: {
    T nproc = th->th.th_team_nproc;
    T ntc, init;

    KD_TRACE(100,
             ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid));

    ntc = (tc % chunk ? 1 : 0) + tc / chunk;
    if (nproc > 1 && ntc >= nproc) {
      KMP_COUNT_BLOCK(OMP_FOR_static_steal);
      T id = __kmp_tid_from_gtid(gtid);
      T small_chunk, extras;

      small_chunk = ntc / nproc;
      extras = ntc % nproc;

      init = id * small_chunk + (id < extras ? id : extras);
      pr->u.p.count = init;
      pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
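      // Worked example (illustrative): ntc = 10 chunks, nproc = 4 gives
      // small_chunk = 2, extras = 2, so the chunk indexes are split as
      //   id 0: [0,3)  id 1: [3,6)  id 2: [6,8)  id 3: [8,10)
      // i.e. the first 'extras' threads own one extra chunk each.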

      pr->u.p.parm2 = lb;
      // pr->pfields.parm3 = 0; // it's not used in static_steal
      pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
      pr->u.p.st = st;
      if (traits_t<T>::type_size > 4) {
        // AC: TODO: check if 16-byte CAS available and use it to
        // improve performance (probably wait for explicit request
        // before spending time on this).
        // For now use dynamically allocated per-thread lock,
        // free memory in __kmp_dispatch_next when status==0.
        KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
        th->th.th_dispatch->th_steal_lock =
            (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
        __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
      }
      break;
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
                     "kmp_sch_static_balanced\n",
                     gtid));
      schedule = kmp_sch_static_balanced;
      /* too few iterations: fall-through to kmp_sch_static_balanced */
    } // if
    /* FALL-THROUGH to static balanced */
  } // case
#endif
  case kmp_sch_static_balanced: {
    T nproc = th->th.th_team_nproc;
    T init, limit;

    KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
                   gtid));

    if (nproc > 1) {
      T id = __kmp_tid_from_gtid(gtid);

      if (tc < nproc) {
        if (id < tc) {
          init = id;
          limit = id;
          pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
        } else {
          pr->u.p.count = 1; /* means no more chunks to execute */
          pr->u.p.parm1 = FALSE;
          break;
        }
      } else {
        T small_chunk = tc / nproc;
        T extras = tc % nproc;
        init = id * small_chunk + (id < extras ? id : extras);
        limit = init + small_chunk - (id < extras ? 0 : 1);
        pr->u.p.parm1 = (id == nproc - 1);
      }
    } else {
      if (tc > 0) {
        init = 0;
        limit = tc - 1;
        pr->u.p.parm1 = TRUE;
      } else { // zero trip count
        pr->u.p.count = 1; /* means no more chunks to execute */
        pr->u.p.parm1 = FALSE;
        break;
      }
    }
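    // Worked example (illustrative): tc = 10 iterations, nproc = 4 gives
    // small_chunk = 2, extras = 2; threads get [0,2], [3,5], [6,7], [8,9]
    // (init..limit, inclusive), and only thread nproc-1, which owns the
    // last iteration, sets parm1, the *plastiter flag.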
#if USE_ITT_BUILD
    // Calculate chunk for metadata report
    if (itt_need_metadata_reporting)
      cur_chunk = limit - init + 1;
#endif
    if (st == 1) {
      pr->u.p.lb = lb + init;
      pr->u.p.ub = lb + limit;
    } else {
      // calculated upper bound, "ub" is user-defined upper bound
      T ub_tmp = lb + limit * st;
      pr->u.p.lb = lb + init * st;
      // adjust upper bound to "ub" if needed, so that MS lastprivate will match
      // it exactly
      if (st > 0) {
        pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
      } else {
        pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
      }
    }
    if (pr->ordered) {
      pr->u.p.ordered_lower = init;
      pr->u.p.ordered_upper = limit;
    }
    break;
  } // case
  case kmp_sch_guided_iterative_chunked: {
    T nproc = th->th.th_team_nproc;
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked"
                   " case\n",
                   gtid));

    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
      } else {
        // when remaining iters become less than parm2 - switch to dynamic
        pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
        *(double *)&pr->u.p.parm3 =
            guided_flt_param / nproc; // may occupy parm3 and parm4
      }
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",
                     gtid));
      pr->u.p.parm1 = tc;
    } // if
  } // case
  break;
  case kmp_sch_guided_analytical_chunked: {
    T nproc = th->th.th_team_nproc;
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked"
                   " case\n",
                   gtid));
    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
      } else {
        /* commonly used term: (2 nproc - 1)/(2 nproc) */
        DBL x;

#if KMP_OS_WINDOWS && KMP_ARCH_X86
        /* Linux* OS already has 64-bit computation by default for long double,
           and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
           Windows* OS on IA-32 architecture, we need to set precision to 64-bit
           instead of the default 53-bit. Even though long double doesn't work
           on Windows* OS on Intel(R) 64, the resulting lack of precision is not
           expected to impact the correctness of the algorithm, but this has not
           been mathematically proven. */
        // save original FPCW and set precision to 64-bit, as
        // Windows* OS on IA-32 architecture defaults to 53-bit
        unsigned int oldFpcw = _control87(0, 0);
        _control87(_PC_64, _MCW_PC); // 0,0x30000
#endif
        /* value used for comparison in solver for cross-over point */
        long double target = ((long double)chunk * 2 + 1) * nproc / tc;

        /* crossover point--chunk indexes equal to or greater than
           this point switch to dynamic-style scheduling */
        UT cross;

        /* commonly used term: (2 nproc - 1)/(2 nproc) */
        x = (long double)1.0 - (long double)0.5 / nproc;

#ifdef KMP_DEBUG
        { // test natural alignment
          struct _test_a {
            char a;
            union {
              char b;
              DBL d;
            };
          } t;
          ptrdiff_t natural_alignment =
              (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
          //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
          // long)natural_alignment );
          KMP_DEBUG_ASSERT(
              (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
        }
#endif // KMP_DEBUG

        /* save the term in thread private dispatch structure */
        *(DBL *)&pr->u.p.parm3 = x;

        /* solve for the crossover point to the nearest integer i for which C_i
           <= chunk */
        {
          UT left, right, mid;
          long double p;

          /* estimate initial upper and lower bound */

          /* doesn't matter what value right is as long as it is positive, but
             it affects performance of the solver */
          right = 229;
          p = __kmp_pow<UT>(x, right);
          if (p > target) {
            do {
              p *= p;
              right <<= 1;
            } while (p > target && right < (1 << 27));
            /* lower bound is previous (failed) estimate of upper bound */
            left = right >> 1;
          } else {
            left = 0;
          }

          /* bisection root-finding method */
          while (left + 1 < right) {
            mid = (left + right) / 2;
            if (__kmp_pow<UT>(x, mid) > target) {
              left = mid;
            } else {
              right = mid;
            }
          } // while
          cross = right;
        }
        /* assert sanity of computed crossover point */
        KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
                   __kmp_pow<UT>(x, cross) <= target);
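        /* Worked example (illustrative): nproc = 4 gives x = 0.875; with
           chunk = 7 and tc = 1000, target = (2*7 + 1) * 4 / 1000 = 0.06.
           Since 0.875^21 ~ 0.0606 > 0.06 >= 0.875^22 ~ 0.0530, the bisection
           settles on cross = 22: chunk index 22 onward uses dynamic-style
           scheduling. */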

        /* save the crossover point in thread private dispatch structure */
        pr->u.p.parm2 = cross;

// C75803
#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
#else
#define GUIDED_ANALYTICAL_WORKAROUND (x)
#endif
        /* dynamic-style scheduling offset */
        pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
                                 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
                        cross * chunk;
#if KMP_OS_WINDOWS && KMP_ARCH_X86
        // restore FPCW
        _control87(oldFpcw, _MCW_PC);
#endif
      } // if
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
      pr->u.p.parm1 = tc;
    } // if
  } // case
  break;
  case kmp_sch_static_greedy:
    KD_TRACE(100,
             ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n", gtid));
    pr->u.p.parm1 = (th->th.th_team_nproc > 1)
                        ? (tc + th->th.th_team_nproc - 1) / th->th.th_team_nproc
                        : tc;
    break;
  case kmp_sch_static_chunked:
  case kmp_sch_dynamic_chunked:
    if (pr->u.p.parm1 <= 0) {
      pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
    }
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d "
                   "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
                   gtid));
    break;
  case kmp_sch_trapezoidal: {
    /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */

    T parm1, parm2, parm3, parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid));

    parm1 = chunk;

    /* F : size of the first cycle */
    parm2 = (tc / (2 * th->th.th_team_nproc));

    if (parm2 < 1) {
      parm2 = 1;
    }

    /* L : size of the last cycle.  Make sure the last cycle is not larger
       than the first cycle. */
    if (parm1 < 1) {
      parm1 = 1;
    } else if (parm1 > parm2) {
      parm1 = parm2;
    }

    /* N : number of cycles */
    parm3 = (parm2 + parm1);
    parm3 = (2 * tc + parm3 - 1) / parm3;

    if (parm3 < 2) {
      parm3 = 2;
    }

    /* sigma : decreasing incr of the trapezoid */
    parm4 = (parm3 - 1);
    parm4 = (parm2 - parm1) / parm4;
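    /* Worked example (illustrative): tc = 1000, nproc = 4, chunk = 1 gives
       F = parm2 = 1000 / 8 = 125, L = parm1 = 1, N = parm3 = 2125 / 126 = 16
       cycles and sigma = parm4 = 124 / 15 = 8, i.e. chunk sizes 125, 117,
       109, ... covering 16 * 125 - 8 * (0 + 1 + ... + 15) = 1040 >= 1000
       iterations. */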

    // pointless check, because parm4 >= 0 always
    // if ( parm4 < 0 ) {
    //    parm4 = 0;
    //}

    pr->u.p.parm1 = parm1;
    pr->u.p.parm2 = parm2;
    pr->u.p.parm3 = parm3;
    pr->u.p.parm4 = parm4;
  } // case
  break;

  default: {
    __kmp_msg(kmp_ms_fatal, // Severity
              KMP_MSG(UnknownSchedTypeDetected), // Primary message
              KMP_HNT(GetNewerLibrary), // Hint
              __kmp_msg_null // Variadic argument list terminator
              );
  } break;
  } // switch
  pr->schedule = schedule;
  if (active) {
    /* The name of this buffer should be my_buffer_index when it's free to use
     * it */

    KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));
    __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index,
                                 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    // Note: KMP_WAIT_YIELD() cannot be used here: buffer index and
    // my_buffer_index are *always* 32-bit integers.
    KMP_MB(); /* is this necessary? */
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));

    th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
    th->th.th_dispatch->th_dispatch_sh_current = (dispatch_shared_info_t *)sh;
#if USE_ITT_BUILD
    if (pr->ordered) {
      __kmp_itt_ordered_init(gtid);
    } // if
    // Report loop metadata
    if (itt_need_metadata_reporting) {
      // Only report metadata by master of active team at level 1
      kmp_uint64 schedtype = 0;
      switch (schedule) {
      case kmp_sch_static_chunked:
      case kmp_sch_static_balanced: // Chunk is calculated in the switch above
        break;
      case kmp_sch_static_greedy:
        cur_chunk = pr->u.p.parm1;
        break;
      case kmp_sch_dynamic_chunked:
        schedtype = 1;
        break;
      case kmp_sch_guided_iterative_chunked:
      case kmp_sch_guided_analytical_chunked:
        schedtype = 2;
        break;
      default:
        // Should we put this case under "static"?
        // case kmp_sch_static_steal:
        schedtype = 3;
        break;
      }
      __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
    }
#endif /* USE_ITT_BUILD */
  } // if

#ifdef KMP_DEBUG
  {
    const char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
        "lb:%%%s ub:%%%s"
        " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
        " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
        traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
        traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
        traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
        traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
                  pr->u.p.st, pr->u.p.tc, pr->u.p.count, pr->u.p.ordered_lower,
                  pr->u.p.ordered_upper, pr->u.p.parm1, pr->u.p.parm2,
                  pr->u.p.parm3, pr->u.p.parm4));
    __kmp_str_free(&buff);
  }
#endif
#if (KMP_STATIC_STEAL_ENABLED)
  // It cannot be guaranteed that after execution of a loop with some other
  // schedule kind all the parm3 variables will contain the same value. Even if
  // all parm3 values were the same, there would still be a bad case, such as
  // using 0 and 1 rather than a program life-time increment. So a dedicated
  // variable is required; 'static_steal_counter' is used.
  if (schedule == kmp_sch_static_steal) {
    // Other threads will inspect this variable when searching for a victim.
    // This is a flag showing that other threads may steal from this thread
    // since then.
    volatile T *p = &pr->u.p.static_steal_counter;
    *p = *p + 1;
  }
#endif // ( KMP_STATIC_STEAL_ENABLED )

#if OMPT_SUPPORT && OMPT_TRACE
  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
    ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
        team_info->parallel_id, task_info->task_id, team_info->microtask);
  }
#endif
}

/* For ordered loops, either __kmp_dispatch_finish() should be called after
 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
 * every chunk of iterations.  If the ordered section(s) were not executed
 * for this iteration (or every iteration in this chunk), we need to set the
 * ordered iteration counters so that the next thread can proceed. */
template <typename UT>
static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {

    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    if (pr->ordered_bumped) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      UT lower = pr->u.p.ordered_lower;

#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
                           __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
      KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
    } // if
  } // if
  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
}

#ifdef KMP_GOMP_COMPAT

template <typename UT>
static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {
    //        int cid;
    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
    UT lower = pr->u.p.ordered_lower;
    UT upper = pr->u.p.ordered_upper;
    UT inc = upper - lower + 1;

    if (pr->ordered_bumped == inc) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      inc -= pr->ordered_bumped;

#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d before wait: "
            "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
                           __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));

      KMP_MB(); /* is this necessary? */
      KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
                      "ordered_bumped to zero\n",
                      gtid));
      pr->ordered_bumped = 0;
//!!!!! TODO check if the inc should be unsigned, or signed???
#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d after wait: "
            "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
            traits_t<UT>::spec);
        KD_TRACE(1000,
                 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
    }
    //        }
  }
  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
}

#endif /* KMP_GOMP_COMPAT */

/* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
   work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
   is not called. */
#if OMPT_SUPPORT && OMPT_TRACE
#define OMPT_LOOP_END                                                          \
  if (status == 0) {                                                           \
    if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_end)) {   \
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
      ompt_task_info_t *task_info = __ompt_get_taskinfo(0);                    \
      ompt_callbacks.ompt_callback(ompt_event_loop_end)(                       \
          team_info->parallel_id, task_info->task_id);                         \
    }                                                                          \
  }
#else
#define OMPT_LOOP_END // no-op
#endif

template <typename T>
static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
                               T *p_lb, T *p_ub,
                               typename traits_t<T>::signed_t *p_st) {

  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::floating_t DBL;

  // This is potentially slightly misleading, schedule(runtime) will appear here
  // even if the actual runtime schedule is static. (Which points out a
  // disadvantage of schedule(runtime): even when static scheduling is used it
  // costs more than a compile time choice to use static scheduling would.)
  KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling);

  int status;
  dispatch_private_info_template<T> *pr;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
#ifdef KMP_DEBUG
  {
    const char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_next: T#%%d called p_lb:%%%s "
                            "p_ub:%%%s p_st:%%%s p_last: %%p\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec);
    KD_TRACE(1000, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last));
    __kmp_str_free(&buff);
  }
#endif

  if (team->t.t_serialized) {
    /* NOTE: serialize this dispatch because we are not at the active level */
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
    KMP_DEBUG_ASSERT(pr);

    if ((status = (pr->u.p.tc != 0)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      //            if ( p_last != NULL )
      //                *p_last = 0;
      if (p_st != NULL)
        *p_st = 0;
      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        }
      }
    } else if (pr->nomerge) {
      kmp_int32 last;
      T start;
      UT limit, trip, init;
      ST incr;
      T chunk = pr->u.p.parm1;

      KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
                     gtid));

      init = chunk * pr->u.p.count++;
      trip = pr->u.p.tc - 1;

      if ((status = (init <= trip)) == 0) {
        *p_lb = 0;
        *p_ub = 0;
        //                if ( p_last != NULL )
        //                    *p_last = 0;
        if (p_st != NULL)
          *p_st = 0;
        if (__kmp_env_consistency_check) {
          if (pr->pushed_ws != ct_none) {
            pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
          }
        }
      } else {
        start = pr->u.p.lb;
        limit = chunk + init - 1;
        incr = pr->u.p.st;

        if ((last = (limit >= trip)) != 0) {
          limit = trip;
#if KMP_OS_WINDOWS
          pr->u.p.last_upper = pr->u.p.ub;
#endif /* KMP_OS_WINDOWS */
        }
        if (p_last != NULL)
          *p_last = last;
        if (p_st != NULL)
          *p_st = incr;
        if (incr == 1) {
          *p_lb = start + init;
          *p_ub = start + limit;
        } else {
          *p_lb = start + init * incr;
          *p_ub = start + limit * incr;
        }
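        // Worked example (illustrative): with start = 0, incr = 1 and
        // chunk = 4, the third call (count = 2 on entry) gets init = 8 and
        // limit = 11, so *p_lb = 8 and *p_ub = 11; status stays 1 while
        // init <= trip.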
1464 
1465         if (pr->ordered) {
1466           pr->u.p.ordered_lower = init;
1467           pr->u.p.ordered_upper = limit;
1468 #ifdef KMP_DEBUG
1469           {
1470             const char *buff;
1471             // create format specifiers before the debug output
1472             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1473                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
1474                                     traits_t<UT>::spec, traits_t<UT>::spec);
1475             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1476                             pr->u.p.ordered_upper));
1477             __kmp_str_free(&buff);
1478           }
1479 #endif
1480         } // if
1481       } // if
1482     } else {
1483       pr->u.p.tc = 0;
1484       *p_lb = pr->u.p.lb;
1485       *p_ub = pr->u.p.ub;
1486 #if KMP_OS_WINDOWS
1487       pr->u.p.last_upper = *p_ub;
1488 #endif /* KMP_OS_WINDOWS */
1489       if (p_last != NULL)
1490         *p_last = TRUE;
1491       if (p_st != NULL)
1492         *p_st = pr->u.p.st;
1493     } // if
1494 #ifdef KMP_DEBUG
1495     {
1496       const char *buff;
1497       // create format specifiers before the debug output
1498       buff = __kmp_str_format(
1499           "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
1500           "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
1501           traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1502       KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
1503       __kmp_str_free(&buff);
1504     }
1505 #endif
1506 #if INCLUDE_SSC_MARKS
1507     SSC_MARK_DISPATCH_NEXT();
1508 #endif
1509     OMPT_LOOP_END;
1510     return status;
1511   } else {
1512     kmp_int32 last = 0;
1513     dispatch_shared_info_template<UT> *sh;
1514     T start;
1515     ST incr;
1516     UT limit, trip, init;
1517 
1518     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1519                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1520 
1521     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1522         th->th.th_dispatch->th_dispatch_pr_current);
1523     KMP_DEBUG_ASSERT(pr);
1524     sh = reinterpret_cast<dispatch_shared_info_template<UT> *>(
1525         th->th.th_dispatch->th_dispatch_sh_current);
1526     KMP_DEBUG_ASSERT(sh);
1527 
1528     if (pr->u.p.tc == 0) {
1529       // zero trip count
1530       status = 0;
1531     } else {
1532       switch (pr->schedule) {
1533 #if (KMP_STATIC_STEAL_ENABLED)
1534       case kmp_sch_static_steal: {
1535         T chunk = pr->u.p.parm1;
1536         int nproc = th->th.th_team_nproc;
1537 
1538         KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n",
1539                        gtid));
1540 
1541         trip = pr->u.p.tc - 1;
1542 
1543         if (traits_t<T>::type_size > 4) {
1544           // use lock for 8-byte and CAS for 4-byte induction
1545           // variable. TODO (optional): check and use 16-byte CAS
          kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
          KMP_DEBUG_ASSERT(lck != NULL);
          if (pr->u.p.count < (UT)pr->u.p.ub) {
            __kmp_acquire_lock(lck, gtid);
            // try to get own chunk of iterations
            init = (pr->u.p.count)++;
            status = (init < (UT)pr->u.p.ub);
            __kmp_release_lock(lck, gtid);
          } else {
            status = 0; // no own chunks
          }
          if (!status) { // try to steal
            kmp_info_t **other_threads = team->t.t_threads;
            int while_limit = nproc; // nproc attempts to find a victim
            int while_index = 0;
            // TODO: algorithm of searching for a victim
            // should be cleaned up and measured
            while ((!status) && (while_limit != ++while_index)) {
              T remaining;
              T victimIdx = pr->u.p.parm4;
              T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
              dispatch_private_info_template<T> *victim =
                  reinterpret_cast<dispatch_private_info_template<T> *>(
                      other_threads[victimIdx]
                          ->th.th_dispatch->th_dispatch_pr_current);
              while ((victim == NULL || victim == pr ||
                      (*(volatile T *)&victim->u.p.static_steal_counter !=
                       *(volatile T *)&pr->u.p.static_steal_counter)) &&
                     oldVictimIdx != victimIdx) {
                victimIdx = (victimIdx + 1) % nproc;
                victim = reinterpret_cast<dispatch_private_info_template<T> *>(
                    other_threads[victimIdx]
                        ->th.th_dispatch->th_dispatch_pr_current);
              };
              if (!victim ||
                  (*(volatile T *)&victim->u.p.static_steal_counter !=
                   *(volatile T *)&pr->u.p.static_steal_counter)) {
                continue; // try once more (nproc attempts in total)
                // no victim is ready yet to participate in stealing
                // because all victims are still in kmp_init_dispatch
              }
              if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
                pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
                continue; // not enough chunks to steal, goto next victim
              }

              lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
              KMP_ASSERT(lck != NULL);
              __kmp_acquire_lock(lck, gtid);
              limit = victim->u.p.ub; // keep initial ub
              if (victim->u.p.count >= limit ||
                  (remaining = limit - victim->u.p.count) < 2) {
                __kmp_release_lock(lck, gtid);
                pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
                continue; // not enough chunks to steal
              }
              // stealing succeeded, reduce victim's ub by 1/4 of undone chunks
              // or by 1
              if (remaining > 3) {
                KMP_COUNT_VALUE(FOR_static_steal_stolen, remaining >> 2);
                init = (victim->u.p.ub -=
                        (remaining >> 2)); // steal 1/4 of remaining
              } else {
                KMP_COUNT_VALUE(FOR_static_steal_stolen, 1);
                init =
                    (victim->u.p.ub -= 1); // steal 1 chunk of 2 or 3 remaining
              }
              __kmp_release_lock(lck, gtid);

              KMP_DEBUG_ASSERT(init + 1 <= limit);
              pr->u.p.parm4 = victimIdx; // remember victim to steal from
              status = 1;
              while_index = 0;
              // now update own count and ub with the stolen range, excluding
              // the init chunk (which this call returns)
              __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
              pr->u.p.count = init + 1;
              pr->u.p.ub = limit;
              __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
            } // while (search for victim)
          } // if (try to find victim and steal)
        } else {
          // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
          typedef union {
            struct {
              UT count;
              T ub;
            } p;
            kmp_int64 b;
          } union_i4;
          // All operations on 'count' or 'ub' must be combined atomically
          // together.
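          // For illustration only (hypothetical atomic_load_64/atomic_cas_64
          // helpers, not runtime APIs): claiming one chunk below amounts to
          //
          //   union_i4 seen, want;
          //   do {
          //     seen.b = atomic_load_64(&pr->u.p.count); // read pair as one
          //     want = seen;
          //     want.p.count++; // claim chunk index seen.p.count
          //   } while (!atomic_cas_64(&pr->u.p.count, seen.b, want.b));
          //
          // 'init' is then taken from the pre-increment count and 'status'
          // checks it against the pair's ub.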
          {
            union_i4 vold, vnew;
            vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
            vnew = vold;
            vnew.p.count++;
            while (!KMP_COMPARE_AND_STORE_ACQ64(
                (volatile kmp_int64 *)&pr->u.p.count,
                *VOLATILE_CAST(kmp_int64 *) & vold.b,
                *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
              KMP_CPU_PAUSE();
              vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
              vnew = vold;
              vnew.p.count++;
            }
            vnew = vold;
            init = vnew.p.count;
            status = (init < (UT)vnew.p.ub);
          }

          if (!status) {
            kmp_info_t **other_threads = team->t.t_threads;
            int while_limit = nproc; // nproc attempts to find a victim
            int while_index = 0;

            // TODO: algorithm of searching for a victim
            // should be cleaned up and measured
            while ((!status) && (while_limit != ++while_index)) {
              union_i4 vold, vnew;
              kmp_int32 remaining;
              T victimIdx = pr->u.p.parm4;
              T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
              dispatch_private_info_template<T> *victim =
                  reinterpret_cast<dispatch_private_info_template<T> *>(
                      other_threads[victimIdx]
                          ->th.th_dispatch->th_dispatch_pr_current);
              while ((victim == NULL || victim == pr ||
                      (*(volatile T *)&victim->u.p.static_steal_counter !=
                       *(volatile T *)&pr->u.p.static_steal_counter)) &&
                     oldVictimIdx != victimIdx) {
                victimIdx = (victimIdx + 1) % nproc;
                victim = reinterpret_cast<dispatch_private_info_template<T> *>(
                    other_threads[victimIdx]
                        ->th.th_dispatch->th_dispatch_pr_current);
              };
              if (!victim ||
                  (*(volatile T *)&victim->u.p.static_steal_counter !=
                   *(volatile T *)&pr->u.p.static_steal_counter)) {
                continue; // try once more (nproc attempts in total)
                // no victim is ready yet to participate in stealing
                // because all victims are still in kmp_init_dispatch
              }
              pr->u.p.parm4 = victimIdx; // new victim found
              while (1) { // CAS loop if victim has enough chunks to steal
                vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
                vnew = vold;

                KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
                if (vnew.p.count >= (UT)vnew.p.ub ||
                    (remaining = vnew.p.ub - vnew.p.count) < 2) {
                  pr->u.p.parm4 =
                      (victimIdx + 1) % nproc; // shift start victim id
                  break; // not enough chunks to steal, goto next victim
                }
                if (remaining > 3) {
                  vnew.p.ub -= (remaining >> 2); // try to steal 1/4 remaining
                } else {
                  vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
                }
                KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
                // TODO: Should this be acquire or release?
                if (KMP_COMPARE_AND_STORE_ACQ64(
                        (volatile kmp_int64 *)&victim->u.p.count,
                        *VOLATILE_CAST(kmp_int64 *) & vold.b,
                        *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
                  // stealing succeeded
                  KMP_COUNT_VALUE(FOR_static_steal_stolen,
                                  vold.p.ub - vnew.p.ub);
                  status = 1;
                  while_index = 0;
                  // now update own count and ub
                  init = vnew.p.ub;
                  vold.p.count = init + 1;
#if KMP_ARCH_X86
                  KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count),
                                   vold.b);
#else
                  *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
#endif
                  break;
                } // if (check CAS result)
                KMP_CPU_PAUSE(); // CAS failed, repeat attempt
              } // while (try to steal from particular victim)
            } // while (search for victim)
          } // if (try to find victim and steal)
        } // if (4-byte induction variable)
        if (!status) {
          *p_lb = 0;
          *p_ub = 0;
          if (p_st != NULL)
            *p_st = 0;
        } else {
          start = pr->u.p.parm2;
          init *= chunk;
          limit = chunk + init - 1;
          incr = pr->u.p.st;
          KMP_COUNT_VALUE(FOR_static_steal_chunks, 1);

          KMP_DEBUG_ASSERT(init <= trip);
          if ((last = (limit >= trip)) != 0)
            limit = trip;
          if (p_st != NULL)
            *p_st = incr;

          if (incr == 1) {
            *p_lb = start + init;
            *p_ub = start + limit;
          } else {
            *p_lb = start + init * incr;
            *p_ub = start + limit * incr;
          }

          if (pr->ordered) {
            pr->u.p.ordered_lower = init;
            pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
            {
              const char *buff;
              // create format specifiers before the debug output
              buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
                                      "ordered_lower:%%%s ordered_upper:%%%s\n",
                                      traits_t<UT>::spec, traits_t<UT>::spec);
              KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
                              pr->u.p.ordered_upper));
              __kmp_str_free(&buff);
            }
#endif
          } // if
        } // if
        break;
      } // case
#endif // ( KMP_STATIC_STEAL_ENABLED )
      case kmp_sch_static_balanced: {
        KD_TRACE(
            100,
            ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid));
        if ((status = !pr->u.p.count) !=
            0) { /* check if thread has any iteration to do */
          pr->u.p.count = 1;
          *p_lb = pr->u.p.lb;
          *p_ub = pr->u.p.ub;
          last = pr->u.p.parm1;
          if (p_st != NULL)
            *p_st = pr->u.p.st;
        } else { /* no iterations to do */
          pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
        }
        if (pr->ordered) {
#ifdef KMP_DEBUG
          {
            const char *buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
                                    "ordered_lower:%%%s ordered_upper:%%%s\n",
                                    traits_t<UT>::spec, traits_t<UT>::spec);
            KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
                            pr->u.p.ordered_upper));
            __kmp_str_free(&buff);
          }
#endif
        } // if
      } // case
      break;
      case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
                                     merged here */
      case kmp_sch_static_chunked: {
        T parm1;

        KD_TRACE(100, ("__kmp_dispatch_next: T#%d "
                       "kmp_sch_static_[affinity|chunked] case\n",
                       gtid));
        parm1 = pr->u.p.parm1;

        trip = pr->u.p.tc - 1;
        init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
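        // Chunks are handed out round-robin: thread 'tid' takes chunk indices
        // tid, tid + nproc, tid + 2*nproc, ... ('count' is advanced by nproc
        // below), and chunk index count+tid starts at iteration
        // parm1 * (count + tid).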

        if ((status = (init <= trip)) != 0) {
          start = pr->u.p.lb;
          incr = pr->u.p.st;
          limit = parm1 + init - 1;

          if ((last = (limit >= trip)) != 0)
            limit = trip;

          if (p_st != NULL)
            *p_st = incr;

          pr->u.p.count += th->th.th_team_nproc;

          if (incr == 1) {
            *p_lb = start + init;
            *p_ub = start + limit;
          } else {
            *p_lb = start + init * incr;
            *p_ub = start + limit * incr;
          }

          if (pr->ordered) {
            pr->u.p.ordered_lower = init;
            pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
            {
              const char *buff;
              // create format specifiers before the debug output
              buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
                                      "ordered_lower:%%%s ordered_upper:%%%s\n",
                                      traits_t<UT>::spec, traits_t<UT>::spec);
              KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
                              pr->u.p.ordered_upper));
              __kmp_str_free(&buff);
            }
#endif
          } // if
        } // if
      } // case
      break;

      case kmp_sch_dynamic_chunked: {
        T chunk = pr->u.p.parm1;

        KD_TRACE(
            100,
            ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid));

        init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
        trip = pr->u.p.tc - 1;
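        // Each thread atomically claims the next chunk index from the shared
        // counter; chunk k covers normalized iterations
        // [k*chunk, k*chunk + chunk - 1], clipped to the trip count below.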

        if ((status = (init <= trip)) == 0) {
          *p_lb = 0;
          *p_ub = 0;
          if (p_st != NULL)
            *p_st = 0;
        } else {
          start = pr->u.p.lb;
          limit = chunk + init - 1;
          incr = pr->u.p.st;

          if ((last = (limit >= trip)) != 0)
            limit = trip;

          if (p_st != NULL)
            *p_st = incr;

          if (incr == 1) {
            *p_lb = start + init;
            *p_ub = start + limit;
          } else {
            *p_lb = start + init * incr;
            *p_ub = start + limit * incr;
          }

          if (pr->ordered) {
            pr->u.p.ordered_lower = init;
            pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
            {
              const char *buff;
              // create format specifiers before the debug output
              buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
                                      "ordered_lower:%%%s ordered_upper:%%%s\n",
                                      traits_t<UT>::spec, traits_t<UT>::spec);
              KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
                              pr->u.p.ordered_upper));
              __kmp_str_free(&buff);
            }
#endif
          } // if
        } // if
      } // case
      break;

      case kmp_sch_guided_iterative_chunked: {
        T chunkspec = pr->u.p.parm1;
        KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked "
                       "iterative case\n",
                       gtid));
        trip = pr->u.p.tc;
        // Start atomic part of calculations
        while (1) {
          ST remaining; // signed, because can be < 0
          init = sh->u.s.iteration; // shared value
          remaining = trip - init;
          if (remaining <= 0) { // AC: need to compare with 0 first
            // nothing to do, don't try atomic op
            status = 0;
            break;
          }
          if ((T)remaining <
              pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
            // use dynamic-style schedule
            // atomically increment iterations, get old value
            init = test_then_add<ST>((ST *)&sh->u.s.iteration, (ST)chunkspec);
            remaining = trip - init;
            if (remaining <= 0) {
              status = 0; // all iterations got by other threads
            } else { // got some iterations to work on
              status = 1;
              if ((T)remaining > chunkspec) {
                limit = init + chunkspec - 1;
              } else {
                last = 1; // the last chunk
                limit = init + remaining - 1;
              } // if
            } // if
            break;
          } // if
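          // Guided part: try to grab a fraction of the remaining iterations.
          // parm3 holds the per-grab fraction as a double (per the comment
          // below, roughly 1/(K*nproc) with K=2 by default), so a single CAS
          // attempts to advance the shared counter by about
          // remaining/(K*nproc) iterations.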
          limit = init + (UT)(remaining *
                              *(double *)&pr->u.p.parm3); // divide by K*nproc
          if (compare_and_swap<ST>((ST *)&sh->u.s.iteration, (ST)init,
                                   (ST)limit)) {
            // CAS was successful, chunk obtained
            status = 1;
            --limit;
            break;
          } // if
        } // while
        if (status != 0) {
          start = pr->u.p.lb;
          incr = pr->u.p.st;
          if (p_st != NULL)
            *p_st = incr;
          *p_lb = start + init * incr;
          *p_ub = start + limit * incr;
          if (pr->ordered) {
            pr->u.p.ordered_lower = init;
            pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
            {
              const char *buff;
              // create format specifiers before the debug output
              buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
                                      "ordered_lower:%%%s ordered_upper:%%%s\n",
                                      traits_t<UT>::spec, traits_t<UT>::spec);
              KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
                              pr->u.p.ordered_upper));
              __kmp_str_free(&buff);
            }
#endif
          } // if
        } else {
          *p_lb = 0;
          *p_ub = 0;
          if (p_st != NULL)
            *p_st = 0;
        } // if
      } // case
      break;

      case kmp_sch_guided_analytical_chunked: {
        T chunkspec = pr->u.p.parm1;
        UT chunkIdx;
#if KMP_OS_WINDOWS && KMP_ARCH_X86
        /* for storing the original FPCW value for Windows* OS on IA-32
           architecture (8-byte template version) */
        unsigned int oldFpcw;
        unsigned int fpcwSet = 0;
#endif
        KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked "
                       "analytical case\n",
                       gtid));

        trip = pr->u.p.tc;

        KMP_DEBUG_ASSERT(th->th.th_team_nproc > 1);
        KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)th->th.th_team_nproc <
                         trip);

        while (1) { /* this while loop is a safeguard against unexpected zero
                       chunk sizes */
          chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
          if (chunkIdx >= (UT)pr->u.p.parm2) {
            --trip;
            /* use dynamic-style scheduling */
            init = chunkIdx * chunkspec + pr->u.p.count;
            /* need to verify init > 0 in case of overflow in the above
             * calculation */
            if ((status = (init > 0 && init <= trip)) != 0) {
              limit = init + chunkspec - 1;

              if ((last = (limit >= trip)) != 0)
                limit = trip;
            }
            break;
          } else {
/* use exponential-style scheduling */
/* The following check is to work around the lack of long double precision on
   Windows* OS.
   This check works around the possible effect that init != 0 for chunkIdx == 0.
 */
#if KMP_OS_WINDOWS && KMP_ARCH_X86
            /* If we haven't already done so, save original FPCW and set
               precision to 64-bit, as Windows* OS on IA-32 architecture
               defaults to 53-bit */
            if (!fpcwSet) {
              oldFpcw = _control87(0, 0);
              _control87(_PC_64, _MCW_PC);
              fpcwSet = 0x30000;
            }
#endif
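            /* Exponential scheme, the invariant used below:
               __kmp_dispatch_guided_remaining(trip, x, k) returns the number
               of iterations still remaining after the first k chunks under
               the guided model, so chunk 'chunkIdx' spans iterations
               [trip - remaining(chunkIdx), trip - remaining(chunkIdx+1) - 1],
               matching the init/limit computed next. */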
            if (chunkIdx) {
              init = __kmp_dispatch_guided_remaining<T>(
                  trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
              KMP_DEBUG_ASSERT(init);
              init = trip - init;
            } else
              init = 0;
            limit = trip - __kmp_dispatch_guided_remaining<T>(
                               trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
            KMP_ASSERT(init <= limit);
            if (init < limit) {
              KMP_DEBUG_ASSERT(limit <= trip);
              --limit;
              status = 1;
              break;
            } // if
          } // if
        } // while (1)
#if KMP_OS_WINDOWS && KMP_ARCH_X86
        /* restore FPCW if necessary.
           AC: check the fpcwSet flag first because oldFpcw can be
           uninitialized here */
        if (fpcwSet && (oldFpcw & fpcwSet))
          _control87(oldFpcw, _MCW_PC);
#endif
        if (status != 0) {
          start = pr->u.p.lb;
          incr = pr->u.p.st;
          if (p_st != NULL)
            *p_st = incr;
          *p_lb = start + init * incr;
          *p_ub = start + limit * incr;
          if (pr->ordered) {
            pr->u.p.ordered_lower = init;
            pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
            {
              const char *buff;
              // create format specifiers before the debug output
              buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
                                      "ordered_lower:%%%s ordered_upper:%%%s\n",
                                      traits_t<UT>::spec, traits_t<UT>::spec);
              KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
                              pr->u.p.ordered_upper));
              __kmp_str_free(&buff);
            }
#endif
          }
        } else {
          *p_lb = 0;
          *p_ub = 0;
          if (p_st != NULL)
            *p_st = 0;
        }
      } // case
      break;

      case kmp_sch_trapezoidal: {
        UT index;
        T parm2 = pr->u.p.parm2;
        T parm3 = pr->u.p.parm3;
        T parm4 = pr->u.p.parm4;
        KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
                       gtid));

        index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);

        init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
        trip = pr->u.p.tc - 1;
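        // The chunk sizes form a decreasing arithmetic series: chunk j has
        // parm2 - j*parm4 iterations, so chunk 'index' starts at iteration
        //   sum_{j=0}^{index-1} (parm2 - j*parm4)
        //     = index*(2*parm2 - (index-1)*parm4)/2,
        // which is the 'init' computed above; 'limit' below is the same sum
        // taken over index+1 chunks, minus one.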

        if ((status = ((T)index < parm3 && init <= trip)) == 0) {
          *p_lb = 0;
          *p_ub = 0;
          if (p_st != NULL)
            *p_st = 0;
        } else {
          start = pr->u.p.lb;
          limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
          incr = pr->u.p.st;

          if ((last = (limit >= trip)) != 0)
            limit = trip;

          if (p_st != NULL)
            *p_st = incr;

          if (incr == 1) {
            *p_lb = start + init;
            *p_ub = start + limit;
          } else {
            *p_lb = start + init * incr;
            *p_ub = start + limit * incr;
          }

          if (pr->ordered) {
            pr->u.p.ordered_lower = init;
            pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
            {
              const char *buff;
              // create format specifiers before the debug output
              buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
                                      "ordered_lower:%%%s ordered_upper:%%%s\n",
                                      traits_t<UT>::spec, traits_t<UT>::spec);
              KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
                              pr->u.p.ordered_upper));
              __kmp_str_free(&buff);
            }
#endif
          } // if
        } // if
      } // case
      break;
      default: {
        status = 0; // to avoid complaints on uninitialized variable use
        __kmp_msg(kmp_ms_fatal, // Severity
                  KMP_MSG(UnknownSchedTypeDetected), // Primary message
                  KMP_HNT(GetNewerLibrary), // Hint
                  __kmp_msg_null // Variadic argument list terminator
                  );
      } break;
      } // switch
    } // if tc == 0;

    if (status == 0) {
      UT num_done;

      num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
            traits_t<UT>::spec);
        KD_TRACE(100, (buff, gtid, sh->u.s.num_done));
        __kmp_str_free(&buff);
      }
#endif

      if ((ST)num_done == th->th.th_team_nproc - 1) {
#if (KMP_STATIC_STEAL_ENABLED)
        if (pr->schedule == kmp_sch_static_steal &&
            traits_t<T>::type_size > 4) {
          int i;
          kmp_info_t **other_threads = team->t.t_threads;
          // loop complete, safe to destroy locks used for stealing
          for (i = 0; i < th->th.th_team_nproc; ++i) {
            kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
            KMP_ASSERT(lck != NULL);
            __kmp_destroy_lock(lck);
            __kmp_free(lck);
            other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
          }
        }
#endif
        /* NOTE: release this buffer to be reused */

        KMP_MB(); /* Flush all pending memory write invalidates.  */

        sh->u.s.num_done = 0;
        sh->u.s.iteration = 0;

        /* TODO replace with general release procedure? */
        if (pr->ordered) {
          sh->u.s.ordered_iteration = 0;
        }

        KMP_MB(); /* Flush all pending memory write invalidates.  */

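        // Advance this shared buffer's index past the whole rotation window
        // (__kmp_dispatch_num_buffers) so it can be matched by a future loop
        // instance; the other buffers in the window may still be in use.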
        sh->buffer_index += __kmp_dispatch_num_buffers;
        KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
                       gtid, sh->buffer_index));

        KMP_MB(); /* Flush all pending memory write invalidates.  */

      } // if
      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        }
      }

      th->th.th_dispatch->th_deo_fcn = NULL;
      th->th.th_dispatch->th_dxo_fcn = NULL;
      th->th.th_dispatch->th_dispatch_sh_current = NULL;
      th->th.th_dispatch->th_dispatch_pr_current = NULL;
    } // if (status == 0)
#if KMP_OS_WINDOWS
    else if (last) {
      pr->u.p.last_upper = pr->u.p.ub;
    }
#endif /* KMP_OS_WINDOWS */
    if (p_last != NULL && status != 0)
      *p_last = last;
  } // if

#ifdef KMP_DEBUG
  {
    const char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_next: T#%%d normal case: "
        "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p  returning:%%d\n",
        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status));
    __kmp_str_free(&buff);
  }
#endif
#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_NEXT();
#endif
  OMPT_LOOP_END;
  return status;
}

template <typename T>
static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
                                  kmp_int32 *plastiter, T *plower, T *pupper,
                                  typename traits_t<T>::signed_t incr) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  register kmp_uint32 team_id;
  register kmp_uint32 nteams;
  register UT trip_count;
  register kmp_team_t *team;
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
#ifdef KMP_DEBUG
  {
    const char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
                            "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<T>::spec);
    KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
    __kmp_str_free(&buff);
  }
#endif

  if (__kmp_env_consistency_check) {
    if (incr == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
                            loc);
    }
    if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
      // The loop is illegal.
      // Some zero-trip loops are maintained by the compiler, e.g.:
      //   for(i=10;i<0;++i) // lower >= upper - run-time check
      //   for(i=0;i>10;--i) // lower <= upper - run-time check
      //   for(i=0;i>10;++i) // incr > 0       - compile-time check
      //   for(i=10;i<0;--i) // incr < 0       - compile-time check
      // The compiler does not check the following illegal loops:
      //   for(i=0;i<10;i+=incr) // where incr<0
      //   for(i=10;i>0;i-=incr) // where incr<0
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
    }
  }
  th = __kmp_threads[gtid];
  team = th->th.th_team;
#if OMP_40_ENABLED
  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
  nteams = th->th.th_teams_size.nteams;
#endif
  team_id = team->t.t_master_tid;
  KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);

  // compute global trip count
  if (incr == 1) {
    trip_count = *pupper - *plower + 1;
  } else if (incr == -1) {
    trip_count = *plower - *pupper + 1;
  } else if (incr > 0) {
    // upper-lower can exceed the limit of signed type
    trip_count = (UT)(*pupper - *plower) / incr + 1;
  } else {
    trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
  }
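  // Example: lb=0, ub=9, incr=2 gives trip_count = (9 - 0) / 2 + 1 = 5
  // iterations (0, 2, 4, 6, 8).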

  if (trip_count <= nteams) {
    KMP_DEBUG_ASSERT(
        __kmp_static == kmp_sch_static_greedy ||
        __kmp_static ==
            kmp_sch_static_balanced); // Unknown static scheduling type.
    // only some teams get a single iteration, the others get nothing
    if (team_id < trip_count) {
      *pupper = *plower = *plower + team_id * incr;
    } else {
      *plower = *pupper + incr; // zero-trip loop
    }
    if (plastiter != NULL)
      *plastiter = (team_id == trip_count - 1);
  } else {
    if (__kmp_static == kmp_sch_static_balanced) {
      register UT chunk = trip_count / nteams;
      register UT extras = trip_count % nteams;
      *plower +=
          incr * (team_id * chunk + (team_id < extras ? team_id : extras));
      *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
      if (plastiter != NULL)
        *plastiter = (team_id == nteams - 1);
    } else {
      register T chunk_inc_count =
          (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
      register T upper = *pupper;
      KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
      // Unknown static scheduling type.
      *plower += team_id * chunk_inc_count;
      *pupper = *plower + chunk_inc_count - incr;
      // Check/correct bounds if needed
      if (incr > 0) {
        if (*pupper < *plower)
          *pupper = traits_t<T>::max_value;
        if (plastiter != NULL)
          *plastiter = *plower <= upper && *pupper > upper - incr;
        if (*pupper > upper)
          *pupper = upper; // tracker C73258
      } else {
        if (*pupper > *plower)
          *pupper = traits_t<T>::min_value;
        if (plastiter != NULL)
          *plastiter = *plower >= upper && *pupper < upper - incr;
        if (*pupper < upper)
          *pupper = upper; // tracker C73258
      }
    }
  }
}

//-----------------------------------------------------------------------------
// Dispatch routines
//    Transfer call to template< type T >
//    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
//                         T lb, T ub, ST st, ST chunk )
extern "C" {

/*!
@ingroup WORK_SHARING
@{
@param loc Source location
@param gtid Global thread id
@param schedule Schedule type
@param lb  Lower bound
@param ub  Upper bound
@param st  Step (or increment if you prefer)
@param chunk The chunk size to block with

This function prepares the runtime to start a dynamically scheduled for loop,
saving the loop arguments.
These functions are all identical apart from the types of the arguments.
*/

void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int32 lb,
                            kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
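
/* A sketch (not part of the runtime) of the calling sequence a compiler
   might emit for "#pragma omp for schedule(dynamic, chunk)" over the
   iteration space [0, n); 'last', 'lb', 'ub', 'st' and body() are
   hypothetical names:

     kmp_int32 last, lb, ub, st;
     __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, 0, n - 1, 1,
                            chunk);
     while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
       for (kmp_int32 i = lb; i <= ub; i += st)
         body(i);
     }
*/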
/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint32 lb,
                             kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int64 lb,
                            kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint64 lb,
                             kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4

These functions differ from the __kmpc_dispatch_init set in that they are
called for the composite distribute parallel for construct. Thus, before
dispatching the regular iterations, they must compute the per-team iteration
space.

These functions are all identical apart from the types of the arguments.
*/
void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
                                 kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
                                  kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
                                 kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
                                  kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
@param loc Source code location
@param gtid Global thread id
@param p_last Pointer to a flag set to one if this is the last chunk or zero
otherwise
@param p_lb   Pointer to the lower bound for the next chunk of work
@param p_ub   Pointer to the upper bound for the next chunk of work
@param p_st   Pointer to the stride for the next chunk of work
@return one if there is work to be done, zero otherwise

Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then the lb, ub and stride need not be modified.
*/
int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st);
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint32 *p_lb, kmp_uint32 *p_ub,
                            kmp_int32 *p_st) {
  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st);
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st);
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint64 *p_lb, kmp_uint64 *p_ub,
                            kmp_int64 *p_st) {
  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st);
}

/*!
@param loc Source code location
@param gtid Global thread id

Mark the end of a dynamic loop.
*/
void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}
/*! @} */

//-----------------------------------------------------------------------------
// Non-template routines from kmp_dispatch.cpp used in other sources

kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value == checker;
}

kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value != checker;
}

kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
  return value < checker;
}

kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
  return value >= checker;
}

kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
  return value <= checker;
}
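
// These predicates are intended to be passed to the spin-wait routines
// below. For example (a usage sketch; 'flag' is some shared kmp_uint32):
//
//   __kmp_wait_yield_4(&flag, 1, __kmp_eq_4, NULL);
//
// spins, yielding when oversubscribed, until flag == 1, and returns the
// observed value.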

kmp_uint32
__kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
                   kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
                   void *obj // Higher-level synchronization object, or NULL.
                   ) {
  // note: we may not belong to a team at this point
  register volatile kmp_uint32 *spin = spinner;
  register kmp_uint32 check = checker;
  register kmp_uint32 spins;
  register kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
  register kmp_uint32 r;

  KMP_FSYNC_SPIN_INIT(obj, (void *)spin);
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(r = TCR_4(*spin), check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* GEH - remove this since it was accidentally introduced when kmp_wait was
       split. It causes problems with infinite recursion because of exit lock */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
        __kmp_abort_thread(); */

    /* if we have waited a bit, or are oversubscribed, yield */
    /* pause is in the following code */
    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
    KMP_YIELD_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}

void __kmp_wait_yield_4_ptr(
    void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32),
    void *obj // Higher-level synchronization object, or NULL.
    ) {
  // note: we may not belong to a team at this point
  register void *spin = spinner;
  register kmp_uint32 check = checker;
  register kmp_uint32 spins;
  register kmp_uint32 (*f)(void *, kmp_uint32) = pred;

  KMP_FSYNC_SPIN_INIT(obj, spin);
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* if we have waited a bit, or are oversubscribed, yield */
    /* pause is in the following code */
    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
    KMP_YIELD_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
}

} // extern "C"

#ifdef KMP_GOMP_COMPAT

void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int32 lb,
                               kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint32 lb,
                                kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int64 lb,
                               kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint64 lb,
                                kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

#endif /* KMP_GOMP_COMPAT */

/* ------------------------------------------------------------------------ */
