1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 /* Dynamic scheduling initialization and dispatch.
14  *
 * NOTE: __kmp_nth is constant inside any dispatch loop, but it may change
 *       between parallel regions.  __kmp_max_nth is the largest value
 *       __kmp_nth may take; 1 is the smallest.
18  */
19 
20 #include "kmp.h"
21 #include "kmp_error.h"
22 #include "kmp_i18n.h"
23 #include "kmp_itt.h"
24 #include "kmp_stats.h"
25 #include "kmp_str.h"
26 #if KMP_USE_X87CONTROL
27 #include <float.h>
28 #endif
29 #include "kmp_lock.h"
30 #include "kmp_dispatch.h"
31 #if KMP_USE_HIER_SCHED
32 #include "kmp_dispatch_hier.h"
33 #endif
34 
35 #if OMPT_SUPPORT
36 #include "ompt-specific.h"
37 #endif
38 
39 /* ------------------------------------------------------------------------ */
40 /* ------------------------------------------------------------------------ */
41 
42 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43   kmp_info_t *th;
44 
45   KMP_DEBUG_ASSERT(gtid_ref);
46 
47   if (__kmp_env_consistency_check) {
48     th = __kmp_threads[*gtid_ref];
49     if (th->th.th_root->r.r_active &&
50         (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51 #if KMP_USE_DYNAMIC_LOCK
52       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53 #else
54       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55 #endif
56     }
57   }
58 }
59 
60 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61   kmp_info_t *th;
62 
63   if (__kmp_env_consistency_check) {
64     th = __kmp_threads[*gtid_ref];
65     if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66       __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67     }
68   }
69 }
70 
71 // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
72 static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
73                                          bool use_hier = false) {
74   // Pick up the nonmonotonic/monotonic bits from the scheduling type
75   // TODO: make nonmonotonic when static_steal is fixed
76   int monotonicity = SCHEDULE_MONOTONIC;
77 
  // Default to monotonic for executables compiled with OpenMP* 4.5
  // or earlier compilers
80   if (loc->get_openmp_version() < 50)
81     monotonicity = SCHEDULE_MONOTONIC;
82 
83   if (use_hier || __kmp_force_monotonic)
84     monotonicity = SCHEDULE_MONOTONIC;
85   else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
86     monotonicity = SCHEDULE_NONMONOTONIC;
87   else if (SCHEDULE_HAS_MONOTONIC(schedule))
88     monotonicity = SCHEDULE_MONOTONIC;
89 
90   return monotonicity;
91 }
92 
// Initialize a dispatch_private_info_template<T> buffer for a particular
// type of schedule and chunk.  The loop description is found in lb (lower
// bound), ub (upper bound), and st (stride).  nproc is the number of threads
// relevant to the scheduling (often the number of threads in a team, but not
// always if hierarchical scheduling is used).  tid is the id of the thread
// calling the function within the group of nproc threads.  It will have a
// value between 0 and nproc - 1.  This is often just the thread id within a
// team, but is not necessarily the case when using hierarchical scheduling.
// loc is the source file location of the corresponding loop;
// gtid is the global thread id.
103 template <typename T>
104 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
105                                    dispatch_private_info_template<T> *pr,
106                                    enum sched_type schedule, T lb, T ub,
107                                    typename traits_t<T>::signed_t st,
108 #if USE_ITT_BUILD
109                                    kmp_uint64 *cur_chunk,
110 #endif
111                                    typename traits_t<T>::signed_t chunk,
112                                    T nproc, T tid) {
113   typedef typename traits_t<T>::unsigned_t UT;
114   typedef typename traits_t<T>::floating_t DBL;
115 
116   int active;
117   T tc;
118   kmp_info_t *th;
119   kmp_team_t *team;
120   int monotonicity;
121   bool use_hier;
122 
123 #ifdef KMP_DEBUG
124   typedef typename traits_t<T>::signed_t ST;
125   {
126     char *buff;
127     // create format specifiers before the debug output
128     buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
129                             "pr:%%p lb:%%%s ub:%%%s st:%%%s "
130                             "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
131                             traits_t<T>::spec, traits_t<T>::spec,
132                             traits_t<ST>::spec, traits_t<ST>::spec,
133                             traits_t<T>::spec, traits_t<T>::spec);
134     KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
135     __kmp_str_free(&buff);
136   }
137 #endif
138   /* setup data */
139   th = __kmp_threads[gtid];
140   team = th->th.th_team;
141   active = !team->t.t_serialized;
142 
143 #if USE_ITT_BUILD
144   int itt_need_metadata_reporting =
145       __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
146       KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
147       team->t.t_active_level == 1;
148 #endif
149 
150 #if KMP_USE_HIER_SCHED
151   use_hier = pr->flags.use_hier;
152 #else
153   use_hier = false;
154 #endif
155 
156   /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
157   monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
158   schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
159 
160   /* Pick up the nomerge/ordered bits from the scheduling type */
161   if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
162     pr->flags.nomerge = TRUE;
163     schedule =
164         (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
165   } else {
166     pr->flags.nomerge = FALSE;
167   }
168   pr->type_size = traits_t<T>::type_size; // remember the size of variables
169   if (kmp_ord_lower & schedule) {
170     pr->flags.ordered = TRUE;
171     schedule =
172         (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
173   } else {
174     pr->flags.ordered = FALSE;
175   }
176   // Ordered overrides nonmonotonic
177   if (pr->flags.ordered) {
178     monotonicity = SCHEDULE_MONOTONIC;
179   }
180 
181   if (schedule == kmp_sch_static) {
182     schedule = __kmp_static;
183   } else {
184     if (schedule == kmp_sch_runtime) {
185       // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
186       // not specified)
187       schedule = team->t.t_sched.r_sched_type;
188       monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
189       schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
      // Map the generic schedule to the concrete variant chosen by the
      // global controls, if needed
192       if (schedule == kmp_sch_guided_chunked) {
193         schedule = __kmp_guided;
194       } else if (schedule == kmp_sch_static) {
195         schedule = __kmp_static;
196       }
197       // Use the chunk size specified by OMP_SCHEDULE (or default if not
198       // specified)
199       chunk = team->t.t_sched.chunk;
200 #if USE_ITT_BUILD
201       if (cur_chunk)
202         *cur_chunk = chunk;
203 #endif
204 #ifdef KMP_DEBUG
205       {
206         char *buff;
207         // create format specifiers before the debug output
208         buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
209                                 "schedule:%%d chunk:%%%s\n",
210                                 traits_t<ST>::spec);
211         KD_TRACE(10, (buff, gtid, schedule, chunk));
212         __kmp_str_free(&buff);
213       }
214 #endif
215     } else {
216       if (schedule == kmp_sch_guided_chunked) {
217         schedule = __kmp_guided;
218       }
219       if (chunk <= 0) {
220         chunk = KMP_DEFAULT_CHUNK;
221       }
222     }
223 
224     if (schedule == kmp_sch_auto) {
      // __kmp_auto is mapped to a concrete schedule in
      // __kmp_do_serial_initialize()
226       schedule = __kmp_auto;
227 #ifdef KMP_DEBUG
228       {
229         char *buff;
230         // create format specifiers before the debug output
231         buff = __kmp_str_format(
232             "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
233             "schedule:%%d chunk:%%%s\n",
234             traits_t<ST>::spec);
235         KD_TRACE(10, (buff, gtid, schedule, chunk));
236         __kmp_str_free(&buff);
237       }
238 #endif
239     }
240 #if KMP_STATIC_STEAL_ENABLED
241     // map nonmonotonic:dynamic to static steal
242     if (schedule == kmp_sch_dynamic_chunked) {
243       if (monotonicity == SCHEDULE_NONMONOTONIC)
244         schedule = kmp_sch_static_steal;
245     }
246 #endif
247     /* guided analytical not safe for too many threads */
248     if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
249       schedule = kmp_sch_guided_iterative_chunked;
250       KMP_WARNING(DispatchManyThreads);
251     }
252     if (schedule == kmp_sch_runtime_simd) {
253       // compiler provides simd_width in the chunk parameter
254       schedule = team->t.t_sched.r_sched_type;
255       monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
256       schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
      // Map the generic schedule to the concrete variant chosen by the
      // global controls, if needed
259       if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
260           schedule == __kmp_static) {
261         schedule = kmp_sch_static_balanced_chunked;
262       } else {
263         if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
264           schedule = kmp_sch_guided_simd;
265         }
266         chunk = team->t.t_sched.chunk * chunk;
267       }
268 #if USE_ITT_BUILD
269       if (cur_chunk)
270         *cur_chunk = chunk;
271 #endif
272 #ifdef KMP_DEBUG
273       {
274         char *buff;
275         // create format specifiers before the debug output
276         buff = __kmp_str_format(
277             "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
278             " chunk:%%%s\n",
279             traits_t<ST>::spec);
280         KD_TRACE(10, (buff, gtid, schedule, chunk));
281         __kmp_str_free(&buff);
282       }
283 #endif
284     }
285     pr->u.p.parm1 = chunk;
286   }
287   KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
288               "unknown scheduling type");
289 
290   pr->u.p.count = 0;
291 
292   if (__kmp_env_consistency_check) {
293     if (st == 0) {
294       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
295                             (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
296     }
297   }
298   // compute trip count
299   if (st == 1) { // most common case
300     if (ub >= lb) {
301       tc = ub - lb + 1;
302     } else { // ub < lb
303       tc = 0; // zero-trip
304     }
305   } else if (st < 0) {
306     if (lb >= ub) {
307       // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
308       // where the division needs to be unsigned regardless of the result type
309       tc = (UT)(lb - ub) / (-st) + 1;
310     } else { // lb < ub
311       tc = 0; // zero-trip
312     }
313   } else { // st > 0
314     if (ub >= lb) {
315       // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
316       // where the division needs to be unsigned regardless of the result type
317       tc = (UT)(ub - lb) / st + 1;
318     } else { // ub < lb
319       tc = 0; // zero-trip
320     }
321   }
322 
323 #if KMP_STATS_ENABLED
324   if (KMP_MASTER_GTID(gtid)) {
325     KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
326   }
327 #endif
328 
329   pr->u.p.lb = lb;
330   pr->u.p.ub = ub;
331   pr->u.p.st = st;
332   pr->u.p.tc = tc;
333 
334 #if KMP_OS_WINDOWS
335   pr->u.p.last_upper = ub + st;
336 #endif /* KMP_OS_WINDOWS */
337 
  /* NOTE: only active parallel regions have active ordered sections */
339 
340   if (active) {
341     if (pr->flags.ordered) {
342       pr->ordered_bumped = 0;
343       pr->u.p.ordered_lower = 1;
344       pr->u.p.ordered_upper = 0;
345     }
346   }
347 
348   switch (schedule) {
349 #if (KMP_STATIC_STEAL_ENABLED)
350   case kmp_sch_static_steal: {
351     T ntc, init;
352 
353     KD_TRACE(100,
354              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
355               gtid));
356 
357     ntc = (tc % chunk ? 1 : 0) + tc / chunk;
358     if (nproc > 1 && ntc >= nproc) {
359       KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
360       T id = tid;
361       T small_chunk, extras;
362 
363       small_chunk = ntc / nproc;
364       extras = ntc % nproc;
365 
366       init = id * small_chunk + (id < extras ? id : extras);
367       pr->u.p.count = init;
368       pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
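      // E.g., ntc=10 chunks, nproc=4: small_chunk=2, extras=2, so threads
      // 0..3 start with chunk ranges [0,3), [3,6), [6,8), [8,10).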
369 
370       pr->u.p.parm2 = lb;
      // parm3 is the number of steal attempts, proportional to the number of
      // chunks per thread and capped at nproc.
374       pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc);
375       pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
376       pr->u.p.st = st;
377       if (traits_t<T>::type_size > 4) {
378         // AC: TODO: check if 16-byte CAS available and use it to
379         // improve performance (probably wait for explicit request
380         // before spending time on this).
381         // For now use dynamically allocated per-thread lock,
382         // free memory in __kmp_dispatch_next when status==0.
383         KMP_DEBUG_ASSERT(pr->u.p.th_steal_lock == NULL);
384         pr->u.p.th_steal_lock =
385             (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
386         __kmp_init_lock(pr->u.p.th_steal_lock);
387       }
388       break;
389     } else {
390       /* too few chunks: switching to kmp_sch_dynamic_chunked */
391       schedule = kmp_sch_dynamic_chunked;
392       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
393                      "kmp_sch_dynamic_chunked\n",
394                      gtid));
395       if (pr->u.p.parm1 <= 0)
396         pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
397       break;
398     } // if
399   } // case
400 #endif
401   case kmp_sch_static_balanced: {
402     T init, limit;
403 
404     KD_TRACE(
405         100,
406         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
407          gtid));
408 
409     if (nproc > 1) {
410       T id = tid;
411 
412       if (tc < nproc) {
413         if (id < tc) {
414           init = id;
415           limit = id;
416           pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
417         } else {
418           pr->u.p.count = 1; /* means no more chunks to execute */
419           pr->u.p.parm1 = FALSE;
420           break;
421         }
422       } else {
423         T small_chunk = tc / nproc;
424         T extras = tc % nproc;
425         init = id * small_chunk + (id < extras ? id : extras);
426         limit = init + small_chunk - (id < extras ? 0 : 1);
427         pr->u.p.parm1 = (id == nproc - 1);
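        // E.g., tc=10, nproc=4: small_chunk=2, extras=2, so threads 0..3 get
        // iterations [0,2], [3,5], [6,7], [8,9] and thread 3 reports last.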
428       }
429     } else {
430       if (tc > 0) {
431         init = 0;
432         limit = tc - 1;
433         pr->u.p.parm1 = TRUE;
434       } else {
435         // zero trip count
436         pr->u.p.count = 1; /* means no more chunks to execute */
437         pr->u.p.parm1 = FALSE;
438         break;
439       }
440     }
441 #if USE_ITT_BUILD
442     // Calculate chunk for metadata report
443     if (itt_need_metadata_reporting)
444       if (cur_chunk)
445         *cur_chunk = limit - init + 1;
446 #endif
447     if (st == 1) {
448       pr->u.p.lb = lb + init;
449       pr->u.p.ub = lb + limit;
450     } else {
      // ub_tmp is the calculated upper bound; "ub" is the user-defined
      // upper bound
452       T ub_tmp = lb + limit * st;
453       pr->u.p.lb = lb + init * st;
454       // adjust upper bound to "ub" if needed, so that MS lastprivate will match
455       // it exactly
456       if (st > 0) {
457         pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
458       } else {
459         pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
460       }
461     }
462     if (pr->flags.ordered) {
463       pr->u.p.ordered_lower = init;
464       pr->u.p.ordered_upper = limit;
465     }
466     break;
467   } // case
468   case kmp_sch_static_balanced_chunked: {
469     // similar to balanced, but chunk adjusted to multiple of simd width
470     T nth = nproc;
471     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
472                    " -> falling-through to static_greedy\n",
473                    gtid));
474     schedule = kmp_sch_static_greedy;
475     if (nth > 1)
476       pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
477     else
478       pr->u.p.parm1 = tc;
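    // E.g., tc=1000, nth=8, chunk (simd width)=8: (1000+7)/8 = 125 is rounded
    // up to 128, a multiple of the simd width (the masking assumes chunk is a
    // power of two, as simd widths are).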
479     break;
480   } // case
481   case kmp_sch_guided_simd:
482   case kmp_sch_guided_iterative_chunked: {
483     KD_TRACE(
484         100,
485         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
486          " case\n",
487          gtid));
488 
489     if (nproc > 1) {
490       if ((2L * chunk + 1) * nproc >= tc) {
491         /* chunk size too large, switch to dynamic */
492         schedule = kmp_sch_dynamic_chunked;
493       } else {
        // when remaining iterations drop below parm2, switch to dynamic
495         pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
496         *(double *)&pr->u.p.parm3 =
497             guided_flt_param / (double)nproc; // may occupy parm3 and parm4
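        // Roughly: parm3/parm4 hold guided_flt_param/nproc as a double (the
        // fraction used when sizing each request), and once fewer than parm2
        // iterations remain the schedule behaves like dynamic with the given
        // chunk.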
498       }
499     } else {
500       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
501                      "kmp_sch_static_greedy\n",
502                      gtid));
503       schedule = kmp_sch_static_greedy;
504       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
505       KD_TRACE(
506           100,
507           ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
508            gtid));
509       pr->u.p.parm1 = tc;
510     } // if
511   } // case
512   break;
513   case kmp_sch_guided_analytical_chunked: {
514     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
515                    "kmp_sch_guided_analytical_chunked case\n",
516                    gtid));
517 
518     if (nproc > 1) {
519       if ((2L * chunk + 1) * nproc >= tc) {
520         /* chunk size too large, switch to dynamic */
521         schedule = kmp_sch_dynamic_chunked;
522       } else {
523         /* commonly used term: (2 nproc - 1)/(2 nproc) */
524         DBL x;
525 
526 #if KMP_USE_X87CONTROL
527         /* Linux* OS already has 64-bit computation by default for long double,
528            and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
529            Windows* OS on IA-32 architecture, we need to set precision to 64-bit
530            instead of the default 53-bit. Even though long double doesn't work
531            on Windows* OS on Intel(R) 64, the resulting lack of precision is not
532            expected to impact the correctness of the algorithm, but this has not
533            been mathematically proven. */
534         // save original FPCW and set precision to 64-bit, as
535         // Windows* OS on IA-32 architecture defaults to 53-bit
536         unsigned int oldFpcw = _control87(0, 0);
537         _control87(_PC_64, _MCW_PC); // 0,0x30000
538 #endif
539         /* value used for comparison in solver for cross-over point */
540         long double target = ((long double)chunk * 2 + 1) * nproc / tc;
541 
542         /* crossover point--chunk indexes equal to or greater than
543            this point switch to dynamic-style scheduling */
544         UT cross;
545 
546         /* commonly used term: (2 nproc - 1)/(2 nproc) */
547         x = 1.0 - 0.5 / (double)nproc;
548 
549 #ifdef KMP_DEBUG
550         { // test natural alignment
551           struct _test_a {
552             char a;
553             union {
554               char b;
555               DBL d;
556             };
557           } t;
558           ptrdiff_t natural_alignment =
559               (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
560           //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
561           // long)natural_alignment );
562           KMP_DEBUG_ASSERT(
563               (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
564         }
565 #endif // KMP_DEBUG
566 
567         /* save the term in thread private dispatch structure */
568         *(DBL *)&pr->u.p.parm3 = x;
569 
570         /* solve for the crossover point to the nearest integer i for which C_i
571            <= chunk */
572         {
573           UT left, right, mid;
574           long double p;
575 
576           /* estimate initial upper and lower bound */
577 
          /* it doesn't matter what value 'right' starts with, as long as it
             is positive, but it affects the performance of the solver */
580           right = 229;
581           p = __kmp_pow<UT>(x, right);
582           if (p > target) {
583             do {
584               p *= p;
585               right <<= 1;
586             } while (p > target && right < (1 << 27));
587             /* lower bound is previous (failed) estimate of upper bound */
588             left = right >> 1;
589           } else {
590             left = 0;
591           }
592 
593           /* bisection root-finding method */
594           while (left + 1 < right) {
595             mid = (left + right) / 2;
596             if (__kmp_pow<UT>(x, mid) > target) {
597               left = mid;
598             } else {
599               right = mid;
600             }
601           } // while
602           cross = right;
603         }
604         /* assert sanity of computed crossover point */
605         KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
606                    __kmp_pow<UT>(x, cross) <= target);
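        // Worked example: chunk=4, nproc=8, tc=100000 gives
        // target = (2*4+1)*8/100000 = 7.2e-4 and x = 0.9375, so the bisection
        // finds cross ~= 113 (roughly ln(target)/ln(x)); chunk indexes
        // >= cross are scheduled dynamic-style.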
607 
608         /* save the crossover point in thread private dispatch structure */
609         pr->u.p.parm2 = cross;
610 
611 // C75803
612 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
613 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
614 #else
615 #define GUIDED_ANALYTICAL_WORKAROUND (x)
616 #endif
617         /* dynamic-style scheduling offset */
618         pr->u.p.count = tc -
619                         __kmp_dispatch_guided_remaining(
620                             tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
621                         cross * chunk;
622 #if KMP_USE_X87CONTROL
623         // restore FPCW
624         _control87(oldFpcw, _MCW_PC);
625 #endif
626       } // if
627     } else {
628       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
629                      "kmp_sch_static_greedy\n",
630                      gtid));
631       schedule = kmp_sch_static_greedy;
632       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
633       pr->u.p.parm1 = tc;
634     } // if
635   } // case
636   break;
637   case kmp_sch_static_greedy:
638     KD_TRACE(
639         100,
640         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
641          gtid));
642     pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
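    // E.g., tc=10, nproc=4: parm1 = (10+3)/4 = 3, so each thread takes a
    // single block of at most 3 consecutive iterations.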
643     break;
644   case kmp_sch_static_chunked:
645   case kmp_sch_dynamic_chunked:
646     if (pr->u.p.parm1 <= 0)
647       pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
648     else if (pr->u.p.parm1 > tc)
649       pr->u.p.parm1 = tc;
650     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
651                    "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
652                    gtid));
653     break;
654   case kmp_sch_trapezoidal: {
655     /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
656 
657     T parm1, parm2, parm3, parm4;
658     KD_TRACE(100,
659              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
660               gtid));
661 
662     parm1 = chunk;
663 
664     /* F : size of the first cycle */
665     parm2 = (tc / (2 * nproc));
666 
667     if (parm2 < 1) {
668       parm2 = 1;
669     }
670 
671     /* L : size of the last cycle.  Make sure the last cycle is not larger
672        than the first cycle. */
673     if (parm1 < 1) {
674       parm1 = 1;
675     } else if (parm1 > parm2) {
676       parm1 = parm2;
677     }
678 
679     /* N : number of cycles */
680     parm3 = (parm2 + parm1);
681     parm3 = (2 * tc + parm3 - 1) / parm3;
682 
683     if (parm3 < 2) {
684       parm3 = 2;
685     }
686 
687     /* sigma : decreasing incr of the trapezoid */
688     parm4 = (parm3 - 1);
689     parm4 = (parm2 - parm1) / parm4;
690 
691     // pointless check, because parm4 >= 0 always
692     // if ( parm4 < 0 ) {
693     //    parm4 = 0;
694     //}
695 
696     pr->u.p.parm1 = parm1;
697     pr->u.p.parm2 = parm2;
698     pr->u.p.parm3 = parm3;
699     pr->u.p.parm4 = parm4;
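    // Worked example: tc=100, nproc=2, chunk=2 gives parm2=25 (first chunk),
    // parm1=2 (minimum chunk), parm3=(200+26)/27=8 (number of chunks) and
    // parm4=(25-2)/7=3 (decrement), so chunk sizes shrink roughly
    // 25, 22, 19, ... down toward 2, clipped at the trip count.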
700   } // case
701   break;
702 
703   default: {
704     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
705                 KMP_HNT(GetNewerLibrary), // Hint
706                 __kmp_msg_null // Variadic argument list terminator
707     );
708   } break;
709   } // switch
710   pr->schedule = schedule;
711 }
712 
713 #if KMP_USE_HIER_SCHED
714 template <typename T>
715 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
716                                              typename traits_t<T>::signed_t st);
717 template <>
718 inline void
719 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
720                                             kmp_int32 ub, kmp_int32 st) {
721   __kmp_dispatch_init_hierarchy<kmp_int32>(
722       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
723       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
724 }
725 template <>
726 inline void
727 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
728                                              kmp_uint32 ub, kmp_int32 st) {
729   __kmp_dispatch_init_hierarchy<kmp_uint32>(
730       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
731       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
732 }
733 template <>
734 inline void
735 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
736                                             kmp_int64 ub, kmp_int64 st) {
737   __kmp_dispatch_init_hierarchy<kmp_int64>(
738       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
739       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
740 }
741 template <>
742 inline void
743 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
744                                              kmp_uint64 ub, kmp_int64 st) {
745   __kmp_dispatch_init_hierarchy<kmp_uint64>(
746       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
747       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
748 }
749 
750 // free all the hierarchy scheduling memory associated with the team
751 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
752   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
753   for (int i = 0; i < num_disp_buff; ++i) {
754     // type does not matter here so use kmp_int32
755     auto sh =
756         reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
757             &team->t.t_disp_buffer[i]);
758     if (sh->hier) {
759       sh->hier->deallocate();
760       __kmp_free(sh->hier);
761     }
762   }
763 }
764 #endif
765 
766 // UT - unsigned flavor of T, ST - signed flavor of T,
767 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
768 template <typename T>
769 static void
770 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
771                     T ub, typename traits_t<T>::signed_t st,
772                     typename traits_t<T>::signed_t chunk, int push_ws) {
773   typedef typename traits_t<T>::unsigned_t UT;
774 
775   int active;
776   kmp_info_t *th;
777   kmp_team_t *team;
778   kmp_uint32 my_buffer_index;
779   dispatch_private_info_template<T> *pr;
780   dispatch_shared_info_template<T> volatile *sh;
781 
782   KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
783                    sizeof(dispatch_private_info));
784   KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
785                    sizeof(dispatch_shared_info));
786   __kmp_assert_valid_gtid(gtid);
787 
788   if (!TCR_4(__kmp_init_parallel))
789     __kmp_parallel_initialize();
790 
791   __kmp_resume_if_soft_paused();
792 
793 #if INCLUDE_SSC_MARKS
794   SSC_MARK_DISPATCH_INIT();
795 #endif
796 #ifdef KMP_DEBUG
797   typedef typename traits_t<T>::signed_t ST;
798   {
799     char *buff;
800     // create format specifiers before the debug output
801     buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
802                             "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
803                             traits_t<ST>::spec, traits_t<T>::spec,
804                             traits_t<T>::spec, traits_t<ST>::spec);
805     KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
806     __kmp_str_free(&buff);
807   }
808 #endif
809   /* setup data */
810   th = __kmp_threads[gtid];
811   team = th->th.th_team;
812   active = !team->t.t_serialized;
813   th->th.th_ident = loc;
814 
  // Any half-decent optimizer will remove this test when the blocks are empty
  // since the macros expand to nothing when statistics are disabled.
818   if (schedule == __kmp_static) {
819     KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
820   } else {
821     KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
822   }
823 
824 #if KMP_USE_HIER_SCHED
  // Initialize the scheduling hierarchy if requested in the OMP_SCHEDULE
  // environment variable. Hierarchical scheduling does not work with ordered,
  // so if ordered is detected, revert to threaded scheduling.
828   bool ordered;
829   enum sched_type my_sched = schedule;
830   my_buffer_index = th->th.th_dispatch->th_disp_index;
831   pr = reinterpret_cast<dispatch_private_info_template<T> *>(
832       &th->th.th_dispatch
833            ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
834   my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
835   if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
836     my_sched =
837         (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
838   ordered = (kmp_ord_lower & my_sched);
839   if (pr->flags.use_hier) {
840     if (ordered) {
841       KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected.  "
842                      "Disabling hierarchical scheduling.\n",
843                      gtid));
844       pr->flags.use_hier = FALSE;
845     }
846   }
847   if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
848     // Don't use hierarchical for ordered parallel loops and don't
849     // use the runtime hierarchy if one was specified in the program
850     if (!ordered && !pr->flags.use_hier)
851       __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
852   }
853 #endif // KMP_USE_HIER_SCHED
854 
855 #if USE_ITT_BUILD
856   kmp_uint64 cur_chunk = chunk;
857   int itt_need_metadata_reporting =
858       __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
859       KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
860       team->t.t_active_level == 1;
861 #endif
862   if (!active) {
863     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
864         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
865   } else {
866     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
867                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
868 
869     my_buffer_index = th->th.th_dispatch->th_disp_index++;
870 
    /* What happens when the number of threads changes?
       Do we need to resize the buffer? */
872     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
873         &th->th.th_dispatch
874              ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
875     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
876         &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
877     KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
878                   my_buffer_index));
879   }
880 
881   __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
882 #if USE_ITT_BUILD
883                                 &cur_chunk,
884 #endif
885                                 chunk, (T)th->th.th_team_nproc,
886                                 (T)th->th.th_info.ds.ds_tid);
887   if (active) {
888     if (pr->flags.ordered == 0) {
889       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
890       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
891     } else {
892       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
893       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
894     }
895   }
896 
897   if (active) {
    /* The shared buffer is free for this thread to use once its name
     * (sh->buffer_index) equals my_buffer_index */
900 
901     KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
902                    "sh->buffer_index:%d\n",
903                    gtid, my_buffer_index, sh->buffer_index));
904     __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
905                            __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    // Note: KMP_WAIT() cannot be used here: buffer index and
    // my_buffer_index are *always* 32-bit integers.
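    // E.g., the thread entering its k-th dynamic loop uses slot
    // k % __kmp_dispatch_num_buffers; the wait above only returns once
    // sh->buffer_index has caught up to my_buffer_index, i.e. all threads
    // have finished with the loop that previously used this slot.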
908     KMP_MB(); /* is this necessary? */
909     KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
910                    "sh->buffer_index:%d\n",
911                    gtid, my_buffer_index, sh->buffer_index));
912 
913     th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
914     th->th.th_dispatch->th_dispatch_sh_current =
915         CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
916 #if USE_ITT_BUILD
917     if (pr->flags.ordered) {
918       __kmp_itt_ordered_init(gtid);
919     }
920     // Report loop metadata
921     if (itt_need_metadata_reporting) {
922       // Only report metadata by master of active team at level 1
923       kmp_uint64 schedtype = 0;
924       switch (schedule) {
925       case kmp_sch_static_chunked:
926       case kmp_sch_static_balanced: // Chunk is calculated in the switch above
927         break;
928       case kmp_sch_static_greedy:
929         cur_chunk = pr->u.p.parm1;
930         break;
931       case kmp_sch_dynamic_chunked:
932         schedtype = 1;
933         break;
934       case kmp_sch_guided_iterative_chunked:
935       case kmp_sch_guided_analytical_chunked:
936       case kmp_sch_guided_simd:
937         schedtype = 2;
938         break;
939       default:
940         // Should we put this case under "static"?
941         // case kmp_sch_static_steal:
942         schedtype = 3;
943         break;
944       }
945       __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
946     }
947 #if KMP_USE_HIER_SCHED
948     if (pr->flags.use_hier) {
949       pr->u.p.count = 0;
950       pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
951     }
#endif // KMP_USE_HIER_SCHED
953 #endif /* USE_ITT_BUILD */
954   }
955 
956 #ifdef KMP_DEBUG
957   {
958     char *buff;
959     // create format specifiers before the debug output
960     buff = __kmp_str_format(
961         "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
962         "lb:%%%s ub:%%%s"
963         " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
964         " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
965         traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
966         traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
967         traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
968         traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
969     KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
970                   pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
971                   pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
972                   pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
973     __kmp_str_free(&buff);
974   }
975 #endif
976 #if (KMP_STATIC_STEAL_ENABLED)
  // It cannot be guaranteed that after execution of a loop with some other
  // schedule kind all the parm3 variables will contain the same value. Even
  // if all parm3 values were the same, there would still be bad cases, such
  // as toggling between 0 and 1 rather than a program-lifetime increment, so
  // a dedicated variable is required. The 'static_steal_counter' is used.
982   if (pr->schedule == kmp_sch_static_steal) {
    // Other threads will inspect this variable when searching for a victim.
    // Incrementing it signals that other threads may steal from this thread
    // from now on.
986     volatile T *p = &pr->u.p.static_steal_counter;
987     *p = *p + 1;
988   }
989 #endif // ( KMP_STATIC_STEAL_ENABLED )
990 
991 #if OMPT_SUPPORT && OMPT_OPTIONAL
992   if (ompt_enabled.ompt_callback_work) {
993     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
994     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
995     ompt_callbacks.ompt_callback(ompt_callback_work)(
996         ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
997         &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
998   }
999 #endif
1000   KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
1001 }
1002 
1003 /* For ordered loops, either __kmp_dispatch_finish() should be called after
1004  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1005  * every chunk of iterations.  If the ordered section(s) were not executed
1006  * for this iteration (or every iteration in this chunk), we need to set the
1007  * ordered iteration counters so that the next thread can proceed. */
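/* For example, with schedule(dynamic,1) ordered, a thread whose iteration did
 * not enter the ordered region still calls __kmp_dispatch_finish(); the call
 * waits for sh->u.s.ordered_iteration to reach the iteration's ordered_lower
 * and then increments it, releasing the thread that owns the next iteration.
 * If the ordered region was executed, ordered_bumped is simply reset. */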
1008 template <typename UT>
1009 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
1010   typedef typename traits_t<UT>::signed_t ST;
1011   __kmp_assert_valid_gtid(gtid);
1012   kmp_info_t *th = __kmp_threads[gtid];
1013 
1014   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1015   if (!th->th.th_team->t.t_serialized) {
1016 
1017     dispatch_private_info_template<UT> *pr =
1018         reinterpret_cast<dispatch_private_info_template<UT> *>(
1019             th->th.th_dispatch->th_dispatch_pr_current);
1020     dispatch_shared_info_template<UT> volatile *sh =
1021         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1022             th->th.th_dispatch->th_dispatch_sh_current);
1023     KMP_DEBUG_ASSERT(pr);
1024     KMP_DEBUG_ASSERT(sh);
1025     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1026                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1027 
1028     if (pr->ordered_bumped) {
1029       KD_TRACE(
1030           1000,
1031           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1032            gtid));
1033       pr->ordered_bumped = 0;
1034     } else {
1035       UT lower = pr->u.p.ordered_lower;
1036 
1037 #ifdef KMP_DEBUG
1038       {
1039         char *buff;
1040         // create format specifiers before the debug output
1041         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1042                                 "ordered_iteration:%%%s lower:%%%s\n",
1043                                 traits_t<UT>::spec, traits_t<UT>::spec);
1044         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1045         __kmp_str_free(&buff);
1046       }
1047 #endif
1048 
1049       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1050                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1051       KMP_MB(); /* is this necessary? */
1052 #ifdef KMP_DEBUG
1053       {
1054         char *buff;
1055         // create format specifiers before the debug output
1056         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1057                                 "ordered_iteration:%%%s lower:%%%s\n",
1058                                 traits_t<UT>::spec, traits_t<UT>::spec);
1059         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1060         __kmp_str_free(&buff);
1061       }
1062 #endif
1063 
1064       test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1065     } // if
1066   } // if
1067   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1068 }
1069 
1070 #ifdef KMP_GOMP_COMPAT
1071 
1072 template <typename UT>
1073 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1074   typedef typename traits_t<UT>::signed_t ST;
1075   __kmp_assert_valid_gtid(gtid);
1076   kmp_info_t *th = __kmp_threads[gtid];
1077 
1078   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1079   if (!th->th.th_team->t.t_serialized) {
1080     //        int cid;
1081     dispatch_private_info_template<UT> *pr =
1082         reinterpret_cast<dispatch_private_info_template<UT> *>(
1083             th->th.th_dispatch->th_dispatch_pr_current);
1084     dispatch_shared_info_template<UT> volatile *sh =
1085         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1086             th->th.th_dispatch->th_dispatch_sh_current);
1087     KMP_DEBUG_ASSERT(pr);
1088     KMP_DEBUG_ASSERT(sh);
1089     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1090                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1091 
1092     //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1093     UT lower = pr->u.p.ordered_lower;
1094     UT upper = pr->u.p.ordered_upper;
1095     UT inc = upper - lower + 1;
1096 
1097     if (pr->ordered_bumped == inc) {
1098       KD_TRACE(
1099           1000,
1100           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1101            gtid));
1102       pr->ordered_bumped = 0;
1103     } else {
1104       inc -= pr->ordered_bumped;
1105 
1106 #ifdef KMP_DEBUG
1107       {
1108         char *buff;
1109         // create format specifiers before the debug output
1110         buff = __kmp_str_format(
1111             "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1112             "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1113             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1114         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1115         __kmp_str_free(&buff);
1116       }
1117 #endif
1118 
1119       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1120                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1121 
1122       KMP_MB(); /* is this necessary? */
1123       KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1124                       "ordered_bumped to zero\n",
1125                       gtid));
1126       pr->ordered_bumped = 0;
1127 //!!!!! TODO check if the inc should be unsigned, or signed???
1128 #ifdef KMP_DEBUG
1129       {
1130         char *buff;
1131         // create format specifiers before the debug output
1132         buff = __kmp_str_format(
1133             "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1134             "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1135             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1136             traits_t<UT>::spec);
1137         KD_TRACE(1000,
1138                  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1139         __kmp_str_free(&buff);
1140       }
1141 #endif
1142 
1143       test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1144     }
1145     //        }
1146   }
1147   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1148 }
1149 
1150 #endif /* KMP_GOMP_COMPAT */
1151 
1152 template <typename T>
1153 int __kmp_dispatch_next_algorithm(int gtid,
1154                                   dispatch_private_info_template<T> *pr,
1155                                   dispatch_shared_info_template<T> volatile *sh,
1156                                   kmp_int32 *p_last, T *p_lb, T *p_ub,
1157                                   typename traits_t<T>::signed_t *p_st, T nproc,
1158                                   T tid) {
1159   typedef typename traits_t<T>::unsigned_t UT;
1160   typedef typename traits_t<T>::signed_t ST;
1161   typedef typename traits_t<T>::floating_t DBL;
1162   int status = 0;
1163   bool last = false;
1164   T start;
1165   ST incr;
1166   UT limit, trip, init;
1167   kmp_info_t *th = __kmp_threads[gtid];
1168   kmp_team_t *team = th->th.th_team;
1169 
1170   KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1171                    &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1172   KMP_DEBUG_ASSERT(pr);
1173   KMP_DEBUG_ASSERT(sh);
1174   KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1175 #ifdef KMP_DEBUG
1176   {
1177     char *buff;
1178     // create format specifiers before the debug output
1179     buff =
1180         __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1181                          "sh:%%p nproc:%%%s tid:%%%s\n",
1182                          traits_t<T>::spec, traits_t<T>::spec);
1183     KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1184     __kmp_str_free(&buff);
1185   }
1186 #endif
1187 
1188   // zero trip count
1189   if (pr->u.p.tc == 0) {
1190     KD_TRACE(10,
1191              ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1192               "zero status:%d\n",
1193               gtid, status));
1194     return 0;
1195   }
1196 
1197   switch (pr->schedule) {
1198 #if (KMP_STATIC_STEAL_ENABLED)
1199   case kmp_sch_static_steal: {
1200     T chunk = pr->u.p.parm1;
1201 
1202     KD_TRACE(100,
1203              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1204               gtid));
1205 
1206     trip = pr->u.p.tc - 1;
1207 
1208     if (traits_t<T>::type_size > 4) {
1209       // use lock for 8-byte and CAS for 4-byte induction
1210       // variable. TODO (optional): check and use 16-byte CAS
1211       kmp_lock_t *lck = pr->u.p.th_steal_lock;
1212       KMP_DEBUG_ASSERT(lck != NULL);
1213       if (pr->u.p.count < (UT)pr->u.p.ub) {
1214         __kmp_acquire_lock(lck, gtid);
1215         // try to get own chunk of iterations
1216         init = (pr->u.p.count)++;
1217         status = (init < (UT)pr->u.p.ub);
1218         __kmp_release_lock(lck, gtid);
1219       } else {
1220         status = 0; // no own chunks
1221       }
1222       if (!status) { // try to steal
1223         kmp_info_t **other_threads = team->t.t_threads;
1224         T while_limit = pr->u.p.parm3;
1225         T while_index = 0;
1226         T id = pr->u.p.static_steal_counter; // loop id
1227         int idx = (th->th.th_dispatch->th_disp_index - 1) %
1228                   __kmp_dispatch_num_buffers; // current loop index
1229         // note: victim thread can potentially execute another loop
1230         // TODO: algorithm of searching for a victim
1231         // should be cleaned up and measured
1232         while ((!status) && (while_limit != ++while_index)) {
1233           dispatch_private_info_template<T> *victim;
1234           T remaining;
1235           T victimIdx = pr->u.p.parm4;
1236           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1237           victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1238               &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1239           KMP_DEBUG_ASSERT(victim);
1240           while ((victim == pr || id != victim->u.p.static_steal_counter) &&
1241                  oldVictimIdx != victimIdx) {
1242             victimIdx = (victimIdx + 1) % nproc;
1243             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1244                 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1245             KMP_DEBUG_ASSERT(victim);
1246           }
1247           if (victim == pr || id != victim->u.p.static_steal_counter) {
1248             continue; // try once more (nproc attempts in total)
1249             // no victim is ready yet to participate in stealing
1250             // because no victim passed kmp_init_dispatch yet
1251           }
1252           if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1253             pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1254             continue; // not enough chunks to steal, goto next victim
1255           }
1256 
1257           lck = victim->u.p.th_steal_lock;
1258           KMP_ASSERT(lck != NULL);
1259           __kmp_acquire_lock(lck, gtid);
1260           limit = victim->u.p.ub; // keep initial ub
1261           if (victim->u.p.count >= limit ||
1262               (remaining = limit - victim->u.p.count) < 2) {
1263             __kmp_release_lock(lck, gtid);
1264             pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1265             continue; // not enough chunks to steal
1266           }
1267           // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or
1268           // by 1
1269           if (remaining > 3) {
1270             // steal 1/4 of remaining
1271             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1272             init = (victim->u.p.ub -= (remaining >> 2));
1273           } else {
1274             // steal 1 chunk of 2 or 3 remaining
1275             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1276             init = (victim->u.p.ub -= 1);
1277           }
1278           __kmp_release_lock(lck, gtid);
1279 
1280           KMP_DEBUG_ASSERT(init + 1 <= limit);
1281           pr->u.p.parm4 = victimIdx; // remember victim to steal from
1282           status = 1;
1283           while_index = 0;
          // now update own count and ub with the stolen range, excluding
          // the init chunk (returned from this call)
1285           __kmp_acquire_lock(pr->u.p.th_steal_lock, gtid);
1286           pr->u.p.count = init + 1;
1287           pr->u.p.ub = limit;
1288           __kmp_release_lock(pr->u.p.th_steal_lock, gtid);
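          // E.g., if the victim had count=4, ub=20 (16 chunks left), its ub
          // drops to 16; the thief returns chunk 16 now (init=16) and keeps
          // chunks 17..19, while the victim retains chunks 4..15.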
1289         } // while (search for victim)
1290       } // if (try to find victim and steal)
1291     } else {
1292       // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1293       typedef union {
1294         struct {
1295           UT count;
1296           T ub;
1297         } p;
1298         kmp_int64 b;
1299       } union_i4;
      // All operations on 'count' and 'ub' must be performed atomically as a
      // pair.
1302       {
1303         union_i4 vold, vnew;
1304         vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1305         vnew = vold;
1306         vnew.p.count++;
1307         while (!KMP_COMPARE_AND_STORE_ACQ64(
1308             (volatile kmp_int64 *)&pr->u.p.count,
1309             *VOLATILE_CAST(kmp_int64 *) & vold.b,
1310             *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1311           KMP_CPU_PAUSE();
1312           vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1313           vnew = vold;
1314           vnew.p.count++;
1315         }
1316         vnew = vold;
1317         init = vnew.p.count;
1318         status = (init < (UT)vnew.p.ub);
1319       }
1320 
1321       if (!status) {
1322         kmp_info_t **other_threads = team->t.t_threads;
1323         T while_limit = pr->u.p.parm3;
1324         T while_index = 0;
1325         T id = pr->u.p.static_steal_counter; // loop id
1326         int idx = (th->th.th_dispatch->th_disp_index - 1) %
1327                   __kmp_dispatch_num_buffers; // current loop index
1328         // note: victim thread can potentially execute another loop
1329         // TODO: algorithm of searching for a victim
1330         // should be cleaned up and measured
1331         while ((!status) && (while_limit != ++while_index)) {
1332           dispatch_private_info_template<T> *victim;
1333           union_i4 vold, vnew;
1334           T remaining;
1335           T victimIdx = pr->u.p.parm4;
1336           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1337           victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1338               &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1339           KMP_DEBUG_ASSERT(victim);
1340           while ((victim == pr || id != victim->u.p.static_steal_counter) &&
1341                  oldVictimIdx != victimIdx) {
1342             victimIdx = (victimIdx + 1) % nproc;
1343             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1344                 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1345             KMP_DEBUG_ASSERT(victim);
1346           }
1347           if (victim == pr || id != victim->u.p.static_steal_counter) {
1348             continue; // try once more (nproc attempts in total)
1349             // no victim is ready yet to participate in stealing
1350             // because no victim passed kmp_init_dispatch yet
1351           }
1352           pr->u.p.parm4 = victimIdx; // new victim found
1353           while (1) { // CAS loop if victim has enough chunks to steal
1354             vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1355             vnew = vold;
1356 
1357             KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1358             if (vnew.p.count >= (UT)vnew.p.ub ||
1359                 (remaining = vnew.p.ub - vnew.p.count) < 2) {
1360               pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1361               break; // not enough chunks to steal, goto next victim
1362             }
1363             if (remaining > 3) {
1364               // try to steal 1/4 of remaining
1365               vnew.p.ub -= remaining >> 2;
1366             } else {
1367               vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1368             }
1369             KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1370             // TODO: Should this be acquire or release?
1371             if (KMP_COMPARE_AND_STORE_ACQ64(
1372                     (volatile kmp_int64 *)&victim->u.p.count,
1373                     *VOLATILE_CAST(kmp_int64 *) & vold.b,
1374                     *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1375               // stealing succeeded
1376               KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1377                                         vold.p.ub - vnew.p.ub);
1378               status = 1;
1379               while_index = 0;
1380               // now update own count and ub
1381               init = vnew.p.ub;
1382               vold.p.count = init + 1;
1383 #if KMP_ARCH_X86
1384               KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1385 #else
1386               *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1387 #endif
1388               break;
1389             } // if (check CAS result)
            KMP_CPU_PAUSE(); // CAS failed, retry
1391           } // while (try to steal from particular victim)
1392         } // while (search for victim)
1393       } // if (try to find victim and steal)
1394     } // if (4-byte induction variable)
1395     if (!status) {
1396       *p_lb = 0;
1397       *p_ub = 0;
1398       if (p_st != NULL)
1399         *p_st = 0;
1400     } else {
1401       start = pr->u.p.parm2;
1402       init *= chunk;
1403       limit = chunk + init - 1;
1404       incr = pr->u.p.st;
1405       KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1406 
1407       KMP_DEBUG_ASSERT(init <= trip);
1408       if ((last = (limit >= trip)) != 0)
1409         limit = trip;
1410       if (p_st != NULL)
1411         *p_st = incr;
1412 
1413       if (incr == 1) {
1414         *p_lb = start + init;
1415         *p_ub = start + limit;
1416       } else {
1417         *p_lb = start + init * incr;
1418         *p_ub = start + limit * incr;
1419       }
1420 
1421       if (pr->flags.ordered) {
1422         pr->u.p.ordered_lower = init;
1423         pr->u.p.ordered_upper = limit;
1424       } // if
1425     } // if
1426     break;
1427   } // case
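  /* Illustrative sketch (not part of the runtime): the stealing path above
     packs the victim's (count, ub) pair into a single 64-bit word so that a
     thief can claim roughly a quarter of the remaining chunks with one CAS.
     The same idea written against std::atomic, for clarity only; the real
     code uses KMP_COMPARE_AND_STORE_ACQ64 and the dispatch_private_info
     union layout, and the names below (packed64, try_steal) are invented
     for this sketch.

       #include <atomic>
       #include <cstdint>

       union packed64 {
         std::uint64_t b;
         struct {
           std::uint32_t count; // next chunk index the owner will take
           std::uint32_t ub;    // one past the last chunk the owner may take
         } p;
       };

       // Try to steal about 1/4 of the victim's remaining chunks; returns
       // the number of chunks stolen, or 0 if fewer than 2 chunks remain.
       static std::uint32_t try_steal(std::atomic<std::uint64_t> &victim) {
         packed64 vold, vnew;
         vold.b = victim.load(std::memory_order_acquire);
         for (;;) {
           vnew = vold;
           std::uint32_t remaining = vnew.p.ub - vnew.p.count;
           if (vnew.p.count >= vnew.p.ub || remaining < 2)
             return 0; // not enough chunks left to steal
           vnew.p.ub -= (remaining > 3) ? (remaining >> 2) : 1;
           if (victim.compare_exchange_weak(vold.b, vnew.b,
                                            std::memory_order_acq_rel))
             return vold.p.ub - vnew.p.ub; // chunks now owned by the thief
           // CAS failure refreshed vold.b with the current value; retry
         }
       }
  */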
1428 #endif // ( KMP_STATIC_STEAL_ENABLED )
1429   case kmp_sch_static_balanced: {
1430     KD_TRACE(
1431         10,
1432         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1433          gtid));
1434     /* check if thread has any iteration to do */
1435     if ((status = !pr->u.p.count) != 0) {
1436       pr->u.p.count = 1;
1437       *p_lb = pr->u.p.lb;
1438       *p_ub = pr->u.p.ub;
1439       last = (pr->u.p.parm1 != 0);
1440       if (p_st != NULL)
1441         *p_st = pr->u.p.st;
1442     } else { /* no iterations to do */
1443       pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1444     }
1445   } // case
1446   break;
1447   case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1448                                  merged here */
1449   case kmp_sch_static_chunked: {
1450     T parm1;
1451 
1452     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1453                    "kmp_sch_static_[greedy|chunked] case\n",
1454                    gtid));
1455     parm1 = pr->u.p.parm1;
1456 
1457     trip = pr->u.p.tc - 1;
1458     init = parm1 * (pr->u.p.count + tid);
1459 
1460     if ((status = (init <= trip)) != 0) {
1461       start = pr->u.p.lb;
1462       incr = pr->u.p.st;
1463       limit = parm1 + init - 1;
1464 
1465       if ((last = (limit >= trip)) != 0)
1466         limit = trip;
1467 
1468       if (p_st != NULL)
1469         *p_st = incr;
1470 
1471       pr->u.p.count += nproc;
1472 
1473       if (incr == 1) {
1474         *p_lb = start + init;
1475         *p_ub = start + limit;
1476       } else {
1477         *p_lb = start + init * incr;
1478         *p_ub = start + limit * incr;
1479       }
1480 
1481       if (pr->flags.ordered) {
1482         pr->u.p.ordered_lower = init;
1483         pr->u.p.ordered_upper = limit;
1484       } // if
1485     } // if
1486   } // case
1487   break;
1488 
1489   case kmp_sch_dynamic_chunked: {
1490     T chunk = pr->u.p.parm1;
1491 
1492     KD_TRACE(
1493         100,
1494         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1495          gtid));
1496 
1497     init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1498     trip = pr->u.p.tc - 1;
1499 
1500     if ((status = (init <= trip)) == 0) {
1501       *p_lb = 0;
1502       *p_ub = 0;
1503       if (p_st != NULL)
1504         *p_st = 0;
1505     } else {
1506       start = pr->u.p.lb;
1507       limit = chunk + init - 1;
1508       incr = pr->u.p.st;
1509 
1510       if ((last = (limit >= trip)) != 0)
1511         limit = trip;
1512 
1513       if (p_st != NULL)
1514         *p_st = incr;
1515 
1516       if (incr == 1) {
1517         *p_lb = start + init;
1518         *p_ub = start + limit;
1519       } else {
1520         *p_lb = start + init * incr;
1521         *p_ub = start + limit * incr;
1522       }
1523 
1524       if (pr->flags.ordered) {
1525         pr->u.p.ordered_lower = init;
1526         pr->u.p.ordered_upper = limit;
1527       } // if
1528     } // if
1529   } // case
1530   break;
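  /* Illustrative sketch (not part of the runtime): dynamic,chunk dispatch
     reduces to one shared chunk counter that every thread bumps with an
     atomic fetch-and-add; chunk k covers iterations
     [k*chunk, k*chunk + chunk - 1], clipped to the trip count. Written with
     std::atomic for clarity; the runtime uses test_then_inc_acq on
     sh->u.s.iteration. The helper name next_dynamic_chunk is invented here.

       #include <atomic>
       #include <cstdint>

       // Returns true and fills the inclusive range [*lb, *ub] if a chunk
       // was obtained, false once the loop is exhausted.
       static bool next_dynamic_chunk(std::atomic<std::uint64_t> &counter,
                                      std::uint64_t trip_count,
                                      std::uint64_t chunk, std::uint64_t *lb,
                                      std::uint64_t *ub) {
         std::uint64_t init =
             chunk * counter.fetch_add(1, std::memory_order_acq_rel);
         if (init >= trip_count)
           return false; // all chunks already handed out
         std::uint64_t limit = init + chunk - 1;
         if (limit >= trip_count)
           limit = trip_count - 1; // last (possibly partial) chunk
         *lb = init;
         *ub = limit;
         return true;
       }
  */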
1531 
1532   case kmp_sch_guided_iterative_chunked: {
1533     T chunkspec = pr->u.p.parm1;
1534     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1535                    "iterative case\n",
1536                    gtid));
1537     trip = pr->u.p.tc;
1538     // Start atomic part of calculations
1539     while (1) {
1540       ST remaining; // signed, because can be < 0
1541       init = sh->u.s.iteration; // shared value
1542       remaining = trip - init;
1543       if (remaining <= 0) { // AC: need to compare with 0 first
1544         // nothing to do, don't try atomic op
1545         status = 0;
1546         break;
1547       }
1548       if ((T)remaining <
1549           pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1550         // use dynamic-style schedule
1551         // atomically increment iterations, get old value
1552         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1553                                  (ST)chunkspec);
1554         remaining = trip - init;
1555         if (remaining <= 0) {
1556           status = 0; // all iterations got by other threads
1557         } else {
1558           // got some iterations to work on
1559           status = 1;
1560           if ((T)remaining > chunkspec) {
1561             limit = init + chunkspec - 1;
1562           } else {
1563             last = true; // the last chunk
1564             limit = init + remaining - 1;
1565           } // if
1566         } // if
1567         break;
1568       } // if
1569       limit = init + (UT)((double)remaining *
1570                           *(double *)&pr->u.p.parm3); // divide by K*nproc
1571       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1572                                (ST)init, (ST)limit)) {
1573         // CAS was successful, chunk obtained
1574         status = 1;
1575         --limit;
1576         break;
1577       } // if
1578     } // while
1579     if (status != 0) {
1580       start = pr->u.p.lb;
1581       incr = pr->u.p.st;
1582       if (p_st != NULL)
1583         *p_st = incr;
1584       *p_lb = start + init * incr;
1585       *p_ub = start + limit * incr;
1586       if (pr->flags.ordered) {
1587         pr->u.p.ordered_lower = init;
1588         pr->u.p.ordered_upper = limit;
1589       } // if
1590     } else {
1591       *p_lb = 0;
1592       *p_ub = 0;
1593       if (p_st != NULL)
1594         *p_st = 0;
1595     } // if
1596   } // case
1597   break;
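  /* Illustrative sketch (not part of the runtime): the guided schedule above
     grabs a fraction of the remaining iterations per request (the comments
     above describe it as dividing by K*nproc, with the precomputed factor
     kept in parm3) by CAS-advancing the shared iteration counter, and falls
     back to plain dynamic chunks once few iterations remain (the parm2
     threshold). Below is a simplified version that omits that fallback and
     simply floors each grab at the chunk size; 'factor' stands for the
     precomputed fraction and the helper name is invented here.

       #include <atomic>
       #include <cstdint>

       static bool next_guided_chunk(std::atomic<std::uint64_t> &iteration,
                                     std::uint64_t trip_count,
                                     std::uint64_t chunk, double factor,
                                     std::uint64_t *lb, std::uint64_t *ub) {
         for (;;) {
           std::uint64_t init = iteration.load(std::memory_order_acquire);
           if (init >= trip_count)
             return false; // nothing left
           std::uint64_t span =
               (std::uint64_t)((double)(trip_count - init) * factor);
           if (span < chunk)
             span = chunk; // never grab less than one chunk
           std::uint64_t limit = init + span;
           if (limit > trip_count)
             limit = trip_count;
           if (iteration.compare_exchange_weak(init, limit,
                                               std::memory_order_acq_rel)) {
             *lb = init;
             *ub = limit - 1; // inclusive upper bound
             return true;
           }
           // lost the race: another thread advanced the counter; retry
         }
       }
  */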
1598 
1599   case kmp_sch_guided_simd: {
1600     // same as the iterative guided schedule, but the current chunk is
1601     // rounded up to a multiple of the given chunk size
1602     T chunk = pr->u.p.parm1;
1603     KD_TRACE(100,
1604              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1605               gtid));
1606     trip = pr->u.p.tc;
1607     // Start atomic part of calculations
1608     while (1) {
1609       ST remaining; // signed, because can be < 0
1610       init = sh->u.s.iteration; // shared value
1611       remaining = trip - init;
1612       if (remaining <= 0) { // AC: need to compare with 0 first
1613         status = 0; // nothing to do, don't try atomic op
1614         break;
1615       }
1616       KMP_DEBUG_ASSERT(init % chunk == 0);
1617       // compare with K*nproc*(chunk+1), K=2 by default
1618       if ((T)remaining < pr->u.p.parm2) {
1619         // use dynamic-style schedule
1620         // atomically increment iterations, get old value
1621         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1622                                  (ST)chunk);
1623         remaining = trip - init;
1624         if (remaining <= 0) {
1625           status = 0; // all iterations got by other threads
1626         } else {
1627           // got some iterations to work on
1628           status = 1;
1629           if ((T)remaining > chunk) {
1630             limit = init + chunk - 1;
1631           } else {
1632             last = true; // the last chunk
1633             limit = init + remaining - 1;
1634           } // if
1635         } // if
1636         break;
1637       } // if
1638       // divide by K*nproc
1639       UT span;
1640       __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3),
1641                          &span);
1642       UT rem = span % chunk;
1643       if (rem) // adjust so that span%chunk == 0
1644         span += chunk - rem;
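      // e.g. span = 10, chunk = 4: rem = 2, so span is bumped to 12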
1645       limit = init + span;
1646       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1647                                (ST)init, (ST)limit)) {
1648         // CAS was successful, chunk obtained
1649         status = 1;
1650         --limit;
1651         break;
1652       } // if
1653     } // while
1654     if (status != 0) {
1655       start = pr->u.p.lb;
1656       incr = pr->u.p.st;
1657       if (p_st != NULL)
1658         *p_st = incr;
1659       *p_lb = start + init * incr;
1660       *p_ub = start + limit * incr;
1661       if (pr->flags.ordered) {
1662         pr->u.p.ordered_lower = init;
1663         pr->u.p.ordered_upper = limit;
1664       } // if
1665     } else {
1666       *p_lb = 0;
1667       *p_ub = 0;
1668       if (p_st != NULL)
1669         *p_st = 0;
1670     } // if
1671   } // case
1672   break;
1673 
1674   case kmp_sch_guided_analytical_chunked: {
1675     T chunkspec = pr->u.p.parm1;
1676     UT chunkIdx;
1677 #if KMP_USE_X87CONTROL
1678     /* for storing the original FPCW value for Windows* OS on
1679        IA-32 architecture (8-byte version) */
1680     unsigned int oldFpcw;
1681     unsigned int fpcwSet = 0;
1682 #endif
1683     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1684                    "kmp_sch_guided_analytical_chunked case\n",
1685                    gtid));
1686 
1687     trip = pr->u.p.tc;
1688 
1689     KMP_DEBUG_ASSERT(nproc > 1);
1690     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1691 
1692     while (1) { /* this while loop is a safeguard against unexpected zero
1693                    chunk sizes */
1694       chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1695       if (chunkIdx >= (UT)pr->u.p.parm2) {
1696         --trip;
1697         /* use dynamic-style scheduling */
1698         init = chunkIdx * chunkspec + pr->u.p.count;
1699         /* need to verify init > 0 in case of overflow in the above
1700          * calculation */
1701         if ((status = (init > 0 && init <= trip)) != 0) {
1702           limit = init + chunkspec - 1;
1703 
1704           if ((last = (limit >= trip)) != 0)
1705             limit = trip;
1706         }
1707         break;
1708       } else {
1709 /* use exponential-style scheduling */
1710 /* The following check is to workaround the lack of long double precision on
1711    Windows* OS.
1712    This check works around the possible effect that init != 0 for chunkIdx == 0.
1713  */
1714 #if KMP_USE_X87CONTROL
1715         /* If we haven't already done so, save original
1716            FPCW and set precision to 64-bit, as Windows* OS
1717            on IA-32 architecture defaults to 53-bit */
1718         if (!fpcwSet) {
1719           oldFpcw = _control87(0, 0);
1720           _control87(_PC_64, _MCW_PC);
1721           fpcwSet = 0x30000;
1722         }
1723 #endif
1724         if (chunkIdx) {
1725           init = __kmp_dispatch_guided_remaining<T>(
1726               trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1727           KMP_DEBUG_ASSERT(init);
1728           init = trip - init;
1729         } else
1730           init = 0;
1731         limit = trip - __kmp_dispatch_guided_remaining<T>(
1732                            trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1733         KMP_ASSERT(init <= limit);
1734         if (init < limit) {
1735           KMP_DEBUG_ASSERT(limit <= trip);
1736           --limit;
1737           status = 1;
1738           break;
1739         } // if
1740       } // if
1741     } // while (1)
1742 #if KMP_USE_X87CONTROL
1743     /* restore FPCW if necessary
1744        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1745     */
1746     if (fpcwSet && (oldFpcw & fpcwSet))
1747       _control87(oldFpcw, _MCW_PC);
1748 #endif
1749     if (status != 0) {
1750       start = pr->u.p.lb;
1751       incr = pr->u.p.st;
1752       if (p_st != NULL)
1753         *p_st = incr;
1754       *p_lb = start + init * incr;
1755       *p_ub = start + limit * incr;
1756       if (pr->flags.ordered) {
1757         pr->u.p.ordered_lower = init;
1758         pr->u.p.ordered_upper = limit;
1759       }
1760     } else {
1761       *p_lb = 0;
1762       *p_ub = 0;
1763       if (p_st != NULL)
1764         *p_st = 0;
1765     }
1766   } // case
1767   break;
1768 
1769   case kmp_sch_trapezoidal: {
1770     UT index;
1771     T parm2 = pr->u.p.parm2;
1772     T parm3 = pr->u.p.parm3;
1773     T parm4 = pr->u.p.parm4;
1774     KD_TRACE(100,
1775              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1776               gtid));
1777 
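    // Chunk sizes decrease linearly: parm2 is the size of the first chunk,
    // parm4 the per-chunk decrement, and parm3 the number of chunks (set up
    // in __kmp_dispatch_init_algorithm). init and limit below are closed-form
    // prefix sums of that arithmetic sequence: the first 'index' chunks cover
    // index * (2*parm2 - (index-1)*parm4) / 2 iterations.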
1778     index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1779 
1780     init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
1781     trip = pr->u.p.tc - 1;
1782 
1783     if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1784       *p_lb = 0;
1785       *p_ub = 0;
1786       if (p_st != NULL)
1787         *p_st = 0;
1788     } else {
1789       start = pr->u.p.lb;
1790       limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1791       incr = pr->u.p.st;
1792 
1793       if ((last = (limit >= trip)) != 0)
1794         limit = trip;
1795 
1796       if (p_st != NULL)
1797         *p_st = incr;
1798 
1799       if (incr == 1) {
1800         *p_lb = start + init;
1801         *p_ub = start + limit;
1802       } else {
1803         *p_lb = start + init * incr;
1804         *p_ub = start + limit * incr;
1805       }
1806 
1807       if (pr->flags.ordered) {
1808         pr->u.p.ordered_lower = init;
1809         pr->u.p.ordered_upper = limit;
1810       } // if
1811     } // if
1812   } // case
1813   break;
1814   default: {
1815     status = 0; // to avoid complaints on uninitialized variable use
1816     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1817                 KMP_HNT(GetNewerLibrary), // Hint
1818                 __kmp_msg_null // Variadic argument list terminator
1819     );
1820   } break;
1821   } // switch
1822   if (p_last)
1823     *p_last = last;
1824 #ifdef KMP_DEBUG
1825   if (pr->flags.ordered) {
1826     char *buff;
1827     // create format specifiers before the debug output
1828     buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1829                             "ordered_lower:%%%s ordered_upper:%%%s\n",
1830                             traits_t<UT>::spec, traits_t<UT>::spec);
1831     KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1832     __kmp_str_free(&buff);
1833   }
1834   {
1835     char *buff;
1836     // create format specifiers before the debug output
1837     buff = __kmp_str_format(
1838         "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1839         "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1840         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1841     KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1842     __kmp_str_free(&buff);
1843   }
1844 #endif
1845   return status;
1846 }
1847 
1848 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1849    work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1850    is not called. */
1851 #if OMPT_SUPPORT && OMPT_OPTIONAL
1852 #define OMPT_LOOP_END                                                          \
1853   if (status == 0) {                                                           \
1854     if (ompt_enabled.ompt_callback_work) {                                     \
1855       ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
1856       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
1857       ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
1858           ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
1859           &(task_info->task_data), 0, codeptr);                                \
1860     }                                                                          \
1861   }
1862 // TODO: implement count
1863 #else
1864 #define OMPT_LOOP_END // no-op
1865 #endif
1866 
1867 #if KMP_STATS_ENABLED
1868 #define KMP_STATS_LOOP_END                                                     \
1869   {                                                                            \
1870     kmp_int64 u, l, t, i;                                                      \
1871     l = (kmp_int64)(*p_lb);                                                    \
1872     u = (kmp_int64)(*p_ub);                                                    \
1873     i = (kmp_int64)(pr->u.p.st);                                               \
1874     if (status == 0) {                                                         \
1875       t = 0;                                                                   \
1876       KMP_POP_PARTITIONED_TIMER();                                             \
1877     } else if (i == 1) {                                                       \
1878       if (u >= l)                                                              \
1879         t = u - l + 1;                                                         \
1880       else                                                                     \
1881         t = 0;                                                                 \
1882     } else if (i < 0) {                                                        \
1883       if (l >= u)                                                              \
1884         t = (l - u) / (-i) + 1;                                                \
1885       else                                                                     \
1886         t = 0;                                                                 \
1887     } else {                                                                   \
1888       if (u >= l)                                                              \
1889         t = (u - l) / i + 1;                                                   \
1890       else                                                                     \
1891         t = 0;                                                                 \
1892     }                                                                          \
1893     KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
1894   }
1895 #else
1896 #define KMP_STATS_LOOP_END /* Nothing */
1897 #endif
1898 
1899 template <typename T>
1900 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1901                                T *p_lb, T *p_ub,
1902                                typename traits_t<T>::signed_t *p_st
1903 #if OMPT_SUPPORT && OMPT_OPTIONAL
1904                                ,
1905                                void *codeptr
1906 #endif
1907 ) {
1908 
1909   typedef typename traits_t<T>::unsigned_t UT;
1910   typedef typename traits_t<T>::signed_t ST;
1911   // This is potentially slightly misleading: schedule(runtime) will appear
1912   // here even if the actual runtime schedule is static. (This points out a
1913   // disadvantage of schedule(runtime): even when static scheduling is used,
1914   // it costs more than a compile-time choice of static scheduling would.)
1915   KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
1916 
1917   int status;
1918   dispatch_private_info_template<T> *pr;
1919   __kmp_assert_valid_gtid(gtid);
1920   kmp_info_t *th = __kmp_threads[gtid];
1921   kmp_team_t *team = th->th.th_team;
1922 
1923   KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1924   KD_TRACE(
1925       1000,
1926       ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
1927        gtid, p_lb, p_ub, p_st, p_last));
1928 
1929   if (team->t.t_serialized) {
1930     /* NOTE: serialize this dispatch because we are not at the active level */
1931     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1932         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1933     KMP_DEBUG_ASSERT(pr);
1934 
1935     if ((status = (pr->u.p.tc != 0)) == 0) {
1936       *p_lb = 0;
1937       *p_ub = 0;
1938       //            if ( p_last != NULL )
1939       //                *p_last = 0;
1940       if (p_st != NULL)
1941         *p_st = 0;
1942       if (__kmp_env_consistency_check) {
1943         if (pr->pushed_ws != ct_none) {
1944           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1945         }
1946       }
1947     } else if (pr->flags.nomerge) {
1948       kmp_int32 last;
1949       T start;
1950       UT limit, trip, init;
1951       ST incr;
1952       T chunk = pr->u.p.parm1;
1953 
1954       KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1955                      gtid));
1956 
1957       init = chunk * pr->u.p.count++;
1958       trip = pr->u.p.tc - 1;
1959 
1960       if ((status = (init <= trip)) == 0) {
1961         *p_lb = 0;
1962         *p_ub = 0;
1963         //                if ( p_last != NULL )
1964         //                    *p_last = 0;
1965         if (p_st != NULL)
1966           *p_st = 0;
1967         if (__kmp_env_consistency_check) {
1968           if (pr->pushed_ws != ct_none) {
1969             pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1970           }
1971         }
1972       } else {
1973         start = pr->u.p.lb;
1974         limit = chunk + init - 1;
1975         incr = pr->u.p.st;
1976 
1977         if ((last = (limit >= trip)) != 0) {
1978           limit = trip;
1979 #if KMP_OS_WINDOWS
1980           pr->u.p.last_upper = pr->u.p.ub;
1981 #endif /* KMP_OS_WINDOWS */
1982         }
1983         if (p_last != NULL)
1984           *p_last = last;
1985         if (p_st != NULL)
1986           *p_st = incr;
1987         if (incr == 1) {
1988           *p_lb = start + init;
1989           *p_ub = start + limit;
1990         } else {
1991           *p_lb = start + init * incr;
1992           *p_ub = start + limit * incr;
1993         }
1994 
1995         if (pr->flags.ordered) {
1996           pr->u.p.ordered_lower = init;
1997           pr->u.p.ordered_upper = limit;
1998 #ifdef KMP_DEBUG
1999           {
2000             char *buff;
2001             // create format specifiers before the debug output
2002             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2003                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
2004                                     traits_t<UT>::spec, traits_t<UT>::spec);
2005             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2006                             pr->u.p.ordered_upper));
2007             __kmp_str_free(&buff);
2008           }
2009 #endif
2010         } // if
2011       } // if
2012     } else {
2013       pr->u.p.tc = 0;
2014       *p_lb = pr->u.p.lb;
2015       *p_ub = pr->u.p.ub;
2016 #if KMP_OS_WINDOWS
2017       pr->u.p.last_upper = *p_ub;
2018 #endif /* KMP_OS_WINDOWS */
2019       if (p_last != NULL)
2020         *p_last = TRUE;
2021       if (p_st != NULL)
2022         *p_st = pr->u.p.st;
2023     } // if
2024 #ifdef KMP_DEBUG
2025     {
2026       char *buff;
2027       // create format specifiers before the debug output
2028       buff = __kmp_str_format(
2029           "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2030           "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
2031           traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2032       KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,
2033                     (p_last ? *p_last : 0), status));
2034       __kmp_str_free(&buff);
2035     }
2036 #endif
2037 #if INCLUDE_SSC_MARKS
2038     SSC_MARK_DISPATCH_NEXT();
2039 #endif
2040     OMPT_LOOP_END;
2041     KMP_STATS_LOOP_END;
2042     return status;
2043   } else {
2044     kmp_int32 last = 0;
2045     dispatch_shared_info_template<T> volatile *sh;
2046 
2047     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2048                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2049 
2050     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2051         th->th.th_dispatch->th_dispatch_pr_current);
2052     KMP_DEBUG_ASSERT(pr);
2053     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2054         th->th.th_dispatch->th_dispatch_sh_current);
2055     KMP_DEBUG_ASSERT(sh);
2056 
2057 #if KMP_USE_HIER_SCHED
2058     if (pr->flags.use_hier)
2059       status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2060     else
2061 #endif // KMP_USE_HIER_SCHED
2062       status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2063                                                 p_st, th->th.th_team_nproc,
2064                                                 th->th.th_info.ds.ds_tid);
2065     // status == 0: no more iterations to execute
2066     if (status == 0) {
2067       UT num_done;
2068 
2069       num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2070 #ifdef KMP_DEBUG
2071       {
2072         char *buff;
2073         // create format specifiers before the debug output
2074         buff = __kmp_str_format(
2075             "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2076             traits_t<UT>::spec);
2077         KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2078         __kmp_str_free(&buff);
2079       }
2080 #endif
2081 
2082 #if KMP_USE_HIER_SCHED
2083       pr->flags.use_hier = FALSE;
2084 #endif
2085       if ((ST)num_done == th->th.th_team_nproc - 1) {
2086 #if (KMP_STATIC_STEAL_ENABLED)
2087         if (pr->schedule == kmp_sch_static_steal &&
2088             traits_t<T>::type_size > 4) {
2089           int i;
2090           int idx = (th->th.th_dispatch->th_disp_index - 1) %
2091                     __kmp_dispatch_num_buffers; // current loop index
2092           kmp_info_t **other_threads = team->t.t_threads;
2093           // loop complete, safe to destroy locks used for stealing
2094           for (i = 0; i < th->th.th_team_nproc; ++i) {
2095             dispatch_private_info_template<T> *buf =
2096                 reinterpret_cast<dispatch_private_info_template<T> *>(
2097                     &other_threads[i]->th.th_dispatch->th_disp_buffer[idx]);
2098             kmp_lock_t *lck = buf->u.p.th_steal_lock;
2099             KMP_ASSERT(lck != NULL);
2100             __kmp_destroy_lock(lck);
2101             __kmp_free(lck);
2102             buf->u.p.th_steal_lock = NULL;
2103           }
2104         }
2105 #endif
2106         /* NOTE: release this buffer to be reused */
2107 
2108         KMP_MB(); /* Flush all pending memory write invalidates.  */
2109 
2110         sh->u.s.num_done = 0;
2111         sh->u.s.iteration = 0;
2112 
2113         /* TODO replace with general release procedure? */
2114         if (pr->flags.ordered) {
2115           sh->u.s.ordered_iteration = 0;
2116         }
2117 
2118         KMP_MB(); /* Flush all pending memory write invalidates.  */
2119 
2120         sh->buffer_index += __kmp_dispatch_num_buffers;
2121         KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2122                        gtid, sh->buffer_index));
2123 
2124         KMP_MB(); /* Flush all pending memory write invalidates.  */
2125 
2126       } // if
2127       if (__kmp_env_consistency_check) {
2128         if (pr->pushed_ws != ct_none) {
2129           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2130         }
2131       }
2132 
2133       th->th.th_dispatch->th_deo_fcn = NULL;
2134       th->th.th_dispatch->th_dxo_fcn = NULL;
2135       th->th.th_dispatch->th_dispatch_sh_current = NULL;
2136       th->th.th_dispatch->th_dispatch_pr_current = NULL;
2137     } // if (status == 0)
2138 #if KMP_OS_WINDOWS
2139     else if (last) {
2140       pr->u.p.last_upper = pr->u.p.ub;
2141     }
2142 #endif /* KMP_OS_WINDOWS */
2143     if (p_last != NULL && status != 0)
2144       *p_last = last;
2145   } // if
2146 
2147 #ifdef KMP_DEBUG
2148   {
2149     char *buff;
2150     // create format specifiers before the debug output
2151     buff = __kmp_str_format(
2152         "__kmp_dispatch_next: T#%%d normal case: "
2153         "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2154         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2155     KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2156                   (p_last ? *p_last : 0), status));
2157     __kmp_str_free(&buff);
2158   }
2159 #endif
2160 #if INCLUDE_SSC_MARKS
2161   SSC_MARK_DISPATCH_NEXT();
2162 #endif
2163   OMPT_LOOP_END;
2164   KMP_STATS_LOOP_END;
2165   return status;
2166 }
2167 
2168 template <typename T>
2169 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2170                                   kmp_int32 *plastiter, T *plower, T *pupper,
2171                                   typename traits_t<T>::signed_t incr) {
2172   typedef typename traits_t<T>::unsigned_t UT;
2173   kmp_uint32 team_id;
2174   kmp_uint32 nteams;
2175   UT trip_count;
2176   kmp_team_t *team;
2177   kmp_info_t *th;
2178 
2179   KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2180   KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2181 #ifdef KMP_DEBUG
2182   typedef typename traits_t<T>::signed_t ST;
2183   {
2184     char *buff;
2185     // create format specifiers before the debug output
2186     buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2187                             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2188                             traits_t<T>::spec, traits_t<T>::spec,
2189                             traits_t<ST>::spec, traits_t<T>::spec);
2190     KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2191     __kmp_str_free(&buff);
2192   }
2193 #endif
2194 
2195   if (__kmp_env_consistency_check) {
2196     if (incr == 0) {
2197       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2198                             loc);
2199     }
2200     if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2201       // The loop is illegal.
2202       // Some zero-trip loops are maintained by the compiler, e.g.:
2203       //   for(i=10;i<0;++i) // lower >= upper - run-time check
2204       //   for(i=0;i>10;--i) // lower <= upper - run-time check
2205       //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2206       //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2207       // The compiler does not check the following illegal loops:
2208       //   for(i=0;i<10;i+=incr) // where incr<0
2209       //   for(i=10;i>0;i-=incr) // where incr<0
2210       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2211     }
2212   }
2213   __kmp_assert_valid_gtid(gtid);
2214   th = __kmp_threads[gtid];
2215   team = th->th.th_team;
2216   KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2217   nteams = th->th.th_teams_size.nteams;
2218   team_id = team->t.t_master_tid;
2219   KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2220 
2221   // compute global trip count
2222   if (incr == 1) {
2223     trip_count = *pupper - *plower + 1;
2224   } else if (incr == -1) {
2225     trip_count = *plower - *pupper + 1;
2226   } else if (incr > 0) {
2227     // upper-lower can exceed the limit of signed type
2228     trip_count = (UT)(*pupper - *plower) / incr + 1;
2229   } else {
2230     trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2231   }
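  // e.g. *plower = 0, *pupper = 9, incr = 3 gives trip_count = 9 / 3 + 1 = 4
  // (iterations 0, 3, 6 and 9)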
2232 
2233   if (trip_count <= nteams) {
2234     KMP_DEBUG_ASSERT(
2235         __kmp_static == kmp_sch_static_greedy ||
2236         __kmp_static ==
2237             kmp_sch_static_balanced); // Unknown static scheduling type.
2238     // only some teams get a single iteration, the others get nothing
2239     if (team_id < trip_count) {
2240       *pupper = *plower = *plower + team_id * incr;
2241     } else {
2242       *plower = *pupper + incr; // zero-trip loop
2243     }
2244     if (plastiter != NULL)
2245       *plastiter = (team_id == trip_count - 1);
2246   } else {
2247     if (__kmp_static == kmp_sch_static_balanced) {
2248       UT chunk = trip_count / nteams;
2249       UT extras = trip_count % nteams;
2250       *plower +=
2251           incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2252       *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2253       if (plastiter != NULL)
2254         *plastiter = (team_id == nteams - 1);
2255     } else {
2256       T chunk_inc_count =
2257           (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2258       T upper = *pupper;
2259       KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2260       // Unknown static scheduling type.
2261       *plower += team_id * chunk_inc_count;
2262       *pupper = *plower + chunk_inc_count - incr;
2263       // Check/correct bounds if needed
2264       if (incr > 0) {
2265         if (*pupper < *plower)
2266           *pupper = traits_t<T>::max_value;
2267         if (plastiter != NULL)
2268           *plastiter = *plower <= upper && *pupper > upper - incr;
2269         if (*pupper > upper)
2270           *pupper = upper; // tracker C73258
2271       } else {
2272         if (*pupper > *plower)
2273           *pupper = traits_t<T>::min_value;
2274         if (plastiter != NULL)
2275           *plastiter = *plower >= upper && *pupper < upper - incr;
2276         if (*pupper < upper)
2277           *pupper = upper; // tracker C73258
2278       }
2279     }
2280   }
2281 }
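
/* Worked example for the balanced split above (illustrative): with
   trip_count = 10, nteams = 4, *plower = 0 and incr = 1, we get chunk = 2
   and extras = 2, so the teams receive the ranges [0,2], [3,5], [6,7] and
   [8,9]; the first 'extras' teams get one extra iteration, and team
   nteams-1 reports the last iteration via *plastiter. */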
2282 
2283 //-----------------------------------------------------------------------------
2284 // Dispatch routines
2285 //    Transfer call to template< type T >
2286 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2287 //                         T lb, T ub, ST st, ST chunk )
2288 extern "C" {
2289 
2290 /*!
2291 @ingroup WORK_SHARING
2292 @{
2293 @param loc Source location
2294 @param gtid Global thread id
2295 @param schedule Schedule type
2296 @param lb  Lower bound
2297 @param ub  Upper bound
2298 @param st  Step (or increment if you prefer)
2299 @param chunk The chunk size to block with
2300 
2301 This function prepares the runtime to start a dynamically scheduled for loop,
2302 saving the loop arguments.
2303 These functions are all identical apart from the types of the arguments.
2304 */
2305 
2306 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2307                             enum sched_type schedule, kmp_int32 lb,
2308                             kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2309   KMP_DEBUG_ASSERT(__kmp_init_serial);
2310 #if OMPT_SUPPORT && OMPT_OPTIONAL
2311   OMPT_STORE_RETURN_ADDRESS(gtid);
2312 #endif
2313   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2314 }
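
/* Illustrative usage (not emitted by any particular compiler): a lowering of
   "#pragma omp for schedule(dynamic, 4)" over iterations 0..99 drives these
   entry points roughly as follows; body() stands for the user's loop body
   and loc/gtid are whatever the caller already has. __kmpc_dispatch_fini_4
   additionally marks the end of the loop, but as noted earlier in this file
   it is not always called.

     static void run_dynamic_loop(ident_t *loc, kmp_int32 gtid) {
       kmp_int32 lb, ub, st, last;
       __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked,
                              0, 99, 1, 4);
       while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
         for (kmp_int32 i = lb; i <= ub; i += st)
           body(i); // hypothetical user loop body
       }
     }
*/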
2315 /*!
2316 See @ref __kmpc_dispatch_init_4
2317 */
2318 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2319                              enum sched_type schedule, kmp_uint32 lb,
2320                              kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2321   KMP_DEBUG_ASSERT(__kmp_init_serial);
2322 #if OMPT_SUPPORT && OMPT_OPTIONAL
2323   OMPT_STORE_RETURN_ADDRESS(gtid);
2324 #endif
2325   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2326 }
2327 
2328 /*!
2329 See @ref __kmpc_dispatch_init_4
2330 */
2331 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2332                             enum sched_type schedule, kmp_int64 lb,
2333                             kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2334   KMP_DEBUG_ASSERT(__kmp_init_serial);
2335 #if OMPT_SUPPORT && OMPT_OPTIONAL
2336   OMPT_STORE_RETURN_ADDRESS(gtid);
2337 #endif
2338   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2339 }
2340 
2341 /*!
2342 See @ref __kmpc_dispatch_init_4
2343 */
2344 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2345                              enum sched_type schedule, kmp_uint64 lb,
2346                              kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2347   KMP_DEBUG_ASSERT(__kmp_init_serial);
2348 #if OMPT_SUPPORT && OMPT_OPTIONAL
2349   OMPT_STORE_RETURN_ADDRESS(gtid);
2350 #endif
2351   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2352 }
2353 
2354 /*!
2355 See @ref __kmpc_dispatch_init_4
2356 
2357 These differ from the __kmpc_dispatch_init set of functions in that they are
2358 called for the composite distribute parallel for construct, so the per-team
2359 iteration space must be computed before the regular iterations are dispatched.
2360 
2361 These functions are all identical apart from the types of the arguments.
2362 */
2363 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2364                                  enum sched_type schedule, kmp_int32 *p_last,
2365                                  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2366                                  kmp_int32 chunk) {
2367   KMP_DEBUG_ASSERT(__kmp_init_serial);
2368 #if OMPT_SUPPORT && OMPT_OPTIONAL
2369   OMPT_STORE_RETURN_ADDRESS(gtid);
2370 #endif
2371   __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2372   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2373 }
2374 
2375 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2376                                   enum sched_type schedule, kmp_int32 *p_last,
2377                                   kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2378                                   kmp_int32 chunk) {
2379   KMP_DEBUG_ASSERT(__kmp_init_serial);
2380 #if OMPT_SUPPORT && OMPT_OPTIONAL
2381   OMPT_STORE_RETURN_ADDRESS(gtid);
2382 #endif
2383   __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2384   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2385 }
2386 
2387 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2388                                  enum sched_type schedule, kmp_int32 *p_last,
2389                                  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2390                                  kmp_int64 chunk) {
2391   KMP_DEBUG_ASSERT(__kmp_init_serial);
2392 #if OMPT_SUPPORT && OMPT_OPTIONAL
2393   OMPT_STORE_RETURN_ADDRESS(gtid);
2394 #endif
2395   __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2396   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2397 }
2398 
2399 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2400                                   enum sched_type schedule, kmp_int32 *p_last,
2401                                   kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2402                                   kmp_int64 chunk) {
2403   KMP_DEBUG_ASSERT(__kmp_init_serial);
2404 #if OMPT_SUPPORT && OMPT_OPTIONAL
2405   OMPT_STORE_RETURN_ADDRESS(gtid);
2406 #endif
2407   __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2408   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2409 }
2410 
2411 /*!
2412 @param loc Source code location
2413 @param gtid Global thread id
2414 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2415 otherwise
2416 @param p_lb   Pointer to the lower bound for the next chunk of work
2417 @param p_ub   Pointer to the upper bound for the next chunk of work
2418 @param p_st   Pointer to the stride for the next chunk of work
2419 @return one if there is work to be done, zero otherwise
2420 
2421 Get the next dynamically allocated chunk of work for this thread.
2422 If there is no more work, then lb, ub and stride need not be modified.
2423 */
2424 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2425                            kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2426 #if OMPT_SUPPORT && OMPT_OPTIONAL
2427   OMPT_STORE_RETURN_ADDRESS(gtid);
2428 #endif
2429   return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2430 #if OMPT_SUPPORT && OMPT_OPTIONAL
2431                                         ,
2432                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2433 #endif
2434   );
2435 }
2436 
2437 /*!
2438 See @ref __kmpc_dispatch_next_4
2439 */
2440 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2441                             kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2442                             kmp_int32 *p_st) {
2443 #if OMPT_SUPPORT && OMPT_OPTIONAL
2444   OMPT_STORE_RETURN_ADDRESS(gtid);
2445 #endif
2446   return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2447 #if OMPT_SUPPORT && OMPT_OPTIONAL
2448                                          ,
2449                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2450 #endif
2451   );
2452 }
2453 
2454 /*!
2455 See @ref __kmpc_dispatch_next_4
2456 */
2457 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2458                            kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2459 #if OMPT_SUPPORT && OMPT_OPTIONAL
2460   OMPT_STORE_RETURN_ADDRESS(gtid);
2461 #endif
2462   return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2463 #if OMPT_SUPPORT && OMPT_OPTIONAL
2464                                         ,
2465                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2466 #endif
2467   );
2468 }
2469 
2470 /*!
2471 See @ref __kmpc_dispatch_next_4
2472 */
2473 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2474                             kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2475                             kmp_int64 *p_st) {
2476 #if OMPT_SUPPORT && OMPT_OPTIONAL
2477   OMPT_STORE_RETURN_ADDRESS(gtid);
2478 #endif
2479   return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2480 #if OMPT_SUPPORT && OMPT_OPTIONAL
2481                                          ,
2482                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2483 #endif
2484   );
2485 }
2486 
2487 /*!
2488 @param loc Source code location
2489 @param gtid Global thread id
2490 
2491 Mark the end of a dynamic loop.
2492 */
2493 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2494   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2495 }
2496 
2497 /*!
2498 See @ref __kmpc_dispatch_fini_4
2499 */
2500 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2501   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2502 }
2503 
2504 /*!
2505 See @ref __kmpc_dispatch_fini_4
2506 */
2507 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2508   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2509 }
2510 
2511 /*!
2512 See @ref __kmpc_dispatch_fini_4
2513 */
2514 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2515   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2516 }
2517 /*! @} */
2518 
2519 //-----------------------------------------------------------------------------
2520 // Non-template routines from kmp_dispatch.cpp used in other sources
2521 
2522 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2523   return value == checker;
2524 }
2525 
2526 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2527   return value != checker;
2528 }
2529 
2530 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2531   return value < checker;
2532 }
2533 
2534 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2535   return value >= checker;
2536 }
2537 
2538 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2539   return value <= checker;
2540 }
2541 
2542 kmp_uint32
2543 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2544              kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2545              void *obj // Higher-level synchronization object, or NULL.
2546 ) {
2547   // note: we may not belong to a team at this point
2548   volatile kmp_uint32 *spin = spinner;
2549   kmp_uint32 check = checker;
2550   kmp_uint32 spins;
2551   kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2552   kmp_uint32 r;
2553 
2554   KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2555   KMP_INIT_YIELD(spins);
2556   // main wait spin loop
2557   while (!f(r = TCR_4(*spin), check)) {
2558     KMP_FSYNC_SPIN_PREPARE(obj);
2559     /* GEH - remove this since it was accidentally introduced when kmp_wait was
2560        split. It causes problems with infinite recursion because of exit lock */
2561     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2562         __kmp_abort_thread(); */
2563     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2564   }
2565   KMP_FSYNC_SPIN_ACQUIRED(obj);
2566   return r;
2567 }
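
/* Illustrative usage (not from the runtime): spin until a shared flag
   reaches a given value using one of the predicate helpers above; the
   variable names are invented for this sketch.

     volatile kmp_uint32 flag = 0; // set to 1 by another thread
     kmp_uint32 seen = __kmp_wait_4(&flag, 1, __kmp_eq_4, NULL);
     // seen == 1 here; NULL means no higher-level synchronization object
*/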
2568 
2569 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
2570                       kmp_uint32 (*pred)(void *, kmp_uint32),
2571                       void *obj // Higher-level synchronization object, or NULL.
2572 ) {
2573   // note: we may not belong to a team at this point
2574   void *spin = spinner;
2575   kmp_uint32 check = checker;
2576   kmp_uint32 spins;
2577   kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2578 
2579   KMP_FSYNC_SPIN_INIT(obj, spin);
2580   KMP_INIT_YIELD(spins);
2581   // main wait spin loop
2582   while (!f(spin, check)) {
2583     KMP_FSYNC_SPIN_PREPARE(obj);
2584     /* if we have waited a bit, or are oversubscribed, yield */
2585     /* pause is in the following code */
2586     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2587   }
2588   KMP_FSYNC_SPIN_ACQUIRED(obj);
2589 }
2590 
2591 } // extern "C"
2592 
2593 #ifdef KMP_GOMP_COMPAT
2594 
2595 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2596                                enum sched_type schedule, kmp_int32 lb,
2597                                kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2598                                int push_ws) {
2599   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2600                                  push_ws);
2601 }
2602 
2603 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2604                                 enum sched_type schedule, kmp_uint32 lb,
2605                                 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2606                                 int push_ws) {
2607   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2608                                   push_ws);
2609 }
2610 
2611 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2612                                enum sched_type schedule, kmp_int64 lb,
2613                                kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2614                                int push_ws) {
2615   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2616                                  push_ws);
2617 }
2618 
2619 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2620                                 enum sched_type schedule, kmp_uint64 lb,
2621                                 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2622                                 int push_ws) {
2623   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2624                                   push_ws);
2625 }
2626 
2627 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2628   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2629 }
2630 
2631 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2632   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2633 }
2634 
2635 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2636   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2637 }
2638 
2639 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2640   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2641 }
2642 
2643 #endif /* KMP_GOMP_COMPAT */
2644 
2645 /* ------------------------------------------------------------------------ */
2646