1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 /* Dynamic scheduling initialization and dispatch.
14  *
 * NOTE: __kmp_nth is constant inside any dispatch loop; however, it may
 *       change between parallel regions.  __kmp_max_nth is the largest value
 *       __kmp_nth can take, and 1 is the smallest.
18  */
19 
20 #include "kmp.h"
21 #include "kmp_error.h"
22 #include "kmp_i18n.h"
23 #include "kmp_itt.h"
24 #include "kmp_stats.h"
25 #include "kmp_str.h"
26 #if KMP_USE_X87CONTROL
27 #include <float.h>
28 #endif
29 #include "kmp_lock.h"
30 #include "kmp_dispatch.h"
31 #if KMP_USE_HIER_SCHED
32 #include "kmp_dispatch_hier.h"
33 #endif
34 
35 #if OMPT_SUPPORT
36 #include "ompt-specific.h"
37 #endif
38 
39 /* ------------------------------------------------------------------------ */
40 /* ------------------------------------------------------------------------ */
41 
42 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43   kmp_info_t *th;
44 
45   KMP_DEBUG_ASSERT(gtid_ref);
46 
47   if (__kmp_env_consistency_check) {
48     th = __kmp_threads[*gtid_ref];
49     if (th->th.th_root->r.r_active &&
50         (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51 #if KMP_USE_DYNAMIC_LOCK
52       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53 #else
54       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55 #endif
56     }
57   }
58 }
59 
60 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61   kmp_info_t *th;
62 
63   if (__kmp_env_consistency_check) {
64     th = __kmp_threads[*gtid_ref];
65     if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66       __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67     }
68   }
69 }
70 
// Initialize a dispatch_private_info_template<T> buffer for a particular
// type of schedule and chunk.  The loop description is given by lb (lower
// bound), ub (upper bound), and st (stride).  nproc is the number of threads
// relevant to the scheduling (often the number of threads in a team, but not
// always when hierarchical scheduling is used).  tid is the id of the calling
// thread within the group of nproc threads; it has a value between 0 and
// nproc - 1.  This is often just the thread id within a team, but that is not
// necessarily the case when hierarchical scheduling is used.
// loc is the source file location of the corresponding loop.
// gtid is the global thread id.
81 template <typename T>
82 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
83                                    dispatch_private_info_template<T> *pr,
84                                    enum sched_type schedule, T lb, T ub,
85                                    typename traits_t<T>::signed_t st,
86 #if USE_ITT_BUILD
87                                    kmp_uint64 *cur_chunk,
88 #endif
89                                    typename traits_t<T>::signed_t chunk,
90                                    T nproc, T tid) {
91   typedef typename traits_t<T>::unsigned_t UT;
92   typedef typename traits_t<T>::floating_t DBL;
93 
94   int active;
95   T tc;
96   kmp_info_t *th;
97   kmp_team_t *team;
98 
99 #ifdef KMP_DEBUG
100   typedef typename traits_t<T>::signed_t ST;
101   {
102     char *buff;
103     // create format specifiers before the debug output
104     buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
105                             "pr:%%p lb:%%%s ub:%%%s st:%%%s "
106                             "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
107                             traits_t<T>::spec, traits_t<T>::spec,
108                             traits_t<ST>::spec, traits_t<ST>::spec,
109                             traits_t<T>::spec, traits_t<T>::spec);
110     KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
111     __kmp_str_free(&buff);
112   }
113 #endif
114   /* setup data */
115   th = __kmp_threads[gtid];
116   team = th->th.th_team;
117   active = !team->t.t_serialized;
118 
119 #if USE_ITT_BUILD
120   int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
121                                     __kmp_forkjoin_frames_mode == 3 &&
122                                     KMP_MASTER_GTID(gtid) &&
123 #if OMP_40_ENABLED
124                                     th->th.th_teams_microtask == NULL &&
125 #endif
126                                     team->t.t_active_level == 1;
127 #endif
128 #if (KMP_STATIC_STEAL_ENABLED)
129   if (SCHEDULE_HAS_NONMONOTONIC(schedule))
130     // AC: we now have only one implementation of stealing, so use it
131     schedule = kmp_sch_static_steal;
132   else
133 #endif
134     schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
135 
136   /* Pick up the nomerge/ordered bits from the scheduling type */
137   if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
138     pr->flags.nomerge = TRUE;
139     schedule =
140         (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
141   } else {
142     pr->flags.nomerge = FALSE;
143   }
144   pr->type_size = traits_t<T>::type_size; // remember the size of variables
145   if (kmp_ord_lower & schedule) {
146     pr->flags.ordered = TRUE;
147     schedule =
148         (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
149   } else {
150     pr->flags.ordered = FALSE;
151   }
152 
153   if (schedule == kmp_sch_static) {
154     schedule = __kmp_static;
155   } else {
156     if (schedule == kmp_sch_runtime) {
157       // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
158       // not specified)
159       schedule = team->t.t_sched.r_sched_type;
160       // Detail the schedule if needed (global controls are differentiated
161       // appropriately)
162       if (schedule == kmp_sch_guided_chunked) {
163         schedule = __kmp_guided;
164       } else if (schedule == kmp_sch_static) {
165         schedule = __kmp_static;
166       }
167       // Use the chunk size specified by OMP_SCHEDULE (or default if not
168       // specified)
169       chunk = team->t.t_sched.chunk;
170 #if USE_ITT_BUILD
171       if (cur_chunk)
172         *cur_chunk = chunk;
173 #endif
174 #ifdef KMP_DEBUG
175       {
176         char *buff;
177         // create format specifiers before the debug output
178         buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
179                                 "schedule:%%d chunk:%%%s\n",
180                                 traits_t<ST>::spec);
181         KD_TRACE(10, (buff, gtid, schedule, chunk));
182         __kmp_str_free(&buff);
183       }
184 #endif
185     } else {
186       if (schedule == kmp_sch_guided_chunked) {
187         schedule = __kmp_guided;
188       }
189       if (chunk <= 0) {
190         chunk = KMP_DEFAULT_CHUNK;
191       }
192     }
193 
194     if (schedule == kmp_sch_auto) {
      // mapping and differentiation are done in __kmp_do_serial_initialize()
196       schedule = __kmp_auto;
197 #ifdef KMP_DEBUG
198       {
199         char *buff;
200         // create format specifiers before the debug output
201         buff = __kmp_str_format(
202             "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
203             "schedule:%%d chunk:%%%s\n",
204             traits_t<ST>::spec);
205         KD_TRACE(10, (buff, gtid, schedule, chunk));
206         __kmp_str_free(&buff);
207       }
208 #endif
209     }
210 
    /* guided analytical is not safe for very large thread counts (> 2^20) */
212     if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
213       schedule = kmp_sch_guided_iterative_chunked;
214       KMP_WARNING(DispatchManyThreads);
215     }
216 #if OMP_45_ENABLED
217     if (schedule == kmp_sch_runtime_simd) {
218       // compiler provides simd_width in the chunk parameter
219       schedule = team->t.t_sched.r_sched_type;
220       // Detail the schedule if needed (global controls are differentiated
221       // appropriately)
222       if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
223           schedule == __kmp_static) {
224         schedule = kmp_sch_static_balanced_chunked;
225       } else {
226         if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
227           schedule = kmp_sch_guided_simd;
228         }
229         chunk = team->t.t_sched.chunk * chunk;
230       }
231 #if USE_ITT_BUILD
232       if (cur_chunk)
233         *cur_chunk = chunk;
234 #endif
235 #ifdef KMP_DEBUG
236       {
237         char *buff;
238         // create format specifiers before the debug output
239         buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d"
240                                 " chunk:%%%s\n",
241                                 traits_t<ST>::spec);
242         KD_TRACE(10, (buff, gtid, schedule, chunk));
243         __kmp_str_free(&buff);
244       }
245 #endif
246     }
247 #endif // OMP_45_ENABLED
248     pr->u.p.parm1 = chunk;
249   }
250   KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
251               "unknown scheduling type");
252 
253   pr->u.p.count = 0;
254 
255   if (__kmp_env_consistency_check) {
256     if (st == 0) {
257       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
258                             (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
259     }
260   }
261   // compute trip count
262   if (st == 1) { // most common case
263     if (ub >= lb) {
264       tc = ub - lb + 1;
265     } else { // ub < lb
266       tc = 0; // zero-trip
267     }
268   } else if (st < 0) {
269     if (lb >= ub) {
270       // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
271       // where the division needs to be unsigned regardless of the result type
272       tc = (UT)(lb - ub) / (-st) + 1;
273     } else { // lb < ub
274       tc = 0; // zero-trip
275     }
276   } else { // st > 0
277     if (ub >= lb) {
278       // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
279       // where the division needs to be unsigned regardless of the result type
280       tc = (UT)(ub - lb) / st + 1;
281     } else { // ub < lb
282       tc = 0; // zero-trip
283     }
284   }
285 
286 #if KMP_STATS_ENABLED
287   if (KMP_MASTER_GTID(gtid)) {
288     KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
289   }
290 #endif
291 
292   pr->u.p.lb = lb;
293   pr->u.p.ub = ub;
294   pr->u.p.st = st;
295   pr->u.p.tc = tc;
296 
297 #if KMP_OS_WINDOWS
298   pr->u.p.last_upper = ub + st;
299 #endif /* KMP_OS_WINDOWS */
300 
  /* NOTE: only the active parallel region(s) have active ordered sections */
302 
303   if (active) {
304     if (pr->flags.ordered) {
305       pr->ordered_bumped = 0;
306       pr->u.p.ordered_lower = 1;
307       pr->u.p.ordered_upper = 0;
308     }
309   }
310 
311   switch (schedule) {
312 #if (KMP_STATIC_STEAL_ENABLED)
313   case kmp_sch_static_steal: {
314     T ntc, init;
315 
316     KD_TRACE(100,
317              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
318               gtid));
319 
320     ntc = (tc % chunk ? 1 : 0) + tc / chunk;
321     if (nproc > 1 && ntc >= nproc) {
322       KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
323       T id = tid;
324       T small_chunk, extras;
325 
326       small_chunk = ntc / nproc;
327       extras = ntc % nproc;
328 
329       init = id * small_chunk + (id < extras ? id : extras);
330       pr->u.p.count = init;
331       pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
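      // Example of the initial chunk distribution (illustrative): with
      // ntc = 10 chunks and nproc = 4 threads, small_chunk = 2 and extras = 2,
      // so threads 0..3 start with chunk-index ranges [0,3), [3,6), [6,8),
      // [8,10); the first 'extras' threads get one extra chunk each.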
332 
333       pr->u.p.parm2 = lb;
334       // pr->pfields.parm3 = 0; // it's not used in static_steal
335       pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
336       pr->u.p.st = st;
337       if (traits_t<T>::type_size > 4) {
338         // AC: TODO: check if 16-byte CAS available and use it to
339         // improve performance (probably wait for explicit request
340         // before spending time on this).
341         // For now use dynamically allocated per-thread lock,
342         // free memory in __kmp_dispatch_next when status==0.
343         KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
344         th->th.th_dispatch->th_steal_lock =
345             (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
346         __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
347       }
348       break;
349     } else {
350       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
351                      "kmp_sch_static_balanced\n",
352                      gtid));
353       schedule = kmp_sch_static_balanced;
354       /* too few iterations: fall-through to kmp_sch_static_balanced */
355     } // if
356     /* FALL-THROUGH to static balanced */
357     KMP_FALLTHROUGH();
358   } // case
359 #endif
360   case kmp_sch_static_balanced: {
361     T init, limit;
362 
363     KD_TRACE(
364         100,
365         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
366          gtid));
367 
368     if (nproc > 1) {
369       T id = tid;
370 
371       if (tc < nproc) {
372         if (id < tc) {
373           init = id;
374           limit = id;
375           pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
376         } else {
377           pr->u.p.count = 1; /* means no more chunks to execute */
378           pr->u.p.parm1 = FALSE;
379           break;
380         }
381       } else {
382         T small_chunk = tc / nproc;
383         T extras = tc % nproc;
384         init = id * small_chunk + (id < extras ? id : extras);
385         limit = init + small_chunk - (id < extras ? 0 : 1);
386         pr->u.p.parm1 = (id == nproc - 1);
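        // Illustrative split: tc = 10, nproc = 4 gives small_chunk = 2 and
        // extras = 2, so threads 0..3 own iteration ranges [0,2], [3,5],
        // [6,7], [8,9]; the first 'extras' threads get one extra iteration.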
387       }
388     } else {
389       if (tc > 0) {
390         init = 0;
391         limit = tc - 1;
392         pr->u.p.parm1 = TRUE;
393       } else {
394         // zero trip count
395         pr->u.p.count = 1; /* means no more chunks to execute */
396         pr->u.p.parm1 = FALSE;
397         break;
398       }
399     }
400 #if USE_ITT_BUILD
401     // Calculate chunk for metadata report
402     if (itt_need_metadata_reporting)
403       if (cur_chunk)
404         *cur_chunk = limit - init + 1;
405 #endif
406     if (st == 1) {
407       pr->u.p.lb = lb + init;
408       pr->u.p.ub = lb + limit;
409     } else {
      // ub_tmp is the calculated upper bound; "ub" is the user-defined bound
411       T ub_tmp = lb + limit * st;
412       pr->u.p.lb = lb + init * st;
413       // adjust upper bound to "ub" if needed, so that MS lastprivate will match
414       // it exactly
415       if (st > 0) {
416         pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
417       } else {
418         pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
419       }
420     }
421     if (pr->flags.ordered) {
422       pr->u.p.ordered_lower = init;
423       pr->u.p.ordered_upper = limit;
424     }
425     break;
426   } // case
427 #if OMP_45_ENABLED
428   case kmp_sch_static_balanced_chunked: {
429     // similar to balanced, but chunk adjusted to multiple of simd width
430     T nth = nproc;
431     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
432                    " -> falling-through to static_greedy\n",
433                    gtid));
434     schedule = kmp_sch_static_greedy;
435     if (nth > 1)
436       pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
437     else
438       pr->u.p.parm1 = tc;
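    // Example for the nth > 1 branch above: tc = 1000, nth = 8, chunk (simd
    // width) = 16 -> ceil(1000/8) = 125, rounded up to a multiple of 16 gives
    // parm1 = 128.  The "& ~(chunk - 1)" rounding assumes chunk is a power of
    // two.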
439     break;
440   } // case
441   case kmp_sch_guided_simd:
442 #endif // OMP_45_ENABLED
443   case kmp_sch_guided_iterative_chunked: {
444     KD_TRACE(
445         100,
446         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
447          " case\n",
448          gtid));
449 
450     if (nproc > 1) {
451       if ((2L * chunk + 1) * nproc >= tc) {
452         /* chunk size too large, switch to dynamic */
453         schedule = kmp_sch_dynamic_chunked;
454       } else {
        // when the remaining iterations drop below parm2, switch to dynamic
456         pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
457         *(double *)&pr->u.p.parm3 =
458             guided_flt_param / nproc; // may occupy parm3 and parm4
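        // Illustrative: with the default K = 2 (guided_int_param), each
        // dispatch claims roughly remaining/(2*nproc) iterations (the parm3
        // factor used in __kmp_dispatch_next_algorithm), and the schedule
        // switches to plain dynamic chunks once fewer than 2*nproc*(chunk+1)
        // iterations remain (parm2).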
459       }
460     } else {
461       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
462                      "kmp_sch_static_greedy\n",
463                      gtid));
464       schedule = kmp_sch_static_greedy;
465       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
466       KD_TRACE(
467           100,
468           ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
469            gtid));
470       pr->u.p.parm1 = tc;
471     } // if
472   } // case
473   break;
474   case kmp_sch_guided_analytical_chunked: {
475     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
476                    "kmp_sch_guided_analytical_chunked case\n",
477                    gtid));
478 
479     if (nproc > 1) {
480       if ((2L * chunk + 1) * nproc >= tc) {
481         /* chunk size too large, switch to dynamic */
482         schedule = kmp_sch_dynamic_chunked;
483       } else {
484         /* commonly used term: (2 nproc - 1)/(2 nproc) */
485         DBL x;
486 
487 #if KMP_USE_X87CONTROL
488         /* Linux* OS already has 64-bit computation by default for long double,
489            and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
490            Windows* OS on IA-32 architecture, we need to set precision to 64-bit
491            instead of the default 53-bit. Even though long double doesn't work
492            on Windows* OS on Intel(R) 64, the resulting lack of precision is not
493            expected to impact the correctness of the algorithm, but this has not
494            been mathematically proven. */
495         // save original FPCW and set precision to 64-bit, as
496         // Windows* OS on IA-32 architecture defaults to 53-bit
497         unsigned int oldFpcw = _control87(0, 0);
498         _control87(_PC_64, _MCW_PC); // 0,0x30000
499 #endif
500         /* value used for comparison in solver for cross-over point */
501         long double target = ((long double)chunk * 2 + 1) * nproc / tc;
502 
503         /* crossover point--chunk indexes equal to or greater than
504            this point switch to dynamic-style scheduling */
505         UT cross;
506 
507         /* commonly used term: (2 nproc - 1)/(2 nproc) */
508         x = (long double)1.0 - (long double)0.5 / nproc;
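        // The bisection solver below finds the smallest 'cross' such that
        // x^cross <= target, i.e. the first chunk index at which the
        // per-thread share x^cross * tc / nproc shrinks to roughly
        // (2*chunk + 1) iterations; chunk indexes at or beyond that point use
        // dynamic-style scheduling.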
509 
510 #ifdef KMP_DEBUG
511         { // test natural alignment
512           struct _test_a {
513             char a;
514             union {
515               char b;
516               DBL d;
517             };
518           } t;
519           ptrdiff_t natural_alignment =
520               (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
521           //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
522           // long)natural_alignment );
523           KMP_DEBUG_ASSERT(
524               (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
525         }
526 #endif // KMP_DEBUG
527 
528         /* save the term in thread private dispatch structure */
529         *(DBL *)&pr->u.p.parm3 = x;
530 
531         /* solve for the crossover point to the nearest integer i for which C_i
532            <= chunk */
533         {
534           UT left, right, mid;
535           long double p;
536 
537           /* estimate initial upper and lower bound */
538 
539           /* doesn't matter what value right is as long as it is positive, but
540              it affects performance of the solver */
541           right = 229;
542           p = __kmp_pow<UT>(x, right);
543           if (p > target) {
544             do {
545               p *= p;
546               right <<= 1;
547             } while (p > target && right < (1 << 27));
548             /* lower bound is previous (failed) estimate of upper bound */
549             left = right >> 1;
550           } else {
551             left = 0;
552           }
553 
554           /* bisection root-finding method */
555           while (left + 1 < right) {
556             mid = (left + right) / 2;
557             if (__kmp_pow<UT>(x, mid) > target) {
558               left = mid;
559             } else {
560               right = mid;
561             }
562           } // while
563           cross = right;
564         }
565         /* assert sanity of computed crossover point */
566         KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
567                    __kmp_pow<UT>(x, cross) <= target);
568 
569         /* save the crossover point in thread private dispatch structure */
570         pr->u.p.parm2 = cross;
571 
572 // C75803
573 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
574 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
575 #else
576 #define GUIDED_ANALYTICAL_WORKAROUND (x)
577 #endif
578         /* dynamic-style scheduling offset */
579         pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
580                                  tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
581                         cross * chunk;
582 #if KMP_USE_X87CONTROL
583         // restore FPCW
584         _control87(oldFpcw, _MCW_PC);
585 #endif
586       } // if
587     } else {
588       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
589                      "kmp_sch_static_greedy\n",
590                      gtid));
591       schedule = kmp_sch_static_greedy;
592       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
593       pr->u.p.parm1 = tc;
594     } // if
595   } // case
596   break;
597   case kmp_sch_static_greedy:
598     KD_TRACE(
599         100,
600         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
601          gtid));
602     pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
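    // Each thread effectively takes a single contiguous block of up to
    // ceil(tc / nproc) iterations, e.g. tc = 100, nproc = 4 -> parm1 = 25.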
603     break;
604   case kmp_sch_static_chunked:
605   case kmp_sch_dynamic_chunked:
606     if (pr->u.p.parm1 <= 0) {
607       pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
608     }
609     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
610                    "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
611                    gtid));
612     break;
613   case kmp_sch_trapezoidal: {
614     /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
615 
616     T parm1, parm2, parm3, parm4;
617     KD_TRACE(100,
618              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
619               gtid));
620 
621     parm1 = chunk;
622 
623     /* F : size of the first cycle */
624     parm2 = (tc / (2 * nproc));
625 
626     if (parm2 < 1) {
627       parm2 = 1;
628     }
629 
630     /* L : size of the last cycle.  Make sure the last cycle is not larger
631        than the first cycle. */
632     if (parm1 < 1) {
633       parm1 = 1;
634     } else if (parm1 > parm2) {
635       parm1 = parm2;
636     }
637 
638     /* N : number of cycles */
639     parm3 = (parm2 + parm1);
640     parm3 = (2 * tc + parm3 - 1) / parm3;
641 
642     if (parm3 < 2) {
643       parm3 = 2;
644     }
645 
646     /* sigma : decreasing incr of the trapezoid */
647     parm4 = (parm3 - 1);
648     parm4 = (parm2 - parm1) / parm4;
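    // Illustrative TSS parameters: tc = 100, nproc = 2, chunk = 5 gives
    // F = parm2 = 25, L = parm1 = 5, N = parm3 = 7, sigma = parm4 = 3,
    // i.e. successive chunk sizes 25, 22, 19, 16, 13, 10, 7 (sum >= tc).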
649 
650     // pointless check, because parm4 >= 0 always
651     // if ( parm4 < 0 ) {
652     //    parm4 = 0;
653     //}
654 
655     pr->u.p.parm1 = parm1;
656     pr->u.p.parm2 = parm2;
657     pr->u.p.parm3 = parm3;
658     pr->u.p.parm4 = parm4;
659   } // case
660   break;
661 
662   default: {
663     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
664                 KMP_HNT(GetNewerLibrary), // Hint
665                 __kmp_msg_null // Variadic argument list terminator
666                 );
667   } break;
668   } // switch
669   pr->schedule = schedule;
670 }
671 
672 #if KMP_USE_HIER_SCHED
673 template <typename T>
674 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
675                                              typename traits_t<T>::signed_t st);
676 template <>
677 inline void
678 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
679                                             kmp_int32 ub, kmp_int32 st) {
680   __kmp_dispatch_init_hierarchy<kmp_int32>(
681       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
682       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
683 }
684 template <>
685 inline void
686 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
687                                              kmp_uint32 ub, kmp_int32 st) {
688   __kmp_dispatch_init_hierarchy<kmp_uint32>(
689       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
690       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
691 }
692 template <>
693 inline void
694 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
695                                             kmp_int64 ub, kmp_int64 st) {
696   __kmp_dispatch_init_hierarchy<kmp_int64>(
697       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
698       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
699 }
700 template <>
701 inline void
702 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
703                                              kmp_uint64 ub, kmp_int64 st) {
704   __kmp_dispatch_init_hierarchy<kmp_uint64>(
705       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
706       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
707 }
708 
709 // free all the hierarchy scheduling memory associated with the team
710 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
711   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
712   for (int i = 0; i < num_disp_buff; ++i) {
713     // type does not matter here so use kmp_int32
714     auto sh =
715         reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
716             &team->t.t_disp_buffer[i]);
717     if (sh->hier) {
718       sh->hier->deallocate();
719       __kmp_free(sh->hier);
720     }
721   }
722 }
723 #endif
724 
725 // UT - unsigned flavor of T, ST - signed flavor of T,
726 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
727 template <typename T>
728 static void
729 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
730                     T ub, typename traits_t<T>::signed_t st,
731                     typename traits_t<T>::signed_t chunk, int push_ws) {
732   typedef typename traits_t<T>::unsigned_t UT;
733 
734   int active;
735   kmp_info_t *th;
736   kmp_team_t *team;
737   kmp_uint32 my_buffer_index;
738   dispatch_private_info_template<T> *pr;
739   dispatch_shared_info_template<T> volatile *sh;
740 
741   KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
742                    sizeof(dispatch_private_info));
743   KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
744                    sizeof(dispatch_shared_info));
745 
746   if (!TCR_4(__kmp_init_parallel))
747     __kmp_parallel_initialize();
748 
749 #if OMP_50_ENABLED
750   __kmp_resume_if_soft_paused();
751 #endif
752 
753 #if INCLUDE_SSC_MARKS
754   SSC_MARK_DISPATCH_INIT();
755 #endif
756 #ifdef KMP_DEBUG
757   typedef typename traits_t<T>::signed_t ST;
758   {
759     char *buff;
760     // create format specifiers before the debug output
761     buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
762                             "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
763                             traits_t<ST>::spec, traits_t<T>::spec,
764                             traits_t<T>::spec, traits_t<ST>::spec);
765     KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
766     __kmp_str_free(&buff);
767   }
768 #endif
769   /* setup data */
770   th = __kmp_threads[gtid];
771   team = th->th.th_team;
772   active = !team->t.t_serialized;
773   th->th.th_ident = loc;
774 
775   // Any half-decent optimizer will remove this test when the blocks are empty
776   // since the macros expand to nothing
777   // when statistics are disabled.
778   if (schedule == __kmp_static) {
779     KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
780   } else {
781     KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
782   }
783 
784 #if KMP_USE_HIER_SCHED
  // Initialize the scheduling hierarchy if requested in the OMP_SCHEDULE
  // environment variable.  Hierarchical scheduling does not work with ordered,
  // so if ordered is detected, revert to threaded scheduling.
788   bool ordered;
789   enum sched_type my_sched = schedule;
790   my_buffer_index = th->th.th_dispatch->th_disp_index;
791   pr = reinterpret_cast<dispatch_private_info_template<T> *>(
792       &th->th.th_dispatch
793            ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
794   my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
795   if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
796     my_sched =
797         (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
798   ordered = (kmp_ord_lower & my_sched);
799   if (pr->flags.use_hier) {
800     if (ordered) {
801       KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected.  "
802                      "Disabling hierarchical scheduling.\n",
803                      gtid));
804       pr->flags.use_hier = FALSE;
805     }
806   }
807   if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
808     // Don't use hierarchical for ordered parallel loops and don't
809     // use the runtime hierarchy if one was specified in the program
810     if (!ordered && !pr->flags.use_hier)
811       __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
812   }
813 #endif // KMP_USE_HIER_SCHED
814 
815 #if USE_ITT_BUILD
816   kmp_uint64 cur_chunk = chunk;
817   int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
818                                     __kmp_forkjoin_frames_mode == 3 &&
819                                     KMP_MASTER_GTID(gtid) &&
820 #if OMP_40_ENABLED
821                                     th->th.th_teams_microtask == NULL &&
822 #endif
823                                     team->t.t_active_level == 1;
824 #endif
825   if (!active) {
826     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
827         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
828   } else {
829     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
830                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
831 
832     my_buffer_index = th->th.th_dispatch->th_disp_index++;
833 
    /* What happens when the number of threads changes? Do we need to resize
       the buffer? */
835     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
836         &th->th.th_dispatch
837              ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
838     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
839         &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
840     KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
841                   my_buffer_index));
842   }
843 
844   __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
845 #if USE_ITT_BUILD
846                                 &cur_chunk,
847 #endif
848                                 chunk, (T)th->th.th_team_nproc,
849                                 (T)th->th.th_info.ds.ds_tid);
850   if (active) {
851     if (pr->flags.ordered == 0) {
852       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
853       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
854     } else {
855       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
856       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
857     }
858   }
859 
860   if (active) {
    /* The buffer becomes free for this thread to use once sh->buffer_index
     * reaches my_buffer_index. */
863 
864     KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
865                    "sh->buffer_index:%d\n",
866                    gtid, my_buffer_index, sh->buffer_index));
867     __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
868                            __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    // Note: KMP_WAIT() cannot be used here: the buffer index and
    // my_buffer_index are *always* 32-bit integers.
871     KMP_MB(); /* is this necessary? */
872     KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
873                    "sh->buffer_index:%d\n",
874                    gtid, my_buffer_index, sh->buffer_index));
875 
876     th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
877     th->th.th_dispatch->th_dispatch_sh_current =
878         CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
879 #if USE_ITT_BUILD
880     if (pr->flags.ordered) {
881       __kmp_itt_ordered_init(gtid);
882     }
883     // Report loop metadata
884     if (itt_need_metadata_reporting) {
885       // Only report metadata by master of active team at level 1
886       kmp_uint64 schedtype = 0;
887       switch (schedule) {
888       case kmp_sch_static_chunked:
889       case kmp_sch_static_balanced: // Chunk is calculated in the switch above
890         break;
891       case kmp_sch_static_greedy:
892         cur_chunk = pr->u.p.parm1;
893         break;
894       case kmp_sch_dynamic_chunked:
895         schedtype = 1;
896         break;
897       case kmp_sch_guided_iterative_chunked:
898       case kmp_sch_guided_analytical_chunked:
899 #if OMP_45_ENABLED
900       case kmp_sch_guided_simd:
901 #endif
902         schedtype = 2;
903         break;
904       default:
905         // Should we put this case under "static"?
906         // case kmp_sch_static_steal:
907         schedtype = 3;
908         break;
909       }
910       __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
911     }
912 #if KMP_USE_HIER_SCHED
913     if (pr->flags.use_hier) {
914       pr->u.p.count = 0;
915       pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
916     }
#endif // KMP_USE_HIER_SCHED
918 #endif /* USE_ITT_BUILD */
919   }
920 
921 #ifdef KMP_DEBUG
922   {
923     char *buff;
924     // create format specifiers before the debug output
925     buff = __kmp_str_format(
926         "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
927         "lb:%%%s ub:%%%s"
928         " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
929         " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
930         traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
931         traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
932         traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
933         traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
934     KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
935                   pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
936                   pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
937                   pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
938     __kmp_str_free(&buff);
939   }
940 #endif
941 #if (KMP_STATIC_STEAL_ENABLED)
  // After executing a loop with some other schedule kind there is no guarantee
  // that all the parm3 fields will contain the same value. Even if they did,
  // toggling between values such as 0 and 1 would be fragile compared to a
  // counter incremented over the program's lifetime, so a dedicated variable,
  // 'static_steal_counter', is used.
947   if (schedule == kmp_sch_static_steal) {
    // Other threads will inspect this variable when searching for a victim.
    // It serves as a flag indicating that, from this point on, other threads
    // may steal from this thread.
951     volatile T *p = &pr->u.p.static_steal_counter;
952     *p = *p + 1;
953   }
954 #endif // ( KMP_STATIC_STEAL_ENABLED )
955 
956 #if OMPT_SUPPORT && OMPT_OPTIONAL
957   if (ompt_enabled.ompt_callback_work) {
958     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
959     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
960     ompt_callbacks.ompt_callback(ompt_callback_work)(
961         ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
962         &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
963   }
964 #endif
965   KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
966 }
967 
968 /* For ordered loops, either __kmp_dispatch_finish() should be called after
969  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
970  * every chunk of iterations.  If the ordered section(s) were not executed
971  * for this iteration (or every iteration in this chunk), we need to set the
972  * ordered iteration counters so that the next thread can proceed. */
973 template <typename UT>
974 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
975   typedef typename traits_t<UT>::signed_t ST;
976   kmp_info_t *th = __kmp_threads[gtid];
977 
978   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
979   if (!th->th.th_team->t.t_serialized) {
980 
981     dispatch_private_info_template<UT> *pr =
982         reinterpret_cast<dispatch_private_info_template<UT> *>(
983             th->th.th_dispatch->th_dispatch_pr_current);
984     dispatch_shared_info_template<UT> volatile *sh =
985         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
986             th->th.th_dispatch->th_dispatch_sh_current);
987     KMP_DEBUG_ASSERT(pr);
988     KMP_DEBUG_ASSERT(sh);
989     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
990                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
991 
992     if (pr->ordered_bumped) {
993       KD_TRACE(
994           1000,
995           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
996            gtid));
997       pr->ordered_bumped = 0;
998     } else {
999       UT lower = pr->u.p.ordered_lower;
1000 
1001 #ifdef KMP_DEBUG
1002       {
1003         char *buff;
1004         // create format specifiers before the debug output
1005         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1006                                 "ordered_iteration:%%%s lower:%%%s\n",
1007                                 traits_t<UT>::spec, traits_t<UT>::spec);
1008         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1009         __kmp_str_free(&buff);
1010       }
1011 #endif
1012 
1013       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1014                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1015       KMP_MB(); /* is this necessary? */
1016 #ifdef KMP_DEBUG
1017       {
1018         char *buff;
1019         // create format specifiers before the debug output
1020         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1021                                 "ordered_iteration:%%%s lower:%%%s\n",
1022                                 traits_t<UT>::spec, traits_t<UT>::spec);
1023         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1024         __kmp_str_free(&buff);
1025       }
1026 #endif
1027 
1028       test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1029     } // if
1030   } // if
1031   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1032 }
1033 
1034 #ifdef KMP_GOMP_COMPAT
1035 
1036 template <typename UT>
1037 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1038   typedef typename traits_t<UT>::signed_t ST;
1039   kmp_info_t *th = __kmp_threads[gtid];
1040 
1041   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1042   if (!th->th.th_team->t.t_serialized) {
1043     //        int cid;
1044     dispatch_private_info_template<UT> *pr =
1045         reinterpret_cast<dispatch_private_info_template<UT> *>(
1046             th->th.th_dispatch->th_dispatch_pr_current);
1047     dispatch_shared_info_template<UT> volatile *sh =
1048         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1049             th->th.th_dispatch->th_dispatch_sh_current);
1050     KMP_DEBUG_ASSERT(pr);
1051     KMP_DEBUG_ASSERT(sh);
1052     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1053                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1054 
1055     //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1056     UT lower = pr->u.p.ordered_lower;
1057     UT upper = pr->u.p.ordered_upper;
1058     UT inc = upper - lower + 1;
1059 
1060     if (pr->ordered_bumped == inc) {
1061       KD_TRACE(
1062           1000,
1063           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1064            gtid));
1065       pr->ordered_bumped = 0;
1066     } else {
1067       inc -= pr->ordered_bumped;
1068 
1069 #ifdef KMP_DEBUG
1070       {
1071         char *buff;
1072         // create format specifiers before the debug output
1073         buff = __kmp_str_format(
1074             "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1075             "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1076             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1077         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1078         __kmp_str_free(&buff);
1079       }
1080 #endif
1081 
1082       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1083                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1084 
1085       KMP_MB(); /* is this necessary? */
1086       KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1087                       "ordered_bumped to zero\n",
1088                       gtid));
1089       pr->ordered_bumped = 0;
// TODO: check whether the inc should be unsigned or signed
1091 #ifdef KMP_DEBUG
1092       {
1093         char *buff;
1094         // create format specifiers before the debug output
1095         buff = __kmp_str_format(
1096             "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1097             "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1098             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1099             traits_t<UT>::spec);
1100         KD_TRACE(1000,
1101                  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1102         __kmp_str_free(&buff);
1103       }
1104 #endif
1105 
1106       test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1107     }
1108     //        }
1109   }
1110   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1111 }
1112 
1113 #endif /* KMP_GOMP_COMPAT */
1114 
1115 template <typename T>
1116 int __kmp_dispatch_next_algorithm(int gtid,
1117                                   dispatch_private_info_template<T> *pr,
1118                                   dispatch_shared_info_template<T> volatile *sh,
1119                                   kmp_int32 *p_last, T *p_lb, T *p_ub,
1120                                   typename traits_t<T>::signed_t *p_st, T nproc,
1121                                   T tid) {
1122   typedef typename traits_t<T>::unsigned_t UT;
1123   typedef typename traits_t<T>::signed_t ST;
1124   typedef typename traits_t<T>::floating_t DBL;
1125   int status = 0;
1126   kmp_int32 last = 0;
1127   T start;
1128   ST incr;
1129   UT limit, trip, init;
1130   kmp_info_t *th = __kmp_threads[gtid];
1131   kmp_team_t *team = th->th.th_team;
1132 
1133   KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1134                    &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1135   KMP_DEBUG_ASSERT(pr);
1136   KMP_DEBUG_ASSERT(sh);
1137   KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1138 #ifdef KMP_DEBUG
1139   {
1140     char *buff;
1141     // create format specifiers before the debug output
1142     buff =
1143         __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1144                          "sh:%%p nproc:%%%s tid:%%%s\n",
1145                          traits_t<T>::spec, traits_t<T>::spec);
1146     KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1147     __kmp_str_free(&buff);
1148   }
1149 #endif
1150 
1151   // zero trip count
1152   if (pr->u.p.tc == 0) {
1153     KD_TRACE(10,
1154              ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1155               "zero status:%d\n",
1156               gtid, status));
1157     return 0;
1158   }
1159 
1160   switch (pr->schedule) {
1161 #if (KMP_STATIC_STEAL_ENABLED)
1162   case kmp_sch_static_steal: {
1163     T chunk = pr->u.p.parm1;
1164 
1165     KD_TRACE(100,
1166              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1167               gtid));
1168 
1169     trip = pr->u.p.tc - 1;
1170 
1171     if (traits_t<T>::type_size > 4) {
1172       // use lock for 8-byte and CAS for 4-byte induction
1173       // variable. TODO (optional): check and use 16-byte CAS
1174       kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
1175       KMP_DEBUG_ASSERT(lck != NULL);
1176       if (pr->u.p.count < (UT)pr->u.p.ub) {
1177         __kmp_acquire_lock(lck, gtid);
1178         // try to get own chunk of iterations
1179         init = (pr->u.p.count)++;
1180         status = (init < (UT)pr->u.p.ub);
1181         __kmp_release_lock(lck, gtid);
1182       } else {
1183         status = 0; // no own chunks
1184       }
1185       if (!status) { // try to steal
1186         kmp_info_t **other_threads = team->t.t_threads;
1187         int while_limit = nproc; // nproc attempts to find a victim
1188         int while_index = 0;
        // TODO: the victim-search algorithm should be
        // cleaned up and measured
1191         while ((!status) && (while_limit != ++while_index)) {
1192           T remaining;
1193           T victimIdx = pr->u.p.parm4;
1194           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1195           dispatch_private_info_template<T> *victim =
1196               reinterpret_cast<dispatch_private_info_template<T> *>(
1197                   other_threads[victimIdx]
1198                       ->th.th_dispatch->th_dispatch_pr_current);
1199           while ((victim == NULL || victim == pr ||
1200                   (*(volatile T *)&victim->u.p.static_steal_counter !=
1201                    *(volatile T *)&pr->u.p.static_steal_counter)) &&
1202                  oldVictimIdx != victimIdx) {
1203             victimIdx = (victimIdx + 1) % nproc;
1204             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1205                 other_threads[victimIdx]
1206                     ->th.th_dispatch->th_dispatch_pr_current);
1207           }
1208           if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1209                           *(volatile T *)&pr->u.p.static_steal_counter)) {
1210             continue; // try once more (nproc attempts in total)
1211             // no victim is ready yet to participate in stealing
1212             // because all victims are still in kmp_init_dispatch
1213           }
1214           if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1215             pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1216             continue; // not enough chunks to steal, goto next victim
1217           }
1218 
1219           lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1220           KMP_ASSERT(lck != NULL);
1221           __kmp_acquire_lock(lck, gtid);
1222           limit = victim->u.p.ub; // keep initial ub
1223           if (victim->u.p.count >= limit ||
1224               (remaining = limit - victim->u.p.count) < 2) {
1225             __kmp_release_lock(lck, gtid);
1226             pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1227             continue; // not enough chunks to steal
1228           }
          // stealing succeeded, reduce victim's ub by 1/4 of the remaining
          // chunks or by 1
1231           if (remaining > 3) {
1232             // steal 1/4 of remaining
1233             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1234             init = (victim->u.p.ub -= (remaining >> 2));
1235           } else {
1236             // steal 1 chunk of 2 or 3 remaining
1237             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1238             init = (victim->u.p.ub -= 1);
1239           }
1240           __kmp_release_lock(lck, gtid);
1241 
1242           KMP_DEBUG_ASSERT(init + 1 <= limit);
1243           pr->u.p.parm4 = victimIdx; // remember victim to steal from
1244           status = 1;
1245           while_index = 0;
          // now update own count and ub with the stolen range, excluding the
          // init chunk taken here
1247           __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1248           pr->u.p.count = init + 1;
1249           pr->u.p.ub = limit;
1250           __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1251         } // while (search for victim)
1252       } // if (try to find victim and steal)
1253     } else {
1254       // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1255       typedef union {
1256         struct {
1257           UT count;
1258           T ub;
1259         } p;
1260         kmp_int64 b;
1261       } union_i4;
      // All operations on 'count' or 'ub' must be performed atomically on the
      // (count, ub) pair as a whole.
1264       {
1265         union_i4 vold, vnew;
1266         vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1267         vnew = vold;
1268         vnew.p.count++;
1269         while (!KMP_COMPARE_AND_STORE_ACQ64(
1270             (volatile kmp_int64 *)&pr->u.p.count,
1271             *VOLATILE_CAST(kmp_int64 *) & vold.b,
1272             *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1273           KMP_CPU_PAUSE();
1274           vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1275           vnew = vold;
1276           vnew.p.count++;
1277         }
1278         vnew = vold;
1279         init = vnew.p.count;
1280         status = (init < (UT)vnew.p.ub);
1281       }
1282 
1283       if (!status) {
1284         kmp_info_t **other_threads = team->t.t_threads;
1285         int while_limit = nproc; // nproc attempts to find a victim
1286         int while_index = 0;
1287 
        // TODO: the victim-search algorithm should be
        // cleaned up and measured
1290         while ((!status) && (while_limit != ++while_index)) {
1291           union_i4 vold, vnew;
1292           kmp_int32 remaining;
1293           T victimIdx = pr->u.p.parm4;
1294           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1295           dispatch_private_info_template<T> *victim =
1296               reinterpret_cast<dispatch_private_info_template<T> *>(
1297                   other_threads[victimIdx]
1298                       ->th.th_dispatch->th_dispatch_pr_current);
1299           while ((victim == NULL || victim == pr ||
1300                   (*(volatile T *)&victim->u.p.static_steal_counter !=
1301                    *(volatile T *)&pr->u.p.static_steal_counter)) &&
1302                  oldVictimIdx != victimIdx) {
1303             victimIdx = (victimIdx + 1) % nproc;
1304             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1305                 other_threads[victimIdx]
1306                     ->th.th_dispatch->th_dispatch_pr_current);
1307           }
1308           if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1309                           *(volatile T *)&pr->u.p.static_steal_counter)) {
1310             continue; // try once more (nproc attempts in total)
1311             // no victim is ready yet to participate in stealing
1312             // because all victims are still in kmp_init_dispatch
1313           }
1314           pr->u.p.parm4 = victimIdx; // new victim found
1315           while (1) { // CAS loop if victim has enough chunks to steal
1316             vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1317             vnew = vold;
1318 
1319             KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1320             if (vnew.p.count >= (UT)vnew.p.ub ||
1321                 (remaining = vnew.p.ub - vnew.p.count) < 2) {
1322               pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1323               break; // not enough chunks to steal, goto next victim
1324             }
1325             if (remaining > 3) {
1326               vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
1327             } else {
1328               vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1329             }
1330             KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1331             // TODO: Should this be acquire or release?
1332             if (KMP_COMPARE_AND_STORE_ACQ64(
1333                     (volatile kmp_int64 *)&victim->u.p.count,
1334                     *VOLATILE_CAST(kmp_int64 *) & vold.b,
1335                     *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
              // stealing succeeded
1337               KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1338                                         vold.p.ub - vnew.p.ub);
1339               status = 1;
1340               while_index = 0;
1341               // now update own count and ub
1342               init = vnew.p.ub;
1343               vold.p.count = init + 1;
1344 #if KMP_ARCH_X86
1345               KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1346 #else
1347               *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1348 #endif
1349               break;
1350             } // if (check CAS result)
            KMP_CPU_PAUSE(); // CAS failed, repeat the attempt
1352           } // while (try to steal from particular victim)
1353         } // while (search for victim)
1354       } // if (try to find victim and steal)
1355     } // if (4-byte induction variable)
1356     if (!status) {
1357       *p_lb = 0;
1358       *p_ub = 0;
1359       if (p_st != NULL)
1360         *p_st = 0;
1361     } else {
1362       start = pr->u.p.parm2;
1363       init *= chunk;
1364       limit = chunk + init - 1;
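      // init/limit are now iteration offsets (in trip-count space) of this
      // thread's chunk; they are mapped into the user's iteration space below
      // using start (the lb saved in parm2) and incr.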
1365       incr = pr->u.p.st;
1366       KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1367 
1368       KMP_DEBUG_ASSERT(init <= trip);
1369       if ((last = (limit >= trip)) != 0)
1370         limit = trip;
1371       if (p_st != NULL)
1372         *p_st = incr;
1373 
1374       if (incr == 1) {
1375         *p_lb = start + init;
1376         *p_ub = start + limit;
1377       } else {
1378         *p_lb = start + init * incr;
1379         *p_ub = start + limit * incr;
1380       }
1381 
1382       if (pr->flags.ordered) {
1383         pr->u.p.ordered_lower = init;
1384         pr->u.p.ordered_upper = limit;
1385       } // if
1386     } // if
1387     break;
1388   } // case
1389 #endif // ( KMP_STATIC_STEAL_ENABLED )
1390   case kmp_sch_static_balanced: {
1391     KD_TRACE(
1392         10,
1393         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1394          gtid));
1395     /* check if thread has any iteration to do */
1396     if ((status = !pr->u.p.count) != 0) {
1397       pr->u.p.count = 1;
1398       *p_lb = pr->u.p.lb;
1399       *p_ub = pr->u.p.ub;
1400       last = pr->u.p.parm1;
1401       if (p_st != NULL)
1402         *p_st = pr->u.p.st;
1403     } else { /* no iterations to do */
1404       pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1405     }
1406   } // case
1407   break;
1408   case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1409                                  merged here */
1410   case kmp_sch_static_chunked: {
1411     T parm1;
1412 
1413     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1414                    "kmp_sch_static_[affinity|chunked] case\n",
1415                    gtid));
1416     parm1 = pr->u.p.parm1;
1417 
1418     trip = pr->u.p.tc - 1;
1419     init = parm1 * (pr->u.p.count + tid);
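    // Chunks are handed out round-robin: thread 'tid' executes chunk numbers
    // tid, tid + nproc, tid + 2*nproc, ... (pr->u.p.count advances by nproc
    // per dispatch below, so count + tid enumerates this thread's chunks).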
1420 
1421     if ((status = (init <= trip)) != 0) {
1422       start = pr->u.p.lb;
1423       incr = pr->u.p.st;
1424       limit = parm1 + init - 1;
1425 
1426       if ((last = (limit >= trip)) != 0)
1427         limit = trip;
1428 
1429       if (p_st != NULL)
1430         *p_st = incr;
1431 
1432       pr->u.p.count += nproc;
1433 
1434       if (incr == 1) {
1435         *p_lb = start + init;
1436         *p_ub = start + limit;
1437       } else {
1438         *p_lb = start + init * incr;
1439         *p_ub = start + limit * incr;
1440       }
1441 
1442       if (pr->flags.ordered) {
1443         pr->u.p.ordered_lower = init;
1444         pr->u.p.ordered_upper = limit;
1445       } // if
1446     } // if
1447   } // case
1448   break;
1449 
1450   case kmp_sch_dynamic_chunked: {
1451     T chunk = pr->u.p.parm1;
1452 
1453     KD_TRACE(
1454         100,
1455         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1456          gtid));
1457 
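         // Claim the next chunk by atomically bumping the shared chunk counter;
         // the returned (old) value identifies the chunk, whose first iteration
         // is chunk * old_value.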
1458     init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1459     trip = pr->u.p.tc - 1;
1460 
1461     if ((status = (init <= trip)) == 0) {
1462       *p_lb = 0;
1463       *p_ub = 0;
1464       if (p_st != NULL)
1465         *p_st = 0;
1466     } else {
1467       start = pr->u.p.lb;
1468       limit = chunk + init - 1;
1469       incr = pr->u.p.st;
1470 
1471       if ((last = (limit >= trip)) != 0)
1472         limit = trip;
1473 
1474       if (p_st != NULL)
1475         *p_st = incr;
1476 
1477       if (incr == 1) {
1478         *p_lb = start + init;
1479         *p_ub = start + limit;
1480       } else {
1481         *p_lb = start + init * incr;
1482         *p_ub = start + limit * incr;
1483       }
1484 
1485       if (pr->flags.ordered) {
1486         pr->u.p.ordered_lower = init;
1487         pr->u.p.ordered_upper = limit;
1488       } // if
1489     } // if
1490   } // case
1491   break;
1492 
1493   case kmp_sch_guided_iterative_chunked: {
1494     T chunkspec = pr->u.p.parm1;
1495     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1496                    "iterative case\n",
1497                    gtid));
1498     trip = pr->u.p.tc;
1499     // Start atomic part of calculations
1500     while (1) {
1501       ST remaining; // signed, because can be < 0
1502       init = sh->u.s.iteration; // shared value
1503       remaining = trip - init;
1504       if (remaining <= 0) { // AC: need to compare with 0 first
1505         // nothing to do, don't try atomic op
1506         status = 0;
1507         break;
1508       }
1509       if ((T)remaining <
1510           pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1511         // use dynamic-style schedule
1512         // atomically increment iterations, get old value
1513         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1514                                  (ST)chunkspec);
1515         remaining = trip - init;
1516         if (remaining <= 0) {
1517           status = 0; // all iterations got by other threads
1518         } else {
1519           // got some iterations to work on
1520           status = 1;
1521           if ((T)remaining > chunkspec) {
1522             limit = init + chunkspec - 1;
1523           } else {
1524             last = 1; // the last chunk
1525             limit = init + remaining - 1;
1526           } // if
1527         } // if
1528         break;
1529       } // if
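           // Guided path: parm3 holds the guided shrink factor (roughly
           // 1/(K*nproc)) as raw double bits, so the thread tries to claim a
           // proportional share of the remaining iterations via CAS below.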
1530       limit = init +
1531               (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
1532       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1533                                (ST)init, (ST)limit)) {
1534         // CAS was successful, chunk obtained
1535         status = 1;
1536         --limit;
1537         break;
1538       } // if
1539     } // while
1540     if (status != 0) {
1541       start = pr->u.p.lb;
1542       incr = pr->u.p.st;
1543       if (p_st != NULL)
1544         *p_st = incr;
1545       *p_lb = start + init * incr;
1546       *p_ub = start + limit * incr;
1547       if (pr->flags.ordered) {
1548         pr->u.p.ordered_lower = init;
1549         pr->u.p.ordered_upper = limit;
1550       } // if
1551     } else {
1552       *p_lb = 0;
1553       *p_ub = 0;
1554       if (p_st != NULL)
1555         *p_st = 0;
1556     } // if
1557   } // case
1558   break;
1559 
1560 #if OMP_45_ENABLED
1561   case kmp_sch_guided_simd: {
1562     // same as iterative but curr-chunk adjusted to be a multiple of the
1563     // given chunk
1564     T chunk = pr->u.p.parm1;
1565     KD_TRACE(100,
1566              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1567               gtid));
1568     trip = pr->u.p.tc;
1569     // Start atomic part of calculations
1570     while (1) {
1571       ST remaining; // signed, because can be < 0
1572       init = sh->u.s.iteration; // shared value
1573       remaining = trip - init;
1574       if (remaining <= 0) { // AC: need to compare with 0 first
1575         status = 0; // nothing to do, don't try atomic op
1576         break;
1577       }
1578       KMP_DEBUG_ASSERT(init % chunk == 0);
1579       // compare with K*nproc*(chunk+1), K=2 by default
1580       if ((T)remaining < pr->u.p.parm2) {
1581         // use dynamic-style schedule
1582         // atomically increment iterations, get old value
1583         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1584                                  (ST)chunk);
1585         remaining = trip - init;
1586         if (remaining <= 0) {
1587           status = 0; // all iterations got by other threads
1588         } else {
1589           // got some iterations to work on
1590           status = 1;
1591           if ((T)remaining > chunk) {
1592             limit = init + chunk - 1;
1593           } else {
1594             last = 1; // the last chunk
1595             limit = init + remaining - 1;
1596           } // if
1597         } // if
1598         break;
1599       } // if
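           // As in the iterative guided case, except the claimed span is rounded
           // up to a multiple of chunk so every grabbed range stays chunk-aligned
           // for SIMD.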
1600       // divide by K*nproc
1601       UT span = remaining * (*(double *)&pr->u.p.parm3);
1602       UT rem = span % chunk;
1603       if (rem) // adjust so that span%chunk == 0
1604         span += chunk - rem;
1605       limit = init + span;
1606       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1607                                (ST)init, (ST)limit)) {
1608         // CAS was successful, chunk obtained
1609         status = 1;
1610         --limit;
1611         break;
1612       } // if
1613     } // while
1614     if (status != 0) {
1615       start = pr->u.p.lb;
1616       incr = pr->u.p.st;
1617       if (p_st != NULL)
1618         *p_st = incr;
1619       *p_lb = start + init * incr;
1620       *p_ub = start + limit * incr;
1621       if (pr->flags.ordered) {
1622         pr->u.p.ordered_lower = init;
1623         pr->u.p.ordered_upper = limit;
1624       } // if
1625     } else {
1626       *p_lb = 0;
1627       *p_ub = 0;
1628       if (p_st != NULL)
1629         *p_st = 0;
1630     } // if
1631   } // case
1632   break;
1633 #endif // OMP_45_ENABLED
1634 
1635   case kmp_sch_guided_analytical_chunked: {
1636     T chunkspec = pr->u.p.parm1;
1637     UT chunkIdx;
1638 #if KMP_USE_X87CONTROL
1639     /* for storing original FPCW value for Windows* OS on
1640        IA-32 architecture 8-byte version */
1641     unsigned int oldFpcw;
1642     unsigned int fpcwSet = 0;
1643 #endif
1644     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1645                    "kmp_sch_guided_analytical_chunked case\n",
1646                    gtid));
1647 
1648     trip = pr->u.p.tc;
1649 
1650     KMP_DEBUG_ASSERT(nproc > 1);
1651     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
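         // parm2 is the crossover chunk index: from there on the code falls back
         // to plain dynamic chunks of size chunkspec; below it the analytically
         // derived, exponentially shrinking chunks are used.  parm3 carries the
         // precomputed guided ratio as raw double bits.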
1652 
1653     while (1) { /* this while loop is a safeguard against unexpected zero
1654                    chunk sizes */
1655       chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1656       if (chunkIdx >= (UT)pr->u.p.parm2) {
1657         --trip;
1658         /* use dynamic-style scheduling */
1659         init = chunkIdx * chunkspec + pr->u.p.count;
1660         /* need to verify init > 0 in case of overflow in the above
1661          * calculation */
1662         if ((status = (init > 0 && init <= trip)) != 0) {
1663           limit = init + chunkspec - 1;
1664 
1665           if ((last = (limit >= trip)) != 0)
1666             limit = trip;
1667         }
1668         break;
1669       } else {
1670 /* use exponential-style scheduling */
1671 /* The following check is to work around the lack of long double precision on
1672    Windows* OS.
1673    This check works around the possible effect that init != 0 for chunkIdx == 0.
1674  */
1675 #if KMP_USE_X87CONTROL
1676         /* If we haven't already done so, save original
1677            FPCW and set precision to 64-bit, as Windows* OS
1678            on IA-32 architecture defaults to 53-bit */
1679         if (!fpcwSet) {
1680           oldFpcw = _control87(0, 0);
1681           _control87(_PC_64, _MCW_PC);
1682           fpcwSet = 0x30000;
1683         }
1684 #endif
1685         if (chunkIdx) {
1686           init = __kmp_dispatch_guided_remaining<T>(
1687               trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1688           KMP_DEBUG_ASSERT(init);
1689           init = trip - init;
1690         } else
1691           init = 0;
1692         limit = trip - __kmp_dispatch_guided_remaining<T>(
1693                            trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1694         KMP_ASSERT(init <= limit);
1695         if (init < limit) {
1696           KMP_DEBUG_ASSERT(limit <= trip);
1697           --limit;
1698           status = 1;
1699           break;
1700         } // if
1701       } // if
1702     } // while (1)
1703 #if KMP_USE_X87CONTROL
1704     /* restore FPCW if necessary
1705        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1706     */
1707     if (fpcwSet && (oldFpcw & fpcwSet))
1708       _control87(oldFpcw, _MCW_PC);
1709 #endif
1710     if (status != 0) {
1711       start = pr->u.p.lb;
1712       incr = pr->u.p.st;
1713       if (p_st != NULL)
1714         *p_st = incr;
1715       *p_lb = start + init * incr;
1716       *p_ub = start + limit * incr;
1717       if (pr->flags.ordered) {
1718         pr->u.p.ordered_lower = init;
1719         pr->u.p.ordered_upper = limit;
1720       }
1721     } else {
1722       *p_lb = 0;
1723       *p_ub = 0;
1724       if (p_st != NULL)
1725         *p_st = 0;
1726     }
1727   } // case
1728   break;
1729 
1730   case kmp_sch_trapezoidal: {
1731     UT index;
1732     T parm2 = pr->u.p.parm2;
1733     T parm3 = pr->u.p.parm3;
1734     T parm4 = pr->u.p.parm4;
1735     KD_TRACE(100,
1736              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1737               gtid));
1738 
1739     index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1740 
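         // Chunk sizes shrink linearly (parm2, parm2 - parm4, parm2 - 2*parm4,
         // ...), so the first iteration of chunk 'index' is the arithmetic-series
         // sum index * (2 * parm2 - (index - 1) * parm4) / 2.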
1741     init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
1742     trip = pr->u.p.tc - 1;
1743 
1744     if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1745       *p_lb = 0;
1746       *p_ub = 0;
1747       if (p_st != NULL)
1748         *p_st = 0;
1749     } else {
1750       start = pr->u.p.lb;
1751       limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1752       incr = pr->u.p.st;
1753 
1754       if ((last = (limit >= trip)) != 0)
1755         limit = trip;
1756 
1757       if (p_st != NULL)
1758         *p_st = incr;
1759 
1760       if (incr == 1) {
1761         *p_lb = start + init;
1762         *p_ub = start + limit;
1763       } else {
1764         *p_lb = start + init * incr;
1765         *p_ub = start + limit * incr;
1766       }
1767 
1768       if (pr->flags.ordered) {
1769         pr->u.p.ordered_lower = init;
1770         pr->u.p.ordered_upper = limit;
1771       } // if
1772     } // if
1773   } // case
1774   break;
1775   default: {
1776     status = 0; // to avoid complaints on uninitialized variable use
1777     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1778                 KMP_HNT(GetNewerLibrary), // Hint
1779                 __kmp_msg_null // Variadic argument list terminator
1780                 );
1781   } break;
1782   } // switch
1783   if (p_last)
1784     *p_last = last;
1785 #ifdef KMP_DEBUG
1786   if (pr->flags.ordered) {
1787     char *buff;
1788     // create format specifiers before the debug output
1789     buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1790                             "ordered_lower:%%%s ordered_upper:%%%s\n",
1791                             traits_t<UT>::spec, traits_t<UT>::spec);
1792     KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1793     __kmp_str_free(&buff);
1794   }
1795   {
1796     char *buff;
1797     // create format specifiers before the debug output
1798     buff = __kmp_str_format(
1799         "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1800         "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1801         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1802     KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1803     __kmp_str_free(&buff);
1804   }
1805 #endif
1806   return status;
1807 }
1808 
1809 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1810    work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1811    is not called. */
1812 #if OMPT_SUPPORT && OMPT_OPTIONAL
1813 #define OMPT_LOOP_END                                                          \
1814   if (status == 0) {                                                           \
1815     if (ompt_enabled.ompt_callback_work) {                                     \
1816       ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
1817       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
1818       ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
1819           ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
1820           &(task_info->task_data), 0, codeptr);                                \
1821     }                                                                          \
1822   }
1823 // TODO: implement count
1824 #else
1825 #define OMPT_LOOP_END // no-op
1826 #endif
1827 
1828 #if KMP_STATS_ENABLED
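     // Record the size of the chunk just handed out: t is the trip count of
     // [*p_lb, *p_ub] with stride pr->u.p.st, or 0 once the loop is finished.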
1829 #define KMP_STATS_LOOP_END                                                     \
1830   {                                                                            \
1831     kmp_int64 u, l, t, i;                                                      \
1832     l = (kmp_int64)(*p_lb);                                                    \
1833     u = (kmp_int64)(*p_ub);                                                    \
1834     i = (kmp_int64)(pr->u.p.st);                                               \
1835     if (status == 0) {                                                         \
1836       t = 0;                                                                   \
1837       KMP_POP_PARTITIONED_TIMER();                                             \
1838     } else if (i == 1) {                                                       \
1839       if (u >= l)                                                              \
1840         t = u - l + 1;                                                         \
1841       else                                                                     \
1842         t = 0;                                                                 \
1843     } else if (i < 0) {                                                        \
1844       if (l >= u)                                                              \
1845         t = (l - u) / (-i) + 1;                                                \
1846       else                                                                     \
1847         t = 0;                                                                 \
1848     } else {                                                                   \
1849       if (u >= l)                                                              \
1850         t = (u - l) / i + 1;                                                   \
1851       else                                                                     \
1852         t = 0;                                                                 \
1853     }                                                                          \
1854     KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
1855   }
1856 #else
1857 #define KMP_STATS_LOOP_END /* Nothing */
1858 #endif
1859 
1860 template <typename T>
1861 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1862                                T *p_lb, T *p_ub,
1863                                typename traits_t<T>::signed_t *p_st
1864 #if OMPT_SUPPORT && OMPT_OPTIONAL
1865                                ,
1866                                void *codeptr
1867 #endif
1868                                ) {
1869 
1870   typedef typename traits_t<T>::unsigned_t UT;
1871   typedef typename traits_t<T>::signed_t ST;
1872   // This is potentially slightly misleading: schedule(runtime) will appear here
1873   // even if the actual runtime schedule is static. (Which points out a
1874   // disadvantage of schedule(runtime): even when static scheduling is used it
1875   // costs more than a compile-time choice to use static scheduling would.)
1876   KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
1877 
1878   int status;
1879   dispatch_private_info_template<T> *pr;
1880   kmp_info_t *th = __kmp_threads[gtid];
1881   kmp_team_t *team = th->th.th_team;
1882 
1883   KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1884   KD_TRACE(
1885       1000,
1886       ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
1887        gtid, p_lb, p_ub, p_st, p_last));
1888 
1889   if (team->t.t_serialized) {
1890     /* NOTE: serialize this dispatch because we are not at the active level */
1891     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1892         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1893     KMP_DEBUG_ASSERT(pr);
1894 
1895     if ((status = (pr->u.p.tc != 0)) == 0) {
1896       *p_lb = 0;
1897       *p_ub = 0;
1898       //            if ( p_last != NULL )
1899       //                *p_last = 0;
1900       if (p_st != NULL)
1901         *p_st = 0;
1902       if (__kmp_env_consistency_check) {
1903         if (pr->pushed_ws != ct_none) {
1904           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1905         }
1906       }
1907     } else if (pr->flags.nomerge) {
1908       kmp_int32 last;
1909       T start;
1910       UT limit, trip, init;
1911       ST incr;
1912       T chunk = pr->u.p.parm1;
1913 
1914       KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1915                      gtid));
1916 
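           // Serialized nomerge case: no other thread competes for iterations,
           // so a simple thread-private counter hands out consecutive chunks.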
1917       init = chunk * pr->u.p.count++;
1918       trip = pr->u.p.tc - 1;
1919 
1920       if ((status = (init <= trip)) == 0) {
1921         *p_lb = 0;
1922         *p_ub = 0;
1923         //                if ( p_last != NULL )
1924         //                    *p_last = 0;
1925         if (p_st != NULL)
1926           *p_st = 0;
1927         if (__kmp_env_consistency_check) {
1928           if (pr->pushed_ws != ct_none) {
1929             pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1930           }
1931         }
1932       } else {
1933         start = pr->u.p.lb;
1934         limit = chunk + init - 1;
1935         incr = pr->u.p.st;
1936 
1937         if ((last = (limit >= trip)) != 0) {
1938           limit = trip;
1939 #if KMP_OS_WINDOWS
1940           pr->u.p.last_upper = pr->u.p.ub;
1941 #endif /* KMP_OS_WINDOWS */
1942         }
1943         if (p_last != NULL)
1944           *p_last = last;
1945         if (p_st != NULL)
1946           *p_st = incr;
1947         if (incr == 1) {
1948           *p_lb = start + init;
1949           *p_ub = start + limit;
1950         } else {
1951           *p_lb = start + init * incr;
1952           *p_ub = start + limit * incr;
1953         }
1954 
1955         if (pr->flags.ordered) {
1956           pr->u.p.ordered_lower = init;
1957           pr->u.p.ordered_upper = limit;
1958 #ifdef KMP_DEBUG
1959           {
1960             char *buff;
1961             // create format specifiers before the debug output
1962             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1963                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
1964                                     traits_t<UT>::spec, traits_t<UT>::spec);
1965             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1966                             pr->u.p.ordered_upper));
1967             __kmp_str_free(&buff);
1968           }
1969 #endif
1970         } // if
1971       } // if
1972     } else {
1973       pr->u.p.tc = 0;
1974       *p_lb = pr->u.p.lb;
1975       *p_ub = pr->u.p.ub;
1976 #if KMP_OS_WINDOWS
1977       pr->u.p.last_upper = *p_ub;
1978 #endif /* KMP_OS_WINDOWS */
1979       if (p_last != NULL)
1980         *p_last = TRUE;
1981       if (p_st != NULL)
1982         *p_st = pr->u.p.st;
1983     } // if
1984 #ifdef KMP_DEBUG
1985     {
1986       char *buff;
1987       // create format specifiers before the debug output
1988       buff = __kmp_str_format(
1989           "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
1990           "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
1991           traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1992       KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
1993       __kmp_str_free(&buff);
1994     }
1995 #endif
1996 #if INCLUDE_SSC_MARKS
1997     SSC_MARK_DISPATCH_NEXT();
1998 #endif
1999     OMPT_LOOP_END;
2000     KMP_STATS_LOOP_END;
2001     return status;
2002   } else {
2003     kmp_int32 last = 0;
2004     dispatch_shared_info_template<T> volatile *sh;
2005 
2006     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2007                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2008 
2009     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2010         th->th.th_dispatch->th_dispatch_pr_current);
2011     KMP_DEBUG_ASSERT(pr);
2012     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2013         th->th.th_dispatch->th_dispatch_sh_current);
2014     KMP_DEBUG_ASSERT(sh);
2015 
2016 #if KMP_USE_HIER_SCHED
2017     if (pr->flags.use_hier)
2018       status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2019     else
2020 #endif // KMP_USE_HIER_SCHED
2021       status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2022                                                 p_st, th->th.th_team_nproc,
2023                                                 th->th.th_info.ds.ds_tid);
2024     // status == 0: no more iterations to execute
2025     if (status == 0) {
2026       UT num_done;
2027 
2028       num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2029 #ifdef KMP_DEBUG
2030       {
2031         char *buff;
2032         // create format specifiers before the debug output
2033         buff = __kmp_str_format(
2034             "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2035             traits_t<UT>::spec);
2036         KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2037         __kmp_str_free(&buff);
2038       }
2039 #endif
2040 
2041 #if KMP_USE_HIER_SCHED
2042       pr->flags.use_hier = FALSE;
2043 #endif
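           // test_then_inc returned the pre-increment value, so this thread is
           // the last to finish iff num_done == nproc - 1; the last thread
           // resets the shared buffer below so it can be reused.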
2044       if ((ST)num_done == th->th.th_team_nproc - 1) {
2045 #if (KMP_STATIC_STEAL_ENABLED)
2046         if (pr->schedule == kmp_sch_static_steal &&
2047             traits_t<T>::type_size > 4) {
2048           int i;
2049           kmp_info_t **other_threads = team->t.t_threads;
2050           // loop complete, safe to destroy locks used for stealing
2051           for (i = 0; i < th->th.th_team_nproc; ++i) {
2052             kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2053             KMP_ASSERT(lck != NULL);
2054             __kmp_destroy_lock(lck);
2055             __kmp_free(lck);
2056             other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2057           }
2058         }
2059 #endif
2060         /* NOTE: release this buffer to be reused */
2061 
2062         KMP_MB(); /* Flush all pending memory write invalidates.  */
2063 
2064         sh->u.s.num_done = 0;
2065         sh->u.s.iteration = 0;
2066 
2067         /* TODO replace with general release procedure? */
2068         if (pr->flags.ordered) {
2069           sh->u.s.ordered_iteration = 0;
2070         }
2071 
2072         KMP_MB(); /* Flush all pending memory write invalidates.  */
2073 
2074         sh->buffer_index += __kmp_dispatch_num_buffers;
2075         KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2076                        gtid, sh->buffer_index));
2077 
2078         KMP_MB(); /* Flush all pending memory write invalidates.  */
2079 
2080       } // if
2081       if (__kmp_env_consistency_check) {
2082         if (pr->pushed_ws != ct_none) {
2083           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2084         }
2085       }
2086 
2087       th->th.th_dispatch->th_deo_fcn = NULL;
2088       th->th.th_dispatch->th_dxo_fcn = NULL;
2089       th->th.th_dispatch->th_dispatch_sh_current = NULL;
2090       th->th.th_dispatch->th_dispatch_pr_current = NULL;
2091     } // if (status == 0)
2092 #if KMP_OS_WINDOWS
2093     else if (last) {
2094       pr->u.p.last_upper = pr->u.p.ub;
2095     }
2096 #endif /* KMP_OS_WINDOWS */
2097     if (p_last != NULL && status != 0)
2098       *p_last = last;
2099   } // if
2100 
2101 #ifdef KMP_DEBUG
2102   {
2103     char *buff;
2104     // create format specifiers before the debug output
2105     buff = __kmp_str_format(
2106         "__kmp_dispatch_next: T#%%d normal case: "
2107         "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2108         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2109     KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2110                   (p_last ? *p_last : 0), status));
2111     __kmp_str_free(&buff);
2112   }
2113 #endif
2114 #if INCLUDE_SSC_MARKS
2115   SSC_MARK_DISPATCH_NEXT();
2116 #endif
2117   OMPT_LOOP_END;
2118   KMP_STATS_LOOP_END;
2119   return status;
2120 }
2121 
2122 template <typename T>
2123 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2124                                   kmp_int32 *plastiter, T *plower, T *pupper,
2125                                   typename traits_t<T>::signed_t incr) {
2126   typedef typename traits_t<T>::unsigned_t UT;
2127   kmp_uint32 team_id;
2128   kmp_uint32 nteams;
2129   UT trip_count;
2130   kmp_team_t *team;
2131   kmp_info_t *th;
2132 
2133   KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2134   KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2135 #ifdef KMP_DEBUG
2136   typedef typename traits_t<T>::signed_t ST;
2137   {
2138     char *buff;
2139     // create format specifiers before the debug output
2140     buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2141                             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2142                             traits_t<T>::spec, traits_t<T>::spec,
2143                             traits_t<ST>::spec, traits_t<T>::spec);
2144     KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2145     __kmp_str_free(&buff);
2146   }
2147 #endif
2148 
2149   if (__kmp_env_consistency_check) {
2150     if (incr == 0) {
2151       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2152                             loc);
2153     }
2154     if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2155       // The loop is illegal.
2156       // Some zero-trip loops are handled by the compiler, e.g.:
2157       //   for(i=10;i<0;++i) // lower >= upper - run-time check
2158       //   for(i=0;i>10;--i) // lower <= upper - run-time check
2159       //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2160       //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2161       // Compiler does not check the following illegal loops:
2162       //   for(i=0;i<10;i+=incr) // where incr<0
2163       //   for(i=10;i>0;i-=incr) // where incr<0
2164       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2165     }
2166   }
2167   th = __kmp_threads[gtid];
2168   team = th->th.th_team;
2169 #if OMP_40_ENABLED
2170   KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2171   nteams = th->th.th_teams_size.nteams;
2172 #endif
2173   team_id = team->t.t_master_tid;
2174   KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2175 
2176   // compute global trip count
2177   if (incr == 1) {
2178     trip_count = *pupper - *plower + 1;
2179   } else if (incr == -1) {
2180     trip_count = *plower - *pupper + 1;
2181   } else if (incr > 0) {
2182     // upper-lower can exceed the limit of signed type
2183     trip_count = (UT)(*pupper - *plower) / incr + 1;
2184   } else {
2185     trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2186   }
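       // e.g., *plower=0, *pupper=9, incr=2 gives trip_count = (9 - 0) / 2 + 1 = 5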
2187 
2188   if (trip_count <= nteams) {
2189     KMP_DEBUG_ASSERT(
2190         __kmp_static == kmp_sch_static_greedy ||
2191         __kmp_static ==
2192             kmp_sch_static_balanced); // Unknown static scheduling type.
2193     // only some teams get a single iteration, others get nothing
2194     if (team_id < trip_count) {
2195       *pupper = *plower = *plower + team_id * incr;
2196     } else {
2197       *plower = *pupper + incr; // zero-trip loop
2198     }
2199     if (plastiter != NULL)
2200       *plastiter = (team_id == trip_count - 1);
2201   } else {
2202     if (__kmp_static == kmp_sch_static_balanced) {
2203       UT chunk = trip_count / nteams;
2204       UT extras = trip_count % nteams;
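           // The first 'extras' teams get chunk + 1 iterations, the rest get
           // chunk; the offsets below account for the extra iteration handed to
           // each earlier team.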
2205       *plower +=
2206           incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2207       *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2208       if (plastiter != NULL)
2209         *plastiter = (team_id == nteams - 1);
2210     } else {
2211       T chunk_inc_count =
2212           (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2213       T upper = *pupper;
2214       KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2215       // Unknown static scheduling type.
2216       *plower += team_id * chunk_inc_count;
2217       *pupper = *plower + chunk_inc_count - incr;
2218       // Check/correct bounds if needed
2219       if (incr > 0) {
2220         if (*pupper < *plower)
2221           *pupper = traits_t<T>::max_value;
2222         if (plastiter != NULL)
2223           *plastiter = *plower <= upper && *pupper > upper - incr;
2224         if (*pupper > upper)
2225           *pupper = upper; // tracker C73258
2226       } else {
2227         if (*pupper > *plower)
2228           *pupper = traits_t<T>::min_value;
2229         if (plastiter != NULL)
2230           *plastiter = *plower >= upper && *pupper < upper - incr;
2231         if (*pupper < upper)
2232           *pupper = upper; // tracker C73258
2233       }
2234     }
2235   }
2236 }
2237 
2238 //-----------------------------------------------------------------------------
2239 // Dispatch routines
2240 //    Transfer call to template< type T >
2241 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2242 //                         T lb, T ub, ST st, ST chunk )
2243 extern "C" {
2244 
2245 /*!
2246 @ingroup WORK_SHARING
2247 @{
2248 @param loc Source location
2249 @param gtid Global thread id
2250 @param schedule Schedule type
2251 @param lb  Lower bound
2252 @param ub  Upper bound
2253 @param st  Step (or increment if you prefer)
2254 @param chunk The chunk size to block with
2255 
2256 This function prepares the runtime to start a dynamically scheduled for loop,
2257 saving the loop arguments.
2258 These functions are all identical apart from the types of the arguments.
2259 */
2260 
2261 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2262                             enum sched_type schedule, kmp_int32 lb,
2263                             kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2264   KMP_DEBUG_ASSERT(__kmp_init_serial);
2265 #if OMPT_SUPPORT && OMPT_OPTIONAL
2266   OMPT_STORE_RETURN_ADDRESS(gtid);
2267 #endif
2268   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2269 }
2270 /*!
2271 See @ref __kmpc_dispatch_init_4
2272 */
2273 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2274                              enum sched_type schedule, kmp_uint32 lb,
2275                              kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2276   KMP_DEBUG_ASSERT(__kmp_init_serial);
2277 #if OMPT_SUPPORT && OMPT_OPTIONAL
2278   OMPT_STORE_RETURN_ADDRESS(gtid);
2279 #endif
2280   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2281 }
2282 
2283 /*!
2284 See @ref __kmpc_dispatch_init_4
2285 */
2286 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2287                             enum sched_type schedule, kmp_int64 lb,
2288                             kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2289   KMP_DEBUG_ASSERT(__kmp_init_serial);
2290 #if OMPT_SUPPORT && OMPT_OPTIONAL
2291   OMPT_STORE_RETURN_ADDRESS(gtid);
2292 #endif
2293   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2294 }
2295 
2296 /*!
2297 See @ref __kmpc_dispatch_init_4
2298 */
2299 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2300                              enum sched_type schedule, kmp_uint64 lb,
2301                              kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2302   KMP_DEBUG_ASSERT(__kmp_init_serial);
2303 #if OMPT_SUPPORT && OMPT_OPTIONAL
2304   OMPT_STORE_RETURN_ADDRESS(gtid);
2305 #endif
2306   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2307 }
2308 
2309 /*!
2310 See @ref __kmpc_dispatch_init_4
2311 
2312 These functions differ from the __kmpc_dispatch_init set in that they are
2313 called for the composite distribute parallel for construct: before dispatching
2314 the regular iterations, the per-team iteration space must be computed first.
2315 
2316 These functions are all identical apart from the types of the arguments.
2317 */
2318 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2319                                  enum sched_type schedule, kmp_int32 *p_last,
2320                                  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2321                                  kmp_int32 chunk) {
2322   KMP_DEBUG_ASSERT(__kmp_init_serial);
2323 #if OMPT_SUPPORT && OMPT_OPTIONAL
2324   OMPT_STORE_RETURN_ADDRESS(gtid);
2325 #endif
2326   __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2327   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2328 }
2329 
2330 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2331                                   enum sched_type schedule, kmp_int32 *p_last,
2332                                   kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2333                                   kmp_int32 chunk) {
2334   KMP_DEBUG_ASSERT(__kmp_init_serial);
2335 #if OMPT_SUPPORT && OMPT_OPTIONAL
2336   OMPT_STORE_RETURN_ADDRESS(gtid);
2337 #endif
2338   __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2339   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2340 }
2341 
2342 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2343                                  enum sched_type schedule, kmp_int32 *p_last,
2344                                  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2345                                  kmp_int64 chunk) {
2346   KMP_DEBUG_ASSERT(__kmp_init_serial);
2347 #if OMPT_SUPPORT && OMPT_OPTIONAL
2348   OMPT_STORE_RETURN_ADDRESS(gtid);
2349 #endif
2350   __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2351   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2352 }
2353 
2354 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2355                                   enum sched_type schedule, kmp_int32 *p_last,
2356                                   kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2357                                   kmp_int64 chunk) {
2358   KMP_DEBUG_ASSERT(__kmp_init_serial);
2359 #if OMPT_SUPPORT && OMPT_OPTIONAL
2360   OMPT_STORE_RETURN_ADDRESS(gtid);
2361 #endif
2362   __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2363   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2364 }
2365 
2366 /*!
2367 @param loc Source code location
2368 @param gtid Global thread id
2369 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2370 otherwise
2371 @param p_lb   Pointer to the lower bound for the next chunk of work
2372 @param p_ub   Pointer to the upper bound for the next chunk of work
2373 @param p_st   Pointer to the stride for the next chunk of work
2374 @return one if there is work to be done, zero otherwise
2375 
2376 Get the next dynamically allocated chunk of work for this thread.
2377 If there is no more work, then the lb, ub and stride need not be modified.
2378 */
2379 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2380                            kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2381 #if OMPT_SUPPORT && OMPT_OPTIONAL
2382   OMPT_STORE_RETURN_ADDRESS(gtid);
2383 #endif
2384   return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2385 #if OMPT_SUPPORT && OMPT_OPTIONAL
2386                                         ,
2387                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2388 #endif
2389                                             );
2390 }
2391 
2392 /*!
2393 See @ref __kmpc_dispatch_next_4
2394 */
2395 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2396                             kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2397                             kmp_int32 *p_st) {
2398 #if OMPT_SUPPORT && OMPT_OPTIONAL
2399   OMPT_STORE_RETURN_ADDRESS(gtid);
2400 #endif
2401   return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2402 #if OMPT_SUPPORT && OMPT_OPTIONAL
2403                                          ,
2404                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2405 #endif
2406                                              );
2407 }
2408 
2409 /*!
2410 See @ref __kmpc_dispatch_next_4
2411 */
2412 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2413                            kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2414 #if OMPT_SUPPORT && OMPT_OPTIONAL
2415   OMPT_STORE_RETURN_ADDRESS(gtid);
2416 #endif
2417   return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2418 #if OMPT_SUPPORT && OMPT_OPTIONAL
2419                                         ,
2420                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2421 #endif
2422                                             );
2423 }
2424 
2425 /*!
2426 See @ref __kmpc_dispatch_next_4
2427 */
2428 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2429                             kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2430                             kmp_int64 *p_st) {
2431 #if OMPT_SUPPORT && OMPT_OPTIONAL
2432   OMPT_STORE_RETURN_ADDRESS(gtid);
2433 #endif
2434   return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2435 #if OMPT_SUPPORT && OMPT_OPTIONAL
2436                                          ,
2437                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2438 #endif
2439                                              );
2440 }
2441 
2442 /*!
2443 @param loc Source code location
2444 @param gtid Global thread id
2445 
2446 Mark the end of a dynamic loop.
2447 */
2448 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2449   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2450 }
2451 
2452 /*!
2453 See @ref __kmpc_dispatch_fini_4
2454 */
2455 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2456   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2457 }
2458 
2459 /*!
2460 See @ref __kmpc_dispatch_fini_4
2461 */
2462 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2463   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2464 }
2465 
2466 /*!
2467 See @ref __kmpc_dispatch_fini_4
2468 */
2469 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2470   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2471 }
2472 /*! @} */
2473 
2474 //-----------------------------------------------------------------------------
2475 // Non-template routines from kmp_dispatch.cpp used in other sources
2476 
2477 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2478   return value == checker;
2479 }
2480 
2481 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2482   return value != checker;
2483 }
2484 
2485 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2486   return value < checker;
2487 }
2488 
2489 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2490   return value >= checker;
2491 }
2492 
2493 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2494   return value <= checker;
2495 }
2496 
2497 kmp_uint32
2498 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2499              kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2500              void *obj // Higher-level synchronization object, or NULL.
2501              ) {
2502   // note: we may not belong to a team at this point
2503   volatile kmp_uint32 *spin = spinner;
2504   kmp_uint32 check = checker;
2505   kmp_uint32 spins;
2506   kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2507   kmp_uint32 r;
2508 
2509   KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2510   KMP_INIT_YIELD(spins);
2511   // main wait spin loop
2512   while (!f(r = TCR_4(*spin), check)) {
2513     KMP_FSYNC_SPIN_PREPARE(obj);
2514     /* GEH - remove this since it was accidentally introduced when kmp_wait was
2515        split. It causes problems with infinite recursion because of exit lock */
2516     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2517         __kmp_abort_thread(); */
2518     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2519   }
2520   KMP_FSYNC_SPIN_ACQUIRED(obj);
2521   return r;
2522 }
2523 
2524 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
2525                       kmp_uint32 (*pred)(void *, kmp_uint32),
2526                       void *obj // Higher-level synchronization object, or NULL.
2527                       ) {
2528   // note: we may not belong to a team at this point
2529   void *spin = spinner;
2530   kmp_uint32 check = checker;
2531   kmp_uint32 spins;
2532   kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2533 
2534   KMP_FSYNC_SPIN_INIT(obj, spin);
2535   KMP_INIT_YIELD(spins);
2536   // main wait spin loop
2537   while (!f(spin, check)) {
2538     KMP_FSYNC_SPIN_PREPARE(obj);
2539     /* if we have waited a bit, or are oversubscribed, yield */
2540     /* pause is in the following code */
2541     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2542   }
2543   KMP_FSYNC_SPIN_ACQUIRED(obj);
2544 }
2545 
2546 } // extern "C"
2547 
2548 #ifdef KMP_GOMP_COMPAT
2549 
2550 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2551                                enum sched_type schedule, kmp_int32 lb,
2552                                kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2553                                int push_ws) {
2554   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2555                                  push_ws);
2556 }
2557 
2558 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2559                                 enum sched_type schedule, kmp_uint32 lb,
2560                                 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2561                                 int push_ws) {
2562   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2563                                   push_ws);
2564 }
2565 
2566 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2567                                enum sched_type schedule, kmp_int64 lb,
2568                                kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2569                                int push_ws) {
2570   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2571                                  push_ws);
2572 }
2573 
2574 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2575                                 enum sched_type schedule, kmp_uint64 lb,
2576                                 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2577                                 int push_ws) {
2578   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2579                                   push_ws);
2580 }
2581 
2582 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2583   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2584 }
2585 
2586 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2587   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2588 }
2589 
2590 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2591   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2592 }
2593 
2594 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2595   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2596 }
2597 
2598 #endif /* KMP_GOMP_COMPAT */
2599 
2600 /* ------------------------------------------------------------------------ */
2601