1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 /* Dynamic scheduling initialization and dispatch.
14  *
 * NOTE: __kmp_nth is constant inside any dispatch loop; however, it may
 *       change between parallel regions.  __kmp_max_nth is the largest value
 *       __kmp_nth may take, 1 is the smallest.
18  */
19 
20 #include "kmp.h"
21 #include "kmp_error.h"
22 #include "kmp_i18n.h"
23 #include "kmp_itt.h"
24 #include "kmp_stats.h"
25 #include "kmp_str.h"
26 #if KMP_USE_X87CONTROL
27 #include <float.h>
28 #endif
29 #include "kmp_lock.h"
30 #include "kmp_dispatch.h"
31 #if KMP_USE_HIER_SCHED
32 #include "kmp_dispatch_hier.h"
33 #endif
34 
35 #if OMPT_SUPPORT
36 #include "ompt-specific.h"
37 #endif
38 
39 /* ------------------------------------------------------------------------ */
40 /* ------------------------------------------------------------------------ */
41 
42 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43   kmp_info_t *th;
44 
45   KMP_DEBUG_ASSERT(gtid_ref);
46 
47   if (__kmp_env_consistency_check) {
48     th = __kmp_threads[*gtid_ref];
49     if (th->th.th_root->r.r_active &&
50         (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51 #if KMP_USE_DYNAMIC_LOCK
52       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53 #else
54       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55 #endif
56     }
57   }
58 }
59 
60 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61   kmp_info_t *th;
62 
63   if (__kmp_env_consistency_check) {
64     th = __kmp_threads[*gtid_ref];
65     if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66       __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67     }
68   }
69 }
70 
// Initialize a dispatch_private_info_template<T> buffer for a particular
// type of schedule and chunk.  The loop description is found in lb (lower
// bound), ub (upper bound), and st (stride).  nproc is the number of threads
// relevant to the scheduling (often the number of threads in a team, but not
// always if hierarchical scheduling is used).  tid is the id of the calling
// thread within the group of nproc threads; it will have a value between 0
// and nproc - 1.  This is often just the thread id within a team, but is not
// necessarily the case when hierarchical scheduling is used.
// loc is the source file location of the corresponding loop.
// gtid is the global thread id.
81 template <typename T>
82 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
83                                    dispatch_private_info_template<T> *pr,
84                                    enum sched_type schedule, T lb, T ub,
85                                    typename traits_t<T>::signed_t st,
86 #if USE_ITT_BUILD
87                                    kmp_uint64 *cur_chunk,
88 #endif
89                                    typename traits_t<T>::signed_t chunk,
90                                    T nproc, T tid) {
91   typedef typename traits_t<T>::unsigned_t UT;
92   typedef typename traits_t<T>::floating_t DBL;
93 
94   int active;
95   T tc;
96   kmp_info_t *th;
97   kmp_team_t *team;
98 
99 #ifdef KMP_DEBUG
100   typedef typename traits_t<T>::signed_t ST;
101   {
102     char *buff;
103     // create format specifiers before the debug output
104     buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
105                             "pr:%%p lb:%%%s ub:%%%s st:%%%s "
106                             "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
107                             traits_t<T>::spec, traits_t<T>::spec,
108                             traits_t<ST>::spec, traits_t<ST>::spec,
109                             traits_t<T>::spec, traits_t<T>::spec);
110     KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
111     __kmp_str_free(&buff);
112   }
113 #endif
114   /* setup data */
115   th = __kmp_threads[gtid];
116   team = th->th.th_team;
117   active = !team->t.t_serialized;
118 
119 #if USE_ITT_BUILD
120   int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
121                                     __kmp_forkjoin_frames_mode == 3 &&
122                                     KMP_MASTER_GTID(gtid) &&
123 #if OMP_40_ENABLED
124                                     th->th.th_teams_microtask == NULL &&
125 #endif
126                                     team->t.t_active_level == 1;
127 #endif
128 #if (KMP_STATIC_STEAL_ENABLED)
129   if (SCHEDULE_HAS_NONMONOTONIC(schedule))
130     // AC: we now have only one implementation of stealing, so use it
131     schedule = kmp_sch_static_steal;
132   else
133 #endif
134     schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
135 
136   /* Pick up the nomerge/ordered bits from the scheduling type */
137   if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
138     pr->flags.nomerge = TRUE;
139     schedule =
140         (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
141   } else {
142     pr->flags.nomerge = FALSE;
143   }
144   pr->type_size = traits_t<T>::type_size; // remember the size of variables
145   if (kmp_ord_lower & schedule) {
146     pr->flags.ordered = TRUE;
147     schedule =
148         (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
149   } else {
150     pr->flags.ordered = FALSE;
151   }
152 
153   if (schedule == kmp_sch_static) {
154     schedule = __kmp_static;
155   } else {
156     if (schedule == kmp_sch_runtime) {
157       // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
158       // not specified)
159       schedule = team->t.t_sched.r_sched_type;
160       // Detail the schedule if needed (global controls are differentiated
161       // appropriately)
162       if (schedule == kmp_sch_guided_chunked) {
163         schedule = __kmp_guided;
164       } else if (schedule == kmp_sch_static) {
165         schedule = __kmp_static;
166       }
167       // Use the chunk size specified by OMP_SCHEDULE (or default if not
168       // specified)
169       chunk = team->t.t_sched.chunk;
170 #if USE_ITT_BUILD
171       if (cur_chunk)
172         *cur_chunk = chunk;
173 #endif
174 #ifdef KMP_DEBUG
175       {
176         char *buff;
177         // create format specifiers before the debug output
178         buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
179                                 "schedule:%%d chunk:%%%s\n",
180                                 traits_t<ST>::spec);
181         KD_TRACE(10, (buff, gtid, schedule, chunk));
182         __kmp_str_free(&buff);
183       }
184 #endif
185     } else {
186       if (schedule == kmp_sch_guided_chunked) {
187         schedule = __kmp_guided;
188       }
189       if (chunk <= 0) {
190         chunk = KMP_DEFAULT_CHUNK;
191       }
192     }
193 
194     if (schedule == kmp_sch_auto) {
      // mapping and differentiation are done in __kmp_do_serial_initialize()
196       schedule = __kmp_auto;
197 #ifdef KMP_DEBUG
198       {
199         char *buff;
200         // create format specifiers before the debug output
201         buff = __kmp_str_format(
202             "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
203             "schedule:%%d chunk:%%%s\n",
204             traits_t<ST>::spec);
205         KD_TRACE(10, (buff, gtid, schedule, chunk));
206         __kmp_str_free(&buff);
207       }
208 #endif
209     }
210 
211     /* guided analytical not safe for too many threads */
212     if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
213       schedule = kmp_sch_guided_iterative_chunked;
214       KMP_WARNING(DispatchManyThreads);
215     }
216 #if OMP_45_ENABLED
217     if (schedule == kmp_sch_runtime_simd) {
218       // compiler provides simd_width in the chunk parameter
219       schedule = team->t.t_sched.r_sched_type;
220       // Detail the schedule if needed (global controls are differentiated
221       // appropriately)
222       if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
223           schedule == __kmp_static) {
224         schedule = kmp_sch_static_balanced_chunked;
225       } else {
226         if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
227           schedule = kmp_sch_guided_simd;
228         }
229         chunk = team->t.t_sched.chunk * chunk;
230       }
231 #if USE_ITT_BUILD
232       if (cur_chunk)
233         *cur_chunk = chunk;
234 #endif
235 #ifdef KMP_DEBUG
236       {
237         char *buff;
238         // create format specifiers before the debug output
239         buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d"
240                                 " chunk:%%%s\n",
241                                 traits_t<ST>::spec);
242         KD_TRACE(10, (buff, gtid, schedule, chunk));
243         __kmp_str_free(&buff);
244       }
245 #endif
246     }
247 #endif // OMP_45_ENABLED
248     pr->u.p.parm1 = chunk;
249   }
250   KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
251               "unknown scheduling type");
252 
253   pr->u.p.count = 0;
254 
255   if (__kmp_env_consistency_check) {
256     if (st == 0) {
257       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
258                             (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
259     }
260   }
261   // compute trip count
262   if (st == 1) { // most common case
263     if (ub >= lb) {
264       tc = ub - lb + 1;
265     } else { // ub < lb
266       tc = 0; // zero-trip
267     }
268   } else if (st < 0) {
269     if (lb >= ub) {
270       // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
271       // where the division needs to be unsigned regardless of the result type
272       tc = (UT)(lb - ub) / (-st) + 1;
273     } else { // lb < ub
274       tc = 0; // zero-trip
275     }
276   } else { // st > 0
277     if (ub >= lb) {
278       // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
279       // where the division needs to be unsigned regardless of the result type
280       tc = (UT)(ub - lb) / st + 1;
281     } else { // ub < lb
282       tc = 0; // zero-trip
283     }
284   }
285 
286   pr->u.p.lb = lb;
287   pr->u.p.ub = ub;
288   pr->u.p.st = st;
289   pr->u.p.tc = tc;
290 
291 #if KMP_OS_WINDOWS
292   pr->u.p.last_upper = ub + st;
293 #endif /* KMP_OS_WINDOWS */
294 
  /* NOTE: only the active parallel region(s) have active ordered sections */
296 
297   if (active) {
298     if (pr->flags.ordered) {
299       pr->ordered_bumped = 0;
300       pr->u.p.ordered_lower = 1;
301       pr->u.p.ordered_upper = 0;
302     }
303   }
304 
305   switch (schedule) {
306 #if (KMP_STATIC_STEAL_ENABLED)
307   case kmp_sch_static_steal: {
308     T ntc, init;
309 
310     KD_TRACE(100,
311              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
312               gtid));
313 
314     ntc = (tc % chunk ? 1 : 0) + tc / chunk;
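    // ntc is the total number of chunks, i.e. ceil(tc / chunk)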
315     if (nproc > 1 && ntc >= nproc) {
316       KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
317       T id = tid;
318       T small_chunk, extras;
319 
320       small_chunk = ntc / nproc;
321       extras = ntc % nproc;
322 
323       init = id * small_chunk + (id < extras ? id : extras);
324       pr->u.p.count = init;
325       pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
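      // Chunks are split as evenly as possible, with the first 'extras'
      // threads owning one extra chunk.  E.g., ntc=10, nproc=4 gives
      // small_chunk=2, extras=2, so threads own the chunk ranges [0,3),
      // [3,6), [6,8), [8,10) as (count, ub) pairs.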
326 
327       pr->u.p.parm2 = lb;
328       // pr->pfields.parm3 = 0; // it's not used in static_steal
329       pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
330       pr->u.p.st = st;
331       if (traits_t<T>::type_size > 4) {
332         // AC: TODO: check if 16-byte CAS available and use it to
333         // improve performance (probably wait for explicit request
334         // before spending time on this).
335         // For now use dynamically allocated per-thread lock,
336         // free memory in __kmp_dispatch_next when status==0.
337         KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
338         th->th.th_dispatch->th_steal_lock =
339             (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
340         __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
341       }
342       break;
343     } else {
344       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
345                      "kmp_sch_static_balanced\n",
346                      gtid));
347       schedule = kmp_sch_static_balanced;
348       /* too few iterations: fall-through to kmp_sch_static_balanced */
349     } // if
350     /* FALL-THROUGH to static balanced */
351   } // case
352 #endif
353   case kmp_sch_static_balanced: {
354     T init, limit;
355 
356     KD_TRACE(
357         100,
358         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
359          gtid));
360 
361     if (nproc > 1) {
362       T id = tid;
363 
364       if (tc < nproc) {
365         if (id < tc) {
366           init = id;
367           limit = id;
368           pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
369         } else {
370           pr->u.p.count = 1; /* means no more chunks to execute */
371           pr->u.p.parm1 = FALSE;
372           break;
373         }
374       } else {
375         T small_chunk = tc / nproc;
376         T extras = tc % nproc;
377         init = id * small_chunk + (id < extras ? id : extras);
378         limit = init + small_chunk - (id < extras ? 0 : 1);
379         pr->u.p.parm1 = (id == nproc - 1);
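        // E.g., tc=10, nproc=4: small_chunk=2, extras=2, so threads get
        // 3, 3, 2, 2 contiguous iterations and thread nproc-1 reports last.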
380       }
381     } else {
382       if (tc > 0) {
383         init = 0;
384         limit = tc - 1;
385         pr->u.p.parm1 = TRUE;
386       } else {
387         // zero trip count
388         pr->u.p.count = 1; /* means no more chunks to execute */
389         pr->u.p.parm1 = FALSE;
390         break;
391       }
392     }
393 #if USE_ITT_BUILD
394     // Calculate chunk for metadata report
395     if (itt_need_metadata_reporting)
396       if (cur_chunk)
397         *cur_chunk = limit - init + 1;
398 #endif
399     if (st == 1) {
400       pr->u.p.lb = lb + init;
401       pr->u.p.ub = lb + limit;
402     } else {
      // ub_tmp is the calculated upper bound; "ub" is the user-defined bound
404       T ub_tmp = lb + limit * st;
405       pr->u.p.lb = lb + init * st;
406       // adjust upper bound to "ub" if needed, so that MS lastprivate will match
407       // it exactly
408       if (st > 0) {
409         pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
410       } else {
411         pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
412       }
413     }
414     if (pr->flags.ordered) {
415       pr->u.p.ordered_lower = init;
416       pr->u.p.ordered_upper = limit;
417     }
418     break;
419   } // case
420 #if OMP_45_ENABLED
421   case kmp_sch_static_balanced_chunked: {
422     // similar to balanced, but chunk adjusted to multiple of simd width
423     T nth = nproc;
424     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
425                    " -> falling-through to static_greedy\n",
426                    gtid));
427     schedule = kmp_sch_static_greedy;
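    // parm1 below is ceil(tc / nth) rounded up to a multiple of chunk; the
    // bitwise rounding assumes chunk (the simd width) is a power of two.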
428     if (nth > 1)
429       pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
430     else
431       pr->u.p.parm1 = tc;
432     break;
433   } // case
434   case kmp_sch_guided_simd:
435 #endif // OMP_45_ENABLED
436   case kmp_sch_guided_iterative_chunked: {
437     KD_TRACE(
438         100,
439         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
440          " case\n",
441          gtid));
442 
443     if (nproc > 1) {
444       if ((2L * chunk + 1) * nproc >= tc) {
445         /* chunk size too large, switch to dynamic */
446         schedule = kmp_sch_dynamic_chunked;
447       } else {
448         // when remaining iters become less than parm2 - switch to dynamic
449         pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
450         *(double *)&pr->u.p.parm3 =
451             guided_flt_param / nproc; // may occupy parm3 and parm4
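        // parm3 is the fraction of the remaining iterations handed out per
        // grab in __kmp_dispatch_next_algorithm; parm2 is the threshold of
        // remaining iterations below which the schedule degenerates to
        // dynamic-style chunking.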
452       }
453     } else {
454       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
455                      "kmp_sch_static_greedy\n",
456                      gtid));
457       schedule = kmp_sch_static_greedy;
458       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
459       KD_TRACE(
460           100,
461           ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
462            gtid));
463       pr->u.p.parm1 = tc;
464     } // if
465   } // case
466   break;
467   case kmp_sch_guided_analytical_chunked: {
468     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
469                    "kmp_sch_guided_analytical_chunked case\n",
470                    gtid));
471 
472     if (nproc > 1) {
473       if ((2L * chunk + 1) * nproc >= tc) {
474         /* chunk size too large, switch to dynamic */
475         schedule = kmp_sch_dynamic_chunked;
476       } else {
477         /* commonly used term: (2 nproc - 1)/(2 nproc) */
478         DBL x;
479 
480 #if KMP_USE_X87CONTROL
481         /* Linux* OS already has 64-bit computation by default for long double,
482            and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
483            Windows* OS on IA-32 architecture, we need to set precision to 64-bit
484            instead of the default 53-bit. Even though long double doesn't work
485            on Windows* OS on Intel(R) 64, the resulting lack of precision is not
486            expected to impact the correctness of the algorithm, but this has not
487            been mathematically proven. */
488         // save original FPCW and set precision to 64-bit, as
489         // Windows* OS on IA-32 architecture defaults to 53-bit
490         unsigned int oldFpcw = _control87(0, 0);
491         _control87(_PC_64, _MCW_PC); // 0,0x30000
492 #endif
493         /* value used for comparison in solver for cross-over point */
494         long double target = ((long double)chunk * 2 + 1) * nproc / tc;
495 
496         /* crossover point--chunk indexes equal to or greater than
497            this point switch to dynamic-style scheduling */
498         UT cross;
499 
500         /* commonly used term: (2 nproc - 1)/(2 nproc) */
501         x = (long double)1.0 - (long double)0.5 / nproc;
502 
503 #ifdef KMP_DEBUG
504         { // test natural alignment
505           struct _test_a {
506             char a;
507             union {
508               char b;
509               DBL d;
510             };
511           } t;
512           ptrdiff_t natural_alignment =
513               (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
514           //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
515           // long)natural_alignment );
516           KMP_DEBUG_ASSERT(
517               (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
518         }
519 #endif // KMP_DEBUG
520 
521         /* save the term in thread private dispatch structure */
522         *(DBL *)&pr->u.p.parm3 = x;
523 
524         /* solve for the crossover point to the nearest integer i for which C_i
525            <= chunk */
526         {
527           UT left, right, mid;
528           long double p;
529 
530           /* estimate initial upper and lower bound */
531 
532           /* doesn't matter what value right is as long as it is positive, but
533              it affects performance of the solver */
534           right = 229;
535           p = __kmp_pow<UT>(x, right);
536           if (p > target) {
537             do {
538               p *= p;
539               right <<= 1;
540             } while (p > target && right < (1 << 27));
541             /* lower bound is previous (failed) estimate of upper bound */
542             left = right >> 1;
543           } else {
544             left = 0;
545           }
546 
547           /* bisection root-finding method */
548           while (left + 1 < right) {
549             mid = (left + right) / 2;
550             if (__kmp_pow<UT>(x, mid) > target) {
551               left = mid;
552             } else {
553               right = mid;
554             }
555           } // while
556           cross = right;
557         }
558         /* assert sanity of computed crossover point */
559         KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
560                    __kmp_pow<UT>(x, cross) <= target);
561 
562         /* save the crossover point in thread private dispatch structure */
563         pr->u.p.parm2 = cross;
564 
565 // C75803
566 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
567 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
568 #else
569 #define GUIDED_ANALYTICAL_WORKAROUND (x)
570 #endif
571         /* dynamic-style scheduling offset */
572         pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
573                                  tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
574                         cross * chunk;
575 #if KMP_USE_X87CONTROL
576         // restore FPCW
577         _control87(oldFpcw, _MCW_PC);
578 #endif
579       } // if
580     } else {
581       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
582                      "kmp_sch_static_greedy\n",
583                      gtid));
584       schedule = kmp_sch_static_greedy;
585       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
586       pr->u.p.parm1 = tc;
587     } // if
588   } // case
589   break;
590   case kmp_sch_static_greedy:
591     KD_TRACE(
592         100,
593         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
594          gtid));
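    // each thread greedily takes a single block of up to ceil(tc / nproc)
    // iterations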
595     pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
596     break;
597   case kmp_sch_static_chunked:
598   case kmp_sch_dynamic_chunked:
599     if (pr->u.p.parm1 <= 0) {
600       pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
601     }
602     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
603                    "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
604                    gtid));
605     break;
606   case kmp_sch_trapezoidal: {
607     /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
608 
609     T parm1, parm2, parm3, parm4;
610     KD_TRACE(100,
611              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
612               gtid));
613 
614     parm1 = chunk;
615 
616     /* F : size of the first cycle */
617     parm2 = (tc / (2 * nproc));
618 
619     if (parm2 < 1) {
620       parm2 = 1;
621     }
622 
623     /* L : size of the last cycle.  Make sure the last cycle is not larger
624        than the first cycle. */
625     if (parm1 < 1) {
626       parm1 = 1;
627     } else if (parm1 > parm2) {
628       parm1 = parm2;
629     }
630 
631     /* N : number of cycles */
632     parm3 = (parm2 + parm1);
633     parm3 = (2 * tc + parm3 - 1) / parm3;
634 
635     if (parm3 < 2) {
636       parm3 = 2;
637     }
638 
639     /* sigma : decreasing incr of the trapezoid */
640     parm4 = (parm3 - 1);
641     parm4 = (parm2 - parm1) / parm4;
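    // E.g., tc=1000, nproc=4, chunk=1: parm2=F=125, parm1=L=1, parm3=N=16
    // cycles, parm4=sigma=8, i.e. chunk sizes 125, 117, 109, ... until the
    // trip count is exhausted.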
642 
643     // pointless check, because parm4 >= 0 always
644     // if ( parm4 < 0 ) {
645     //    parm4 = 0;
646     //}
647 
648     pr->u.p.parm1 = parm1;
649     pr->u.p.parm2 = parm2;
650     pr->u.p.parm3 = parm3;
651     pr->u.p.parm4 = parm4;
652   } // case
653   break;
654 
655   default: {
656     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
657                 KMP_HNT(GetNewerLibrary), // Hint
658                 __kmp_msg_null // Variadic argument list terminator
659                 );
660   } break;
661   } // switch
662   pr->schedule = schedule;
663 }
664 
665 #if KMP_USE_HIER_SCHED
666 template <typename T>
667 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
668                                              typename traits_t<T>::signed_t st);
669 template <>
670 inline void
671 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
672                                             kmp_int32 ub, kmp_int32 st) {
673   __kmp_dispatch_init_hierarchy<kmp_int32>(
674       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
675       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
676 }
677 template <>
678 inline void
679 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
680                                              kmp_uint32 ub, kmp_int32 st) {
681   __kmp_dispatch_init_hierarchy<kmp_uint32>(
682       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
683       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
684 }
685 template <>
686 inline void
687 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
688                                             kmp_int64 ub, kmp_int64 st) {
689   __kmp_dispatch_init_hierarchy<kmp_int64>(
690       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
691       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
692 }
693 template <>
694 inline void
695 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
696                                              kmp_uint64 ub, kmp_int64 st) {
697   __kmp_dispatch_init_hierarchy<kmp_uint64>(
698       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
699       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
700 }
701 
702 // free all the hierarchy scheduling memory associated with the team
703 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
704   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
705   for (int i = 0; i < num_disp_buff; ++i) {
706     // type does not matter here so use kmp_int32
707     auto sh =
708         reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
709             &team->t.t_disp_buffer[i]);
710     if (sh->hier) {
711       sh->hier->deallocate();
712       __kmp_free(sh->hier);
713     }
714   }
715 }
716 #endif
717 
718 // UT - unsigned flavor of T, ST - signed flavor of T,
719 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
720 template <typename T>
721 static void
722 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
723                     T ub, typename traits_t<T>::signed_t st,
724                     typename traits_t<T>::signed_t chunk, int push_ws) {
725   typedef typename traits_t<T>::unsigned_t UT;
726 
727   int active;
728   kmp_info_t *th;
729   kmp_team_t *team;
730   kmp_uint32 my_buffer_index;
731   dispatch_private_info_template<T> *pr;
732   dispatch_shared_info_template<T> volatile *sh;
733 
734   KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
735                    sizeof(dispatch_private_info));
736   KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
737                    sizeof(dispatch_shared_info));
738 
739   if (!TCR_4(__kmp_init_parallel))
740     __kmp_parallel_initialize();
741 
742 #if OMP_50_ENABLED
743   __kmp_resume_if_soft_paused();
744 #endif
745 
746 #if INCLUDE_SSC_MARKS
747   SSC_MARK_DISPATCH_INIT();
748 #endif
749 #ifdef KMP_DEBUG
750   typedef typename traits_t<T>::signed_t ST;
751   {
752     char *buff;
753     // create format specifiers before the debug output
754     buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
755                             "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
756                             traits_t<ST>::spec, traits_t<T>::spec,
757                             traits_t<T>::spec, traits_t<ST>::spec);
758     KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
759     __kmp_str_free(&buff);
760   }
761 #endif
762   /* setup data */
763   th = __kmp_threads[gtid];
764   team = th->th.th_team;
765   active = !team->t.t_serialized;
766   th->th.th_ident = loc;
767 
  // Any half-decent optimizer will remove this test when the blocks are empty
  // since the macros expand to nothing when statistics are disabled.
771   if (schedule == __kmp_static) {
772     KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
773   } else {
774     KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
775   }
776 
777 #if KMP_USE_HIER_SCHED
  // Initialize the scheduling hierarchy if requested in the OMP_SCHEDULE
  // environment variable.  Hierarchical scheduling does not work with ordered,
  // so if ordered is detected, revert to threaded scheduling.
781   bool ordered;
782   enum sched_type my_sched = schedule;
783   my_buffer_index = th->th.th_dispatch->th_disp_index;
784   pr = reinterpret_cast<dispatch_private_info_template<T> *>(
785       &th->th.th_dispatch
786            ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
787   my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
788   if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
789     my_sched =
790         (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
791   ordered = (kmp_ord_lower & my_sched);
792   if (pr->flags.use_hier) {
793     if (ordered) {
794       KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected.  "
795                      "Disabling hierarchical scheduling.\n",
796                      gtid));
797       pr->flags.use_hier = FALSE;
798     }
799   }
800   if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
801     // Don't use hierarchical for ordered parallel loops and don't
802     // use the runtime hierarchy if one was specified in the program
803     if (!ordered && !pr->flags.use_hier)
804       __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
805   }
806 #endif // KMP_USE_HIER_SCHED
807 
808 #if USE_ITT_BUILD
809   kmp_uint64 cur_chunk = chunk;
810   int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
811                                     __kmp_forkjoin_frames_mode == 3 &&
812                                     KMP_MASTER_GTID(gtid) &&
813 #if OMP_40_ENABLED
814                                     th->th.th_teams_microtask == NULL &&
815 #endif
816                                     team->t.t_active_level == 1;
817 #endif
818   if (!active) {
819     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
820         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
821   } else {
822     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
823                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
824 
825     my_buffer_index = th->th.th_dispatch->th_disp_index++;
826 
827     /* What happens when number of threads changes, need to resize buffer? */
828     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
829         &th->th.th_dispatch
830              ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
831     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
832         &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
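    // Private and shared dispatch buffers rotate through
    // __kmp_dispatch_num_buffers slots so that consecutive loops in the same
    // team use distinct buffers.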
833     KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
834                   my_buffer_index));
835   }
836 
837   __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
838 #if USE_ITT_BUILD
839                                 &cur_chunk,
840 #endif
841                                 chunk, (T)th->th.th_team_nproc,
842                                 (T)th->th.th_info.ds.ds_tid);
843   if (active) {
844     if (pr->flags.ordered == 0) {
845       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
846       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
847     } else {
848       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
849       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
850     }
851   }
852 
853   if (active) {
    /* The buffer is free to use once sh->buffer_index catches up to
     * my_buffer_index */
856 
857     KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
858                    "sh->buffer_index:%d\n",
859                    gtid, my_buffer_index, sh->buffer_index));
860     __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index,
861                                  __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    // Note: KMP_WAIT_YIELD() cannot be used here: buffer_index and
    // my_buffer_index are *always* 32-bit integers.
864     KMP_MB(); /* is this necessary? */
865     KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
866                    "sh->buffer_index:%d\n",
867                    gtid, my_buffer_index, sh->buffer_index));
868 
869     th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
870     th->th.th_dispatch->th_dispatch_sh_current =
871         CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
872 #if USE_ITT_BUILD
873     if (pr->flags.ordered) {
874       __kmp_itt_ordered_init(gtid);
875     }
876     // Report loop metadata
877     if (itt_need_metadata_reporting) {
878       // Only report metadata by master of active team at level 1
879       kmp_uint64 schedtype = 0;
880       switch (schedule) {
881       case kmp_sch_static_chunked:
882       case kmp_sch_static_balanced: // Chunk is calculated in the switch above
883         break;
884       case kmp_sch_static_greedy:
885         cur_chunk = pr->u.p.parm1;
886         break;
887       case kmp_sch_dynamic_chunked:
888         schedtype = 1;
889         break;
890       case kmp_sch_guided_iterative_chunked:
891       case kmp_sch_guided_analytical_chunked:
892 #if OMP_45_ENABLED
893       case kmp_sch_guided_simd:
894 #endif
895         schedtype = 2;
896         break;
897       default:
898         // Should we put this case under "static"?
899         // case kmp_sch_static_steal:
900         schedtype = 3;
901         break;
902       }
903       __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
904     }
905 #if KMP_USE_HIER_SCHED
906     if (pr->flags.use_hier) {
907       pr->u.p.count = 0;
908       pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
909     }
#endif // KMP_USE_HIER_SCHED
911 #endif /* USE_ITT_BUILD */
912   }
913 
914 #ifdef KMP_DEBUG
915   {
916     char *buff;
917     // create format specifiers before the debug output
918     buff = __kmp_str_format(
919         "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
920         "lb:%%%s ub:%%%s"
921         " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
922         " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
923         traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
924         traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
925         traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
926         traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
927     KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
928                   pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
929                   pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
930                   pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
931     __kmp_str_free(&buff);
932   }
933 #endif
934 #if (KMP_STATIC_STEAL_ENABLED)
  // It cannot be guaranteed that after execution of a loop with some other
  // schedule kind all the parm3 variables will contain the same value.  Even
  // if all parm3 were the same, there would still be a bad case, like using 0
  // and 1 rather than a program life-time increment.  So a dedicated variable
  // is required; the 'static_steal_counter' is used.
940   if (schedule == kmp_sch_static_steal) {
941     // Other threads will inspect this variable when searching for a victim.
942     // This is a flag showing that other threads may steal from this thread
943     // since then.
944     volatile T *p = &pr->u.p.static_steal_counter;
945     *p = *p + 1;
946   }
947 #endif // ( KMP_STATIC_STEAL_ENABLED )
948 
949 #if OMPT_SUPPORT && OMPT_OPTIONAL
950   if (ompt_enabled.ompt_callback_work) {
951     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
952     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
953     ompt_callbacks.ompt_callback(ompt_callback_work)(
954         ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
955         &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
956   }
957 #endif
958   KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
959 }
960 
961 /* For ordered loops, either __kmp_dispatch_finish() should be called after
962  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
963  * every chunk of iterations.  If the ordered section(s) were not executed
964  * for this iteration (or every iteration in this chunk), we need to set the
965  * ordered iteration counters so that the next thread can proceed. */
966 template <typename UT>
967 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
968   typedef typename traits_t<UT>::signed_t ST;
969   kmp_info_t *th = __kmp_threads[gtid];
970 
971   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
972   if (!th->th.th_team->t.t_serialized) {
973 
974     dispatch_private_info_template<UT> *pr =
975         reinterpret_cast<dispatch_private_info_template<UT> *>(
976             th->th.th_dispatch->th_dispatch_pr_current);
977     dispatch_shared_info_template<UT> volatile *sh =
978         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
979             th->th.th_dispatch->th_dispatch_sh_current);
980     KMP_DEBUG_ASSERT(pr);
981     KMP_DEBUG_ASSERT(sh);
982     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
983                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
984 
985     if (pr->ordered_bumped) {
986       KD_TRACE(
987           1000,
988           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
989            gtid));
990       pr->ordered_bumped = 0;
991     } else {
992       UT lower = pr->u.p.ordered_lower;
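      // The ordered section was not entered for the current iteration, so
      // wait for our turn and then bump the shared counter on this thread's
      // behalf so the next thread can proceed.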
993 
994 #ifdef KMP_DEBUG
995       {
996         char *buff;
997         // create format specifiers before the debug output
998         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
999                                 "ordered_iteration:%%%s lower:%%%s\n",
1000                                 traits_t<UT>::spec, traits_t<UT>::spec);
1001         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1002         __kmp_str_free(&buff);
1003       }
1004 #endif
1005 
1006       __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1007                            __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1008       KMP_MB(); /* is this necessary? */
1009 #ifdef KMP_DEBUG
1010       {
1011         char *buff;
1012         // create format specifiers before the debug output
1013         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1014                                 "ordered_iteration:%%%s lower:%%%s\n",
1015                                 traits_t<UT>::spec, traits_t<UT>::spec);
1016         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1017         __kmp_str_free(&buff);
1018       }
1019 #endif
1020 
1021       test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1022     } // if
1023   } // if
1024   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1025 }
1026 
1027 #ifdef KMP_GOMP_COMPAT
1028 
1029 template <typename UT>
1030 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1031   typedef typename traits_t<UT>::signed_t ST;
1032   kmp_info_t *th = __kmp_threads[gtid];
1033 
1034   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1035   if (!th->th.th_team->t.t_serialized) {
1036     //        int cid;
1037     dispatch_private_info_template<UT> *pr =
1038         reinterpret_cast<dispatch_private_info_template<UT> *>(
1039             th->th.th_dispatch->th_dispatch_pr_current);
1040     dispatch_shared_info_template<UT> volatile *sh =
1041         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1042             th->th.th_dispatch->th_dispatch_sh_current);
1043     KMP_DEBUG_ASSERT(pr);
1044     KMP_DEBUG_ASSERT(sh);
1045     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1046                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1047 
1048     //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1049     UT lower = pr->u.p.ordered_lower;
1050     UT upper = pr->u.p.ordered_upper;
1051     UT inc = upper - lower + 1;
1052 
1053     if (pr->ordered_bumped == inc) {
1054       KD_TRACE(
1055           1000,
1056           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1057            gtid));
1058       pr->ordered_bumped = 0;
1059     } else {
1060       inc -= pr->ordered_bumped;
1061 
1062 #ifdef KMP_DEBUG
1063       {
1064         char *buff;
1065         // create format specifiers before the debug output
1066         buff = __kmp_str_format(
1067             "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1068             "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1069             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1070         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1071         __kmp_str_free(&buff);
1072       }
1073 #endif
1074 
1075       __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1076                            __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1077 
1078       KMP_MB(); /* is this necessary? */
1079       KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1080                       "ordered_bumped to zero\n",
1081                       gtid));
1082       pr->ordered_bumped = 0;
// TODO: check whether inc should be unsigned or signed
1084 #ifdef KMP_DEBUG
1085       {
1086         char *buff;
1087         // create format specifiers before the debug output
1088         buff = __kmp_str_format(
1089             "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1090             "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1091             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1092             traits_t<UT>::spec);
1093         KD_TRACE(1000,
1094                  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1095         __kmp_str_free(&buff);
1096       }
1097 #endif
1098 
1099       test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1100     }
1101     //        }
1102   }
1103   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1104 }
1105 
1106 #endif /* KMP_GOMP_COMPAT */
1107 
1108 template <typename T>
1109 int __kmp_dispatch_next_algorithm(int gtid,
1110                                   dispatch_private_info_template<T> *pr,
1111                                   dispatch_shared_info_template<T> volatile *sh,
1112                                   kmp_int32 *p_last, T *p_lb, T *p_ub,
1113                                   typename traits_t<T>::signed_t *p_st, T nproc,
1114                                   T tid) {
1115   typedef typename traits_t<T>::unsigned_t UT;
1116   typedef typename traits_t<T>::signed_t ST;
1117   typedef typename traits_t<T>::floating_t DBL;
1118   int status = 0;
1119   kmp_int32 last = 0;
1120   T start;
1121   ST incr;
1122   UT limit, trip, init;
1123   kmp_info_t *th = __kmp_threads[gtid];
1124   kmp_team_t *team = th->th.th_team;
1125 
1126   KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1127                    &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1128   KMP_DEBUG_ASSERT(pr);
1129   KMP_DEBUG_ASSERT(sh);
1130   KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1131 #ifdef KMP_DEBUG
1132   {
1133     char *buff;
1134     // create format specifiers before the debug output
1135     buff =
1136         __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1137                          "sh:%%p nproc:%%%s tid:%%%s\n",
1138                          traits_t<T>::spec, traits_t<T>::spec);
1139     KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1140     __kmp_str_free(&buff);
1141   }
1142 #endif
1143 
1144   // zero trip count
1145   if (pr->u.p.tc == 0) {
1146     KD_TRACE(10,
1147              ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1148               "zero status:%d\n",
1149               gtid, status));
1150     return 0;
1151   }
1152 
1153   switch (pr->schedule) {
1154 #if (KMP_STATIC_STEAL_ENABLED)
1155   case kmp_sch_static_steal: {
1156     T chunk = pr->u.p.parm1;
1157 
1158     KD_TRACE(100,
1159              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1160               gtid));
1161 
1162     trip = pr->u.p.tc - 1;
1163 
1164     if (traits_t<T>::type_size > 4) {
1165       // use lock for 8-byte and CAS for 4-byte induction
1166       // variable. TODO (optional): check and use 16-byte CAS
1167       kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
1168       KMP_DEBUG_ASSERT(lck != NULL);
1169       if (pr->u.p.count < (UT)pr->u.p.ub) {
1170         __kmp_acquire_lock(lck, gtid);
1171         // try to get own chunk of iterations
1172         init = (pr->u.p.count)++;
1173         status = (init < (UT)pr->u.p.ub);
1174         __kmp_release_lock(lck, gtid);
1175       } else {
1176         status = 0; // no own chunks
1177       }
1178       if (!status) { // try to steal
1179         kmp_info_t **other_threads = team->t.t_threads;
1180         int while_limit = nproc; // nproc attempts to find a victim
1181         int while_index = 0;
        // TODO: the victim-search algorithm should be cleaned up and measured
1184         while ((!status) && (while_limit != ++while_index)) {
1185           T remaining;
1186           T victimIdx = pr->u.p.parm4;
1187           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1188           dispatch_private_info_template<T> *victim =
1189               reinterpret_cast<dispatch_private_info_template<T> *>(
1190                   other_threads[victimIdx]
1191                       ->th.th_dispatch->th_dispatch_pr_current);
1192           while ((victim == NULL || victim == pr ||
1193                   (*(volatile T *)&victim->u.p.static_steal_counter !=
1194                    *(volatile T *)&pr->u.p.static_steal_counter)) &&
1195                  oldVictimIdx != victimIdx) {
1196             victimIdx = (victimIdx + 1) % nproc;
1197             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1198                 other_threads[victimIdx]
1199                     ->th.th_dispatch->th_dispatch_pr_current);
1200           }
1201           if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1202                           *(volatile T *)&pr->u.p.static_steal_counter)) {
1203             continue; // try once more (nproc attempts in total)
1204             // no victim is ready yet to participate in stealing
1205             // because all victims are still in kmp_init_dispatch
1206           }
1207           if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1208             pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1209             continue; // not enough chunks to steal, goto next victim
1210           }
1211 
1212           lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1213           KMP_ASSERT(lck != NULL);
1214           __kmp_acquire_lock(lck, gtid);
1215           limit = victim->u.p.ub; // keep initial ub
1216           if (victim->u.p.count >= limit ||
1217               (remaining = limit - victim->u.p.count) < 2) {
1218             __kmp_release_lock(lck, gtid);
1219             pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1220             continue; // not enough chunks to steal
1221           }
          // stealing succeeded, reduce victim's ub by 1/4 of undone chunks
          // or by 1
1224           if (remaining > 3) {
1225             // steal 1/4 of remaining
1226             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1227             init = (victim->u.p.ub -= (remaining >> 2));
1228           } else {
1229             // steal 1 chunk of 2 or 3 remaining
1230             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1231             init = (victim->u.p.ub -= 1);
1232           }
1233           __kmp_release_lock(lck, gtid);
1234 
1235           KMP_DEBUG_ASSERT(init + 1 <= limit);
1236           pr->u.p.parm4 = victimIdx; // remember victim to steal from
1237           status = 1;
1238           while_index = 0;
          // now update own count and ub with the stolen range, excluding the
          // init chunk claimed above
1240           __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1241           pr->u.p.count = init + 1;
1242           pr->u.p.ub = limit;
1243           __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1244         } // while (search for victim)
1245       } // if (try to find victim and steal)
1246     } else {
1247       // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1248       typedef union {
1249         struct {
1250           UT count;
1251           T ub;
1252         } p;
1253         kmp_int64 b;
1254       } union_i4;
1255       // All operations on 'count' or 'ub' must be combined atomically
1256       // together.
1257       {
1258         union_i4 vold, vnew;
1259         vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1260         vnew = vold;
1261         vnew.p.count++;
1262         while (!KMP_COMPARE_AND_STORE_ACQ64(
1263             (volatile kmp_int64 *)&pr->u.p.count,
1264             *VOLATILE_CAST(kmp_int64 *) & vold.b,
1265             *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1266           KMP_CPU_PAUSE();
1267           vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1268           vnew = vold;
1269           vnew.p.count++;
1270         }
1271         vnew = vold;
1272         init = vnew.p.count;
1273         status = (init < (UT)vnew.p.ub);
1274       }
1275 
1276       if (!status) {
1277         kmp_info_t **other_threads = team->t.t_threads;
1278         int while_limit = nproc; // nproc attempts to find a victim
1279         int while_index = 0;
1280 
        // TODO: the victim-search algorithm should be cleaned up and measured
1283         while ((!status) && (while_limit != ++while_index)) {
1284           union_i4 vold, vnew;
1285           kmp_int32 remaining;
1286           T victimIdx = pr->u.p.parm4;
1287           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1288           dispatch_private_info_template<T> *victim =
1289               reinterpret_cast<dispatch_private_info_template<T> *>(
1290                   other_threads[victimIdx]
1291                       ->th.th_dispatch->th_dispatch_pr_current);
1292           while ((victim == NULL || victim == pr ||
1293                   (*(volatile T *)&victim->u.p.static_steal_counter !=
1294                    *(volatile T *)&pr->u.p.static_steal_counter)) &&
1295                  oldVictimIdx != victimIdx) {
1296             victimIdx = (victimIdx + 1) % nproc;
1297             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1298                 other_threads[victimIdx]
1299                     ->th.th_dispatch->th_dispatch_pr_current);
1300           }
1301           if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1302                           *(volatile T *)&pr->u.p.static_steal_counter)) {
1303             continue; // try once more (nproc attempts in total)
1304             // no victim is ready yet to participate in stealing
1305             // because all victims are still in kmp_init_dispatch
1306           }
1307           pr->u.p.parm4 = victimIdx; // new victim found
1308           while (1) { // CAS loop if victim has enough chunks to steal
1309             vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1310             vnew = vold;
1311 
1312             KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1313             if (vnew.p.count >= (UT)vnew.p.ub ||
1314                 (remaining = vnew.p.ub - vnew.p.count) < 2) {
1315               pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1316               break; // not enough chunks to steal, goto next victim
1317             }
1318             if (remaining > 3) {
1319               vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
1320             } else {
1321               vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1322             }
1323             KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1324             // TODO: Should this be acquire or release?
1325             if (KMP_COMPARE_AND_STORE_ACQ64(
1326                     (volatile kmp_int64 *)&victim->u.p.count,
1327                     *VOLATILE_CAST(kmp_int64 *) & vold.b,
1328                     *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
              // stealing succeeded
1330               KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1331                                         vold.p.ub - vnew.p.ub);
1332               status = 1;
1333               while_index = 0;
1334               // now update own count and ub
1335               init = vnew.p.ub;
1336               vold.p.count = init + 1;
1337 #if KMP_ARCH_X86
1338               KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1339 #else
1340               *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1341 #endif
1342               break;
1343             } // if (check CAS result)
            KMP_CPU_PAUSE(); // CAS failed, repeat the attempt
1345           } // while (try to steal from particular victim)
1346         } // while (search for victim)
1347       } // if (try to find victim and steal)
1348     } // if (4-byte induction variable)
1349     if (!status) {
1350       *p_lb = 0;
1351       *p_ub = 0;
1352       if (p_st != NULL)
1353         *p_st = 0;
1354     } else {
1355       start = pr->u.p.parm2;
1356       init *= chunk;
1357       limit = chunk + init - 1;
1358       incr = pr->u.p.st;
1359       KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1360 
1361       KMP_DEBUG_ASSERT(init <= trip);
1362       if ((last = (limit >= trip)) != 0)
1363         limit = trip;
1364       if (p_st != NULL)
1365         *p_st = incr;
1366 
1367       if (incr == 1) {
1368         *p_lb = start + init;
1369         *p_ub = start + limit;
1370       } else {
1371         *p_lb = start + init * incr;
1372         *p_ub = start + limit * incr;
1373       }
1374 
1375       if (pr->flags.ordered) {
1376         pr->u.p.ordered_lower = init;
1377         pr->u.p.ordered_upper = limit;
1378       } // if
1379     } // if
1380     break;
1381   } // case
1382 #endif // ( KMP_STATIC_STEAL_ENABLED )
1383   case kmp_sch_static_balanced: {
1384     KD_TRACE(
1385         10,
1386         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1387          gtid));
1388     /* check if thread has any iteration to do */
1389     if ((status = !pr->u.p.count) != 0) {
1390       pr->u.p.count = 1;
1391       *p_lb = pr->u.p.lb;
1392       *p_ub = pr->u.p.ub;
1393       last = pr->u.p.parm1;
1394       if (p_st != NULL)
1395         *p_st = pr->u.p.st;
1396     } else { /* no iterations to do */
1397       pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1398     }
1399   } // case
1400   break;
1401   case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1402                                  merged here */
1403   case kmp_sch_static_chunked: {
1404     T parm1;
1405 
1406     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1407                    "kmp_sch_static_[affinity|chunked] case\n",
1408                    gtid));
1409     parm1 = pr->u.p.parm1;
1410 
1411     trip = pr->u.p.tc - 1;
1412     init = parm1 * (pr->u.p.count + tid);
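    // block-cyclic: with count advancing by nproc per call (below), this
    // thread takes chunks tid, tid + nproc, tid + 2*nproc, ...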
1413 
1414     if ((status = (init <= trip)) != 0) {
1415       start = pr->u.p.lb;
1416       incr = pr->u.p.st;
1417       limit = parm1 + init - 1;
1418 
1419       if ((last = (limit >= trip)) != 0)
1420         limit = trip;
1421 
1422       if (p_st != NULL)
1423         *p_st = incr;
1424 
1425       pr->u.p.count += nproc;
1426 
1427       if (incr == 1) {
1428         *p_lb = start + init;
1429         *p_ub = start + limit;
1430       } else {
1431         *p_lb = start + init * incr;
1432         *p_ub = start + limit * incr;
1433       }
1434 
1435       if (pr->flags.ordered) {
1436         pr->u.p.ordered_lower = init;
1437         pr->u.p.ordered_upper = limit;
1438       } // if
1439     } // if
1440   } // case
1441   break;
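  // Illustration of the round-robin chunking above (an assumed example, not
  // values from a real run): with nproc = 4, tid = 1 and chunk size
  // parm1 = 10, successive calls on this thread compute
  //   init = 10 * (0 + 1) = 10  -> iterations [10, 19]
  //   init = 10 * (4 + 1) = 50  -> iterations [50, 59]
  // i.e. thread tid always receives chunk number k * nproc + tid, because
  // u.p.count advances by nproc on every successful call.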
1442 
1443   case kmp_sch_dynamic_chunked: {
1444     T chunk = pr->u.p.parm1;
1445 
1446     KD_TRACE(
1447         100,
1448         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1449          gtid));
1450 
1451     init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1452     trip = pr->u.p.tc - 1;
1453 
1454     if ((status = (init <= trip)) == 0) {
1455       *p_lb = 0;
1456       *p_ub = 0;
1457       if (p_st != NULL)
1458         *p_st = 0;
1459     } else {
1460       start = pr->u.p.lb;
1461       limit = chunk + init - 1;
1462       incr = pr->u.p.st;
1463 
1464       if ((last = (limit >= trip)) != 0)
1465         limit = trip;
1466 
1467       if (p_st != NULL)
1468         *p_st = incr;
1469 
1470       if (incr == 1) {
1471         *p_lb = start + init;
1472         *p_ub = start + limit;
1473       } else {
1474         *p_lb = start + init * incr;
1475         *p_ub = start + limit * incr;
1476       }
1477 
1478       if (pr->flags.ordered) {
1479         pr->u.p.ordered_lower = init;
1480         pr->u.p.ordered_upper = limit;
1481       } // if
1482     } // if
1483   } // case
1484   break;
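  // Illustration of the dynamic handout above (assumed example values):
  // sh->u.s.iteration is a shared chunk counter and test_then_inc_acq returns
  // its previous value, so with chunk = 8 a thread that observes counter
  // value 3 receives the normalized iterations [24, 31], which are then
  // scaled by st and offset by lb to form *p_lb / *p_ub.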
1485 
1486   case kmp_sch_guided_iterative_chunked: {
1487     T chunkspec = pr->u.p.parm1;
1488     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1489                    "iterative case\n",
1490                    gtid));
1491     trip = pr->u.p.tc;
1492     // Start atomic part of calculations
1493     while (1) {
1494       ST remaining; // signed, because can be < 0
1495       init = sh->u.s.iteration; // shared value
1496       remaining = trip - init;
1497       if (remaining <= 0) { // AC: need to compare with 0 first
1498         // nothing to do, don't try atomic op
1499         status = 0;
1500         break;
1501       }
1502       if ((T)remaining <
1503           pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
        // use dynamic-style schedule
        // atomically increment iterations, get old value
1506         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1507                                  (ST)chunkspec);
1508         remaining = trip - init;
1509         if (remaining <= 0) {
          status = 0; // all iterations were claimed by other threads
1511         } else {
1512           // got some iterations to work on
1513           status = 1;
1514           if ((T)remaining > chunkspec) {
1515             limit = init + chunkspec - 1;
1516           } else {
1517             last = 1; // the last chunk
1518             limit = init + remaining - 1;
1519           } // if
1520         } // if
1521         break;
1522       } // if
1523       limit = init +
1524               (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
1525       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1526                                (ST)init, (ST)limit)) {
1527         // CAS was successful, chunk obtained
1528         status = 1;
1529         --limit;
1530         break;
1531       } // if
1532     } // while
1533     if (status != 0) {
1534       start = pr->u.p.lb;
1535       incr = pr->u.p.st;
1536       if (p_st != NULL)
1537         *p_st = incr;
1538       *p_lb = start + init * incr;
1539       *p_ub = start + limit * incr;
1540       if (pr->flags.ordered) {
1541         pr->u.p.ordered_lower = init;
1542         pr->u.p.ordered_upper = limit;
1543       } // if
1544     } else {
1545       *p_lb = 0;
1546       *p_ub = 0;
1547       if (p_st != NULL)
1548         *p_st = 0;
1549     } // if
1550   } // case
1551   break;
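  // Illustration of the guided (iterative) sizing above (assumed example
  // values): parm3 holds a double of roughly 1/(K*nproc) and parm2 holds
  // K*nproc*(chunk+1), with K = 2 by default.  With trip = 10000 and
  // nproc = 4, the first arriving thread tries to claim about
  // remaining / (K*nproc) = 10000 / 8 = 1250 iterations via the CAS; once the
  // remainder drops below parm2, the loop switches to plain dynamic handout
  // of chunkspec-sized pieces.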
1552 
1553 #if OMP_45_ENABLED
1554   case kmp_sch_guided_simd: {
    // same as the iterative variant, but the current chunk is adjusted to be
    // a multiple of the given chunk
1557     T chunk = pr->u.p.parm1;
1558     KD_TRACE(100,
1559              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1560               gtid));
1561     trip = pr->u.p.tc;
1562     // Start atomic part of calculations
1563     while (1) {
1564       ST remaining; // signed, because can be < 0
1565       init = sh->u.s.iteration; // shared value
1566       remaining = trip - init;
1567       if (remaining <= 0) { // AC: need to compare with 0 first
1568         status = 0; // nothing to do, don't try atomic op
1569         break;
1570       }
1571       KMP_DEBUG_ASSERT(init % chunk == 0);
1572       // compare with K*nproc*(chunk+1), K=2 by default
1573       if ((T)remaining < pr->u.p.parm2) {
        // use dynamic-style schedule
        // atomically increment iterations, get old value
1576         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1577                                  (ST)chunk);
1578         remaining = trip - init;
1579         if (remaining <= 0) {
          status = 0; // all iterations were claimed by other threads
1581         } else {
1582           // got some iterations to work on
1583           status = 1;
1584           if ((T)remaining > chunk) {
1585             limit = init + chunk - 1;
1586           } else {
1587             last = 1; // the last chunk
1588             limit = init + remaining - 1;
1589           } // if
1590         } // if
1591         break;
1592       } // if
1593       // divide by K*nproc
1594       UT span = remaining * (*(double *)&pr->u.p.parm3);
1595       UT rem = span % chunk;
1596       if (rem) // adjust so that span%chunk == 0
1597         span += chunk - rem;
1598       limit = init + span;
1599       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1600                                (ST)init, (ST)limit)) {
1601         // CAS was successful, chunk obtained
1602         status = 1;
1603         --limit;
1604         break;
1605       } // if
1606     } // while
1607     if (status != 0) {
1608       start = pr->u.p.lb;
1609       incr = pr->u.p.st;
1610       if (p_st != NULL)
1611         *p_st = incr;
1612       *p_lb = start + init * incr;
1613       *p_ub = start + limit * incr;
1614       if (pr->flags.ordered) {
1615         pr->u.p.ordered_lower = init;
1616         pr->u.p.ordered_upper = limit;
1617       } // if
1618     } else {
1619       *p_lb = 0;
1620       *p_ub = 0;
1621       if (p_st != NULL)
1622         *p_st = 0;
1623     } // if
1624   } // case
1625   break;
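  // Illustration of the SIMD rounding above (assumed example values): if
  // remaining * parm3 yields a span of 37 iterations and chunk = 8, then
  // rem = 37 % 8 = 5 and the span is padded to 40, so each successful CAS
  // hands out a whole number of chunk-sized pieces and init remains a
  // multiple of chunk, as the KMP_DEBUG_ASSERT above expects.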
1626 #endif // OMP_45_ENABLED
1627 
1628   case kmp_sch_guided_analytical_chunked: {
1629     T chunkspec = pr->u.p.parm1;
1630     UT chunkIdx;
1631 #if KMP_USE_X87CONTROL
1632     /* for storing original FPCW value for Windows* OS on
1633        IA-32 architecture 8-byte version */
1634     unsigned int oldFpcw;
1635     unsigned int fpcwSet = 0;
1636 #endif
1637     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1638                    "kmp_sch_guided_analytical_chunked case\n",
1639                    gtid));
1640 
1641     trip = pr->u.p.tc;
1642 
1643     KMP_DEBUG_ASSERT(nproc > 1);
1644     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1645 
1646     while (1) { /* this while loop is a safeguard against unexpected zero
1647                    chunk sizes */
1648       chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1649       if (chunkIdx >= (UT)pr->u.p.parm2) {
1650         --trip;
1651         /* use dynamic-style scheduling */
1652         init = chunkIdx * chunkspec + pr->u.p.count;
1653         /* need to verify init > 0 in case of overflow in the above
1654          * calculation */
1655         if ((status = (init > 0 && init <= trip)) != 0) {
1656           limit = init + chunkspec - 1;
1657 
1658           if ((last = (limit >= trip)) != 0)
1659             limit = trip;
1660         }
1661         break;
1662       } else {
1663 /* use exponential-style scheduling */
/* The following check is to work around the lack of long double precision on
   Windows* OS.
   This check works around the possible effect that init != 0 for chunkIdx == 0.
 */
1668 #if KMP_USE_X87CONTROL
1669         /* If we haven't already done so, save original
1670            FPCW and set precision to 64-bit, as Windows* OS
1671            on IA-32 architecture defaults to 53-bit */
1672         if (!fpcwSet) {
1673           oldFpcw = _control87(0, 0);
1674           _control87(_PC_64, _MCW_PC);
1675           fpcwSet = 0x30000;
1676         }
1677 #endif
1678         if (chunkIdx) {
1679           init = __kmp_dispatch_guided_remaining<T>(
1680               trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1681           KMP_DEBUG_ASSERT(init);
1682           init = trip - init;
1683         } else
1684           init = 0;
1685         limit = trip - __kmp_dispatch_guided_remaining<T>(
1686                            trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1687         KMP_ASSERT(init <= limit);
1688         if (init < limit) {
1689           KMP_DEBUG_ASSERT(limit <= trip);
1690           --limit;
1691           status = 1;
1692           break;
1693         } // if
1694       } // if
1695     } // while (1)
1696 #if KMP_USE_X87CONTROL
1697     /* restore FPCW if necessary
1698        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1699     */
1700     if (fpcwSet && (oldFpcw & fpcwSet))
1701       _control87(oldFpcw, _MCW_PC);
1702 #endif
1703     if (status != 0) {
1704       start = pr->u.p.lb;
1705       incr = pr->u.p.st;
1706       if (p_st != NULL)
1707         *p_st = incr;
1708       *p_lb = start + init * incr;
1709       *p_ub = start + limit * incr;
1710       if (pr->flags.ordered) {
1711         pr->u.p.ordered_lower = init;
1712         pr->u.p.ordered_upper = limit;
1713       }
1714     } else {
1715       *p_lb = 0;
1716       *p_ub = 0;
1717       if (p_st != NULL)
1718         *p_st = 0;
1719     }
1720   } // case
1721   break;
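  // The analytical variant above runs in two regimes: for the first parm2
  // chunk indices the boundaries come from the closed form in
  // __kmp_dispatch_guided_remaining() (exponentially shrinking chunks), and
  // once chunkIdx reaches parm2 it falls back to plain dynamic handout of
  // chunkspec-sized pieces.  The x87 control-word manipulation exists only
  // because the closed form needs more floating-point precision than the
  // 53-bit default of Windows* OS on IA-32.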
1722 
1723   case kmp_sch_trapezoidal: {
1724     UT index;
1725     T parm2 = pr->u.p.parm2;
1726     T parm3 = pr->u.p.parm3;
1727     T parm4 = pr->u.p.parm4;
1728     KD_TRACE(100,
1729              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1730               gtid));
1731 
1732     index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1733 
1734     init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
1735     trip = pr->u.p.tc - 1;
1736 
1737     if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1738       *p_lb = 0;
1739       *p_ub = 0;
1740       if (p_st != NULL)
1741         *p_st = 0;
1742     } else {
1743       start = pr->u.p.lb;
1744       limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1745       incr = pr->u.p.st;
1746 
1747       if ((last = (limit >= trip)) != 0)
1748         limit = trip;
1749 
1750       if (p_st != NULL)
1751         *p_st = incr;
1752 
1753       if (incr == 1) {
1754         *p_lb = start + init;
1755         *p_ub = start + limit;
1756       } else {
1757         *p_lb = start + init * incr;
1758         *p_ub = start + limit * incr;
1759       }
1760 
1761       if (pr->flags.ordered) {
1762         pr->u.p.ordered_lower = init;
1763         pr->u.p.ordered_upper = limit;
1764       } // if
1765     } // if
1766   } // case
1767   break;
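  // Illustration of the trapezoid sizing above (assumed example values):
  // chunk k contains roughly parm2 - k * parm4 iterations, and init/limit are
  // the closed-form prefix sums of that decreasing series.  With parm2 = 10
  // (first chunk size) and parm4 = 2 (decrement per chunk):
  //   index = 0: init = 0,  limit = (1 * 20) / 2 - 1       = 9   (10 iters)
  //   index = 1: init = 10, limit = (2 * (20 - 2)) / 2 - 1 = 17  (8 iters)
  // The index < parm3 test stops threads once all planned chunks have been
  // handed out.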
1768   default: {
1769     status = 0; // to avoid complaints on uninitialized variable use
1770     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1771                 KMP_HNT(GetNewerLibrary), // Hint
1772                 __kmp_msg_null // Variadic argument list terminator
1773                 );
1774   } break;
1775   } // switch
1776   if (p_last)
1777     *p_last = last;
1778 #ifdef KMP_DEBUG
1779   if (pr->flags.ordered) {
1780     char *buff;
1781     // create format specifiers before the debug output
1782     buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1783                             "ordered_lower:%%%s ordered_upper:%%%s\n",
1784                             traits_t<UT>::spec, traits_t<UT>::spec);
1785     KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1786     __kmp_str_free(&buff);
1787   }
1788   {
1789     char *buff;
1790     // create format specifiers before the debug output
1791     buff = __kmp_str_format(
1792         "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1793         "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1794         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1795     KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1796     __kmp_str_free(&buff);
1797   }
1798 #endif
1799   return status;
1800 }
1801 
1802 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1803    work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1804    is not called. */
1805 #if OMPT_SUPPORT && OMPT_OPTIONAL
1806 #define OMPT_LOOP_END                                                          \
1807   if (status == 0) {                                                           \
1808     if (ompt_enabled.ompt_callback_work) {                                     \
1809       ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
1810       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
1811       ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
1812           ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
1813           &(task_info->task_data), 0, codeptr);                                \
1814     }                                                                          \
1815   }
1816 // TODO: implement count
1817 #else
1818 #define OMPT_LOOP_END // no-op
1819 #endif
1820 
1821 #if KMP_STATS_ENABLED
1822 #define KMP_STATS_LOOP_END                                                     \
1823   {                                                                            \
1824     kmp_int64 u, l, t, i;                                                      \
1825     l = (kmp_int64)(*p_lb);                                                    \
1826     u = (kmp_int64)(*p_ub);                                                    \
1827     i = (kmp_int64)(pr->u.p.st);                                               \
1828     if (status == 0) {                                                         \
1829       t = 0;                                                                   \
1830       KMP_POP_PARTITIONED_TIMER();                                             \
1831     } else if (i == 1) {                                                       \
1832       if (u >= l)                                                              \
1833         t = u - l + 1;                                                         \
1834       else                                                                     \
1835         t = 0;                                                                 \
1836     } else if (i < 0) {                                                        \
1837       if (l >= u)                                                              \
1838         t = (l - u) / (-i) + 1;                                                \
1839       else                                                                     \
1840         t = 0;                                                                 \
1841     } else {                                                                   \
1842       if (u >= l)                                                              \
1843         t = (u - l) / i + 1;                                                   \
1844       else                                                                     \
1845         t = 0;                                                                 \
1846     }                                                                          \
1847     KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
1848   }
1849 #else
1850 #define KMP_STATS_LOOP_END /* Nothing */
1851 #endif
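
/* Example of the trip-count arithmetic in KMP_STATS_LOOP_END above (assumed
   sample values): for a returned chunk with *p_lb = 3, *p_ub = 11 and stride
   pr->u.p.st = 4, the macro records t = (11 - 3) / 4 + 1 = 3 iterations,
   namely {3, 7, 11}.  For a negative stride the roles of l and u swap, and a
   zero status always records zero iterations. */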
1852 
1853 template <typename T>
1854 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1855                                T *p_lb, T *p_ub,
1856                                typename traits_t<T>::signed_t *p_st
1857 #if OMPT_SUPPORT && OMPT_OPTIONAL
1858                                ,
1859                                void *codeptr
1860 #endif
1861                                ) {
1862 
1863   typedef typename traits_t<T>::unsigned_t UT;
1864   typedef typename traits_t<T>::signed_t ST;
  // This is potentially slightly misleading: schedule(runtime) will appear here
  // even if the actual runtime schedule is static. (Which points out a
  // disadvantage of schedule(runtime): even when static scheduling is used, it
  // costs more than a compile-time choice of static scheduling would.)
1869   KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
1870 
1871   int status;
1872   dispatch_private_info_template<T> *pr;
1873   kmp_info_t *th = __kmp_threads[gtid];
1874   kmp_team_t *team = th->th.th_team;
1875 
1876   KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1877   KD_TRACE(
1878       1000,
1879       ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
1880        gtid, p_lb, p_ub, p_st, p_last));
1881 
1882   if (team->t.t_serialized) {
    /* NOTE: serialize this dispatch because we are not at the active level */
1884     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1885         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1886     KMP_DEBUG_ASSERT(pr);
1887 
1888     if ((status = (pr->u.p.tc != 0)) == 0) {
1889       *p_lb = 0;
1890       *p_ub = 0;
1891       //            if ( p_last != NULL )
1892       //                *p_last = 0;
1893       if (p_st != NULL)
1894         *p_st = 0;
1895       if (__kmp_env_consistency_check) {
1896         if (pr->pushed_ws != ct_none) {
1897           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1898         }
1899       }
1900     } else if (pr->flags.nomerge) {
1901       kmp_int32 last;
1902       T start;
1903       UT limit, trip, init;
1904       ST incr;
1905       T chunk = pr->u.p.parm1;
1906 
1907       KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1908                      gtid));
1909 
1910       init = chunk * pr->u.p.count++;
1911       trip = pr->u.p.tc - 1;
1912 
1913       if ((status = (init <= trip)) == 0) {
1914         *p_lb = 0;
1915         *p_ub = 0;
1916         //                if ( p_last != NULL )
1917         //                    *p_last = 0;
1918         if (p_st != NULL)
1919           *p_st = 0;
1920         if (__kmp_env_consistency_check) {
1921           if (pr->pushed_ws != ct_none) {
1922             pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1923           }
1924         }
1925       } else {
1926         start = pr->u.p.lb;
1927         limit = chunk + init - 1;
1928         incr = pr->u.p.st;
1929 
1930         if ((last = (limit >= trip)) != 0) {
1931           limit = trip;
1932 #if KMP_OS_WINDOWS
1933           pr->u.p.last_upper = pr->u.p.ub;
1934 #endif /* KMP_OS_WINDOWS */
1935         }
1936         if (p_last != NULL)
1937           *p_last = last;
1938         if (p_st != NULL)
1939           *p_st = incr;
1940         if (incr == 1) {
1941           *p_lb = start + init;
1942           *p_ub = start + limit;
1943         } else {
1944           *p_lb = start + init * incr;
1945           *p_ub = start + limit * incr;
1946         }
1947 
1948         if (pr->flags.ordered) {
1949           pr->u.p.ordered_lower = init;
1950           pr->u.p.ordered_upper = limit;
1951 #ifdef KMP_DEBUG
1952           {
1953             char *buff;
1954             // create format specifiers before the debug output
1955             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1956                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
1957                                     traits_t<UT>::spec, traits_t<UT>::spec);
1958             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1959                             pr->u.p.ordered_upper));
1960             __kmp_str_free(&buff);
1961           }
1962 #endif
1963         } // if
1964       } // if
1965     } else {
1966       pr->u.p.tc = 0;
1967       *p_lb = pr->u.p.lb;
1968       *p_ub = pr->u.p.ub;
1969 #if KMP_OS_WINDOWS
1970       pr->u.p.last_upper = *p_ub;
1971 #endif /* KMP_OS_WINDOWS */
1972       if (p_last != NULL)
1973         *p_last = TRUE;
1974       if (p_st != NULL)
1975         *p_st = pr->u.p.st;
1976     } // if
1977 #ifdef KMP_DEBUG
1978     {
1979       char *buff;
1980       // create format specifiers before the debug output
1981       buff = __kmp_str_format(
1982           "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
1983           "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
1984           traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1985       KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
1986       __kmp_str_free(&buff);
1987     }
1988 #endif
1989 #if INCLUDE_SSC_MARKS
1990     SSC_MARK_DISPATCH_NEXT();
1991 #endif
1992     OMPT_LOOP_END;
1993     KMP_STATS_LOOP_END;
1994     return status;
1995   } else {
1996     kmp_int32 last = 0;
1997     dispatch_shared_info_template<T> volatile *sh;
1998 
1999     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2000                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2001 
2002     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2003         th->th.th_dispatch->th_dispatch_pr_current);
2004     KMP_DEBUG_ASSERT(pr);
2005     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2006         th->th.th_dispatch->th_dispatch_sh_current);
2007     KMP_DEBUG_ASSERT(sh);
2008 
2009 #if KMP_USE_HIER_SCHED
2010     if (pr->flags.use_hier)
2011       status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2012     else
2013 #endif // KMP_USE_HIER_SCHED
2014       status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2015                                                 p_st, th->th.th_team_nproc,
2016                                                 th->th.th_info.ds.ds_tid);
2017     // status == 0: no more iterations to execute
2018     if (status == 0) {
2019       UT num_done;
2020 
2021       num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2022 #ifdef KMP_DEBUG
2023       {
2024         char *buff;
2025         // create format specifiers before the debug output
2026         buff = __kmp_str_format(
2027             "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2028             traits_t<UT>::spec);
2029         KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2030         __kmp_str_free(&buff);
2031       }
2032 #endif
2033 
2034 #if KMP_USE_HIER_SCHED
2035       pr->flags.use_hier = FALSE;
2036 #endif
2037       if ((ST)num_done == th->th.th_team_nproc - 1) {
2038 #if (KMP_STATIC_STEAL_ENABLED)
2039         if (pr->schedule == kmp_sch_static_steal &&
2040             traits_t<T>::type_size > 4) {
2041           int i;
2042           kmp_info_t **other_threads = team->t.t_threads;
2043           // loop complete, safe to destroy locks used for stealing
2044           for (i = 0; i < th->th.th_team_nproc; ++i) {
2045             kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2046             KMP_ASSERT(lck != NULL);
2047             __kmp_destroy_lock(lck);
2048             __kmp_free(lck);
2049             other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2050           }
2051         }
2052 #endif
2053         /* NOTE: release this buffer to be reused */
2054 
2055         KMP_MB(); /* Flush all pending memory write invalidates.  */
2056 
2057         sh->u.s.num_done = 0;
2058         sh->u.s.iteration = 0;
2059 
2060         /* TODO replace with general release procedure? */
2061         if (pr->flags.ordered) {
2062           sh->u.s.ordered_iteration = 0;
2063         }
2064 
2065         KMP_MB(); /* Flush all pending memory write invalidates.  */
2066 
2067         sh->buffer_index += __kmp_dispatch_num_buffers;
2068         KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2069                        gtid, sh->buffer_index));
2070 
2071         KMP_MB(); /* Flush all pending memory write invalidates.  */
2072 
2073       } // if
2074       if (__kmp_env_consistency_check) {
2075         if (pr->pushed_ws != ct_none) {
2076           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2077         }
2078       }
2079 
2080       th->th.th_dispatch->th_deo_fcn = NULL;
2081       th->th.th_dispatch->th_dxo_fcn = NULL;
2082       th->th.th_dispatch->th_dispatch_sh_current = NULL;
2083       th->th.th_dispatch->th_dispatch_pr_current = NULL;
2084     } // if (status == 0)
2085 #if KMP_OS_WINDOWS
2086     else if (last) {
2087       pr->u.p.last_upper = pr->u.p.ub;
2088     }
2089 #endif /* KMP_OS_WINDOWS */
2090     if (p_last != NULL && status != 0)
2091       *p_last = last;
2092   } // if
2093 
2094 #ifdef KMP_DEBUG
2095   {
2096     char *buff;
2097     // create format specifiers before the debug output
2098     buff = __kmp_str_format(
2099         "__kmp_dispatch_next: T#%%d normal case: "
2100         "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2101         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2102     KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2103                   (p_last ? *p_last : 0), status));
2104     __kmp_str_free(&buff);
2105   }
2106 #endif
2107 #if INCLUDE_SSC_MARKS
2108   SSC_MARK_DISPATCH_NEXT();
2109 #endif
2110   OMPT_LOOP_END;
2111   KMP_STATS_LOOP_END;
2112   return status;
2113 }
2114 
2115 template <typename T>
2116 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2117                                   kmp_int32 *plastiter, T *plower, T *pupper,
2118                                   typename traits_t<T>::signed_t incr) {
2119   typedef typename traits_t<T>::unsigned_t UT;
2120   kmp_uint32 team_id;
2121   kmp_uint32 nteams;
2122   UT trip_count;
2123   kmp_team_t *team;
2124   kmp_info_t *th;
2125 
2126   KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2127   KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2128 #ifdef KMP_DEBUG
2129   typedef typename traits_t<T>::signed_t ST;
2130   {
2131     char *buff;
2132     // create format specifiers before the debug output
2133     buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2134                             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2135                             traits_t<T>::spec, traits_t<T>::spec,
2136                             traits_t<ST>::spec, traits_t<T>::spec);
2137     KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2138     __kmp_str_free(&buff);
2139   }
2140 #endif
2141 
2142   if (__kmp_env_consistency_check) {
2143     if (incr == 0) {
2144       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2145                             loc);
2146     }
2147     if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2148       // The loop is illegal.
      // Some zero-trip loops are maintained by the compiler, e.g.:
2150       //   for(i=10;i<0;++i) // lower >= upper - run-time check
2151       //   for(i=0;i>10;--i) // lower <= upper - run-time check
2152       //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2153       //   for(i=10;i<0;--i) // incr < 0       - compile-time check
      // The compiler does not check the following illegal loops:
2155       //   for(i=0;i<10;i+=incr) // where incr<0
2156       //   for(i=10;i>0;i-=incr) // where incr<0
2157       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2158     }
2159   }
2160   th = __kmp_threads[gtid];
2161   team = th->th.th_team;
2162 #if OMP_40_ENABLED
2163   KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2164   nteams = th->th.th_teams_size.nteams;
2165 #endif
2166   team_id = team->t.t_master_tid;
2167   KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2168 
2169   // compute global trip count
2170   if (incr == 1) {
2171     trip_count = *pupper - *plower + 1;
2172   } else if (incr == -1) {
2173     trip_count = *plower - *pupper + 1;
2174   } else if (incr > 0) {
2175     // upper-lower can exceed the limit of signed type
2176     trip_count = (UT)(*pupper - *plower) / incr + 1;
2177   } else {
2178     trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2179   }
2180 
2181   if (trip_count <= nteams) {
2182     KMP_DEBUG_ASSERT(
2183         __kmp_static == kmp_sch_static_greedy ||
2184         __kmp_static ==
2185             kmp_sch_static_balanced); // Unknown static scheduling type.
    // only some teams get a single iteration; the others get nothing
2187     if (team_id < trip_count) {
2188       *pupper = *plower = *plower + team_id * incr;
2189     } else {
2190       *plower = *pupper + incr; // zero-trip loop
2191     }
2192     if (plastiter != NULL)
2193       *plastiter = (team_id == trip_count - 1);
2194   } else {
2195     if (__kmp_static == kmp_sch_static_balanced) {
2196       UT chunk = trip_count / nteams;
2197       UT extras = trip_count % nteams;
2198       *plower +=
2199           incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2200       *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2201       if (plastiter != NULL)
2202         *plastiter = (team_id == nteams - 1);
2203     } else {
2204       T chunk_inc_count =
2205           (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2206       T upper = *pupper;
2207       KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2208       // Unknown static scheduling type.
2209       *plower += team_id * chunk_inc_count;
2210       *pupper = *plower + chunk_inc_count - incr;
2211       // Check/correct bounds if needed
2212       if (incr > 0) {
2213         if (*pupper < *plower)
2214           *pupper = traits_t<T>::max_value;
2215         if (plastiter != NULL)
2216           *plastiter = *plower <= upper && *pupper > upper - incr;
2217         if (*pupper > upper)
2218           *pupper = upper; // tracker C73258
2219       } else {
2220         if (*pupper > *plower)
2221           *pupper = traits_t<T>::min_value;
2222         if (plastiter != NULL)
2223           *plastiter = *plower >= upper && *pupper < upper - incr;
2224         if (*pupper < upper)
2225           *pupper = upper; // tracker C73258
2226       }
2227     }
2228   }
2229 }
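
/* Worked example for __kmp_dist_get_bounds (assumed sample values): with
   *plower = 0, *pupper = 99, incr = 1 and nteams = 4 the trip count is 100.
   Under kmp_sch_static_balanced, chunk = 25 and extras = 0, so team_id = 2
   ends up with the bounds [50, 74] and only the last team reports
   *plastiter != 0.  The greedy branch produces the same split here because
   100 divides evenly by 4. */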
2230 
2231 //-----------------------------------------------------------------------------
2232 // Dispatch routines
2233 //    Transfer call to template< type T >
2234 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2235 //                         T lb, T ub, ST st, ST chunk )
2236 extern "C" {
2237 
2238 /*!
2239 @ingroup WORK_SHARING
2240 @{
2241 @param loc Source location
2242 @param gtid Global thread id
2243 @param schedule Schedule type
2244 @param lb  Lower bound
2245 @param ub  Upper bound
2246 @param st  Step (or increment if you prefer)
2247 @param chunk The chunk size to block with
2248 
2249 This function prepares the runtime to start a dynamically scheduled for loop,
2250 saving the loop arguments.
2251 These functions are all identical apart from the types of the arguments.
2252 */
2253 
2254 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2255                             enum sched_type schedule, kmp_int32 lb,
2256                             kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2257   KMP_DEBUG_ASSERT(__kmp_init_serial);
2258 #if OMPT_SUPPORT && OMPT_OPTIONAL
2259   OMPT_STORE_RETURN_ADDRESS(gtid);
2260 #endif
2261   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2262 }
2263 /*!
2264 See @ref __kmpc_dispatch_init_4
2265 */
2266 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2267                              enum sched_type schedule, kmp_uint32 lb,
2268                              kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2269   KMP_DEBUG_ASSERT(__kmp_init_serial);
2270 #if OMPT_SUPPORT && OMPT_OPTIONAL
2271   OMPT_STORE_RETURN_ADDRESS(gtid);
2272 #endif
2273   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2274 }
2275 
2276 /*!
2277 See @ref __kmpc_dispatch_init_4
2278 */
2279 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2280                             enum sched_type schedule, kmp_int64 lb,
2281                             kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2282   KMP_DEBUG_ASSERT(__kmp_init_serial);
2283 #if OMPT_SUPPORT && OMPT_OPTIONAL
2284   OMPT_STORE_RETURN_ADDRESS(gtid);
2285 #endif
2286   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2287 }
2288 
2289 /*!
2290 See @ref __kmpc_dispatch_init_4
2291 */
2292 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2293                              enum sched_type schedule, kmp_uint64 lb,
2294                              kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2295   KMP_DEBUG_ASSERT(__kmp_init_serial);
2296 #if OMPT_SUPPORT && OMPT_OPTIONAL
2297   OMPT_STORE_RETURN_ADDRESS(gtid);
2298 #endif
2299   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2300 }
2301 
2302 /*!
2303 See @ref __kmpc_dispatch_init_4
2304 
These functions differ from the __kmpc_dispatch_init set in that they are
called for the composite distribute parallel for construct. Thus, before
dispatching the regular iterations, the per-team iteration space must be
computed first.
2308 
2309 These functions are all identical apart from the types of the arguments.
2310 */
2311 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2312                                  enum sched_type schedule, kmp_int32 *p_last,
2313                                  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2314                                  kmp_int32 chunk) {
2315   KMP_DEBUG_ASSERT(__kmp_init_serial);
2316 #if OMPT_SUPPORT && OMPT_OPTIONAL
2317   OMPT_STORE_RETURN_ADDRESS(gtid);
2318 #endif
2319   __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2320   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2321 }
2322 
2323 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2324                                   enum sched_type schedule, kmp_int32 *p_last,
2325                                   kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2326                                   kmp_int32 chunk) {
2327   KMP_DEBUG_ASSERT(__kmp_init_serial);
2328 #if OMPT_SUPPORT && OMPT_OPTIONAL
2329   OMPT_STORE_RETURN_ADDRESS(gtid);
2330 #endif
2331   __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2332   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2333 }
2334 
2335 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2336                                  enum sched_type schedule, kmp_int32 *p_last,
2337                                  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2338                                  kmp_int64 chunk) {
2339   KMP_DEBUG_ASSERT(__kmp_init_serial);
2340 #if OMPT_SUPPORT && OMPT_OPTIONAL
2341   OMPT_STORE_RETURN_ADDRESS(gtid);
2342 #endif
2343   __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2344   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2345 }
2346 
2347 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2348                                   enum sched_type schedule, kmp_int32 *p_last,
2349                                   kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2350                                   kmp_int64 chunk) {
2351   KMP_DEBUG_ASSERT(__kmp_init_serial);
2352 #if OMPT_SUPPORT && OMPT_OPTIONAL
2353   OMPT_STORE_RETURN_ADDRESS(gtid);
2354 #endif
2355   __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2356   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2357 }
2358 
2359 /*!
2360 @param loc Source code location
2361 @param gtid Global thread id
2362 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2363 otherwise
2364 @param p_lb   Pointer to the lower bound for the next chunk of work
2365 @param p_ub   Pointer to the upper bound for the next chunk of work
2366 @param p_st   Pointer to the stride for the next chunk of work
2367 @return one if there is work to be done, zero otherwise
2368 
2369 Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then lb, ub and stride need not be modified.
2371 */
2372 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2373                            kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2374 #if OMPT_SUPPORT && OMPT_OPTIONAL
2375   OMPT_STORE_RETURN_ADDRESS(gtid);
2376 #endif
2377   return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2378 #if OMPT_SUPPORT && OMPT_OPTIONAL
2379                                         ,
2380                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2381 #endif
2382                                             );
2383 }
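
/* A minimal usage sketch (an illustration under assumed names, not code
   emitted by any particular compiler) of how these entry points drive a
   dynamically scheduled loop such as
     #pragma omp for schedule(dynamic, 4)
     for (int i = 0; i < n; ++i) body(i);

   __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, 0, n - 1, 1, 4);
   kmp_int32 lb, ub, st, last;
   while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
     for (kmp_int32 i = lb; i <= ub; i += st)
       body(i);
   }

   Here body(), n, loc and gtid are placeholders; __kmpc_dispatch_fini_4 would
   additionally come into play when the loop carries an ordered clause. */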
2384 
2385 /*!
2386 See @ref __kmpc_dispatch_next_4
2387 */
2388 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2389                             kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2390                             kmp_int32 *p_st) {
2391 #if OMPT_SUPPORT && OMPT_OPTIONAL
2392   OMPT_STORE_RETURN_ADDRESS(gtid);
2393 #endif
2394   return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2395 #if OMPT_SUPPORT && OMPT_OPTIONAL
2396                                          ,
2397                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2398 #endif
2399                                              );
2400 }
2401 
2402 /*!
2403 See @ref __kmpc_dispatch_next_4
2404 */
2405 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2406                            kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2407 #if OMPT_SUPPORT && OMPT_OPTIONAL
2408   OMPT_STORE_RETURN_ADDRESS(gtid);
2409 #endif
2410   return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2411 #if OMPT_SUPPORT && OMPT_OPTIONAL
2412                                         ,
2413                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2414 #endif
2415                                             );
2416 }
2417 
2418 /*!
2419 See @ref __kmpc_dispatch_next_4
2420 */
2421 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2422                             kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2423                             kmp_int64 *p_st) {
2424 #if OMPT_SUPPORT && OMPT_OPTIONAL
2425   OMPT_STORE_RETURN_ADDRESS(gtid);
2426 #endif
2427   return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2428 #if OMPT_SUPPORT && OMPT_OPTIONAL
2429                                          ,
2430                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2431 #endif
2432                                              );
2433 }
2434 
2435 /*!
2436 @param loc Source code location
2437 @param gtid Global thread id
2438 
2439 Mark the end of a dynamic loop.
2440 */
2441 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2442   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2443 }
2444 
2445 /*!
2446 See @ref __kmpc_dispatch_fini_4
2447 */
2448 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2449   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2450 }
2451 
2452 /*!
2453 See @ref __kmpc_dispatch_fini_4
2454 */
2455 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2456   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2457 }
2458 
2459 /*!
2460 See @ref __kmpc_dispatch_fini_4
2461 */
2462 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2463   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2464 }
2465 /*! @} */
2466 
2467 //-----------------------------------------------------------------------------
2468 // Non-template routines from kmp_dispatch.cpp used in other sources
2469 
2470 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2471   return value == checker;
2472 }
2473 
2474 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2475   return value != checker;
2476 }
2477 
2478 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2479   return value < checker;
2480 }
2481 
2482 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2483   return value >= checker;
2484 }
2485 
2486 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2487   return value <= checker;
2488 }
2489 
2490 kmp_uint32
2491 __kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2492                    kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2493                    void *obj // Higher-level synchronization object, or NULL.
2494                    ) {
2495   // note: we may not belong to a team at this point
2496   volatile kmp_uint32 *spin = spinner;
2497   kmp_uint32 check = checker;
2498   kmp_uint32 spins;
2499   kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2500   kmp_uint32 r;
2501 
2502   KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2503   KMP_INIT_YIELD(spins);
2504   // main wait spin loop
2505   while (!f(r = TCR_4(*spin), check)) {
2506     KMP_FSYNC_SPIN_PREPARE(obj);
2507     /* GEH - remove this since it was accidentally introduced when kmp_wait was
2508        split. It causes problems with infinite recursion because of exit lock */
2509     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2510         __kmp_abort_thread(); */
2511 
2512     /* if we have waited a bit, or are oversubscribed, yield */
2513     /* pause is in the following code */
2514     KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2515     KMP_YIELD_SPIN(spins);
2516   }
2517   KMP_FSYNC_SPIN_ACQUIRED(obj);
2518   return r;
2519 }
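
/* Illustrative use of __kmp_wait_yield_4 (an assumed example, not a call site
   in this file): spin until a shared 32-bit location reaches an expected
   value, yielding when oversubscribed.

     volatile kmp_uint32 flag = 0;
     // ... another thread eventually stores 1 into flag ...
     (void)__kmp_wait_yield_4(&flag, 1, __kmp_eq_4, NULL);

   The return value is the observed contents of the location that satisfied
   the predicate. */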
2520 
2521 void __kmp_wait_yield_4_ptr(
2522     void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32),
2523     void *obj // Higher-level synchronization object, or NULL.
2524     ) {
2525   // note: we may not belong to a team at this point
2526   void *spin = spinner;
2527   kmp_uint32 check = checker;
2528   kmp_uint32 spins;
2529   kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2530 
2531   KMP_FSYNC_SPIN_INIT(obj, spin);
2532   KMP_INIT_YIELD(spins);
2533   // main wait spin loop
2534   while (!f(spin, check)) {
2535     KMP_FSYNC_SPIN_PREPARE(obj);
2536     /* if we have waited a bit, or are oversubscribed, yield */
2537     /* pause is in the following code */
2538     KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2539     KMP_YIELD_SPIN(spins);
2540   }
2541   KMP_FSYNC_SPIN_ACQUIRED(obj);
2542 }
2543 
2544 } // extern "C"
2545 
2546 #ifdef KMP_GOMP_COMPAT
2547 
2548 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2549                                enum sched_type schedule, kmp_int32 lb,
2550                                kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2551                                int push_ws) {
2552   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2553                                  push_ws);
2554 }
2555 
2556 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2557                                 enum sched_type schedule, kmp_uint32 lb,
2558                                 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2559                                 int push_ws) {
2560   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2561                                   push_ws);
2562 }
2563 
2564 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2565                                enum sched_type schedule, kmp_int64 lb,
2566                                kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2567                                int push_ws) {
2568   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2569                                  push_ws);
2570 }
2571 
2572 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2573                                 enum sched_type schedule, kmp_uint64 lb,
2574                                 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2575                                 int push_ws) {
2576   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2577                                   push_ws);
2578 }
2579 
2580 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2581   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2582 }
2583 
2584 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2585   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2586 }
2587 
2588 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2589   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2590 }
2591 
2592 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2593   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2594 }
2595 
2596 #endif /* KMP_GOMP_COMPAT */
2597 
2598 /* ------------------------------------------------------------------------ */
2599