1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 /* Dynamic scheduling initialization and dispatch.
14  *
15  * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
16  *       it may change values between parallel regions.  __kmp_max_nth
17  *       is the largest value __kmp_nth may take, 1 is the smallest.
18  */
19 
20 #include "kmp.h"
21 #include "kmp_error.h"
22 #include "kmp_i18n.h"
23 #include "kmp_itt.h"
24 #include "kmp_stats.h"
25 #include "kmp_str.h"
26 #if KMP_USE_X87CONTROL
27 #include <float.h>
28 #endif
29 #include "kmp_lock.h"
30 #include "kmp_dispatch.h"
31 #if KMP_USE_HIER_SCHED
32 #include "kmp_dispatch_hier.h"
33 #endif
34 
35 #if OMPT_SUPPORT
36 #include "ompt-specific.h"
37 #endif
38 
39 /* ------------------------------------------------------------------------ */
40 /* ------------------------------------------------------------------------ */
41 
42 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43   kmp_info_t *th;
44 
45   KMP_DEBUG_ASSERT(gtid_ref);
46 
47   if (__kmp_env_consistency_check) {
48     th = __kmp_threads[*gtid_ref];
49     if (th->th.th_root->r.r_active &&
50         (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51 #if KMP_USE_DYNAMIC_LOCK
52       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53 #else
54       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55 #endif
56     }
57   }
58 }
59 
60 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61   kmp_info_t *th;
62 
63   if (__kmp_env_consistency_check) {
64     th = __kmp_threads[*gtid_ref];
65     if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66       __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67     }
68   }
69 }
70 
71 // Initialize a dispatch_private_info_template<T> buffer for a particular
// type of schedule and chunk.  The loop description is found in lb (lower bound),
73 // ub (upper bound), and st (stride).  nproc is the number of threads relevant
74 // to the scheduling (often the number of threads in a team, but not always if
75 // hierarchical scheduling is used).  tid is the id of the thread calling
76 // the function within the group of nproc threads.  It will have a value
77 // between 0 and nproc - 1.  This is often just the thread id within a team, but
78 // is not necessarily the case when using hierarchical scheduling.
79 // loc is the source file location of the corresponding loop
80 // gtid is the global thread id
81 template <typename T>
82 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
83                                    dispatch_private_info_template<T> *pr,
84                                    enum sched_type schedule, T lb, T ub,
85                                    typename traits_t<T>::signed_t st,
86 #if USE_ITT_BUILD
87                                    kmp_uint64 *cur_chunk,
88 #endif
89                                    typename traits_t<T>::signed_t chunk,
90                                    T nproc, T tid) {
91   typedef typename traits_t<T>::unsigned_t UT;
92   typedef typename traits_t<T>::floating_t DBL;
93 
94   int active;
95   T tc;
96   kmp_info_t *th;
97   kmp_team_t *team;
98 
99 #ifdef KMP_DEBUG
100   typedef typename traits_t<T>::signed_t ST;
101   {
102     char *buff;
103     // create format specifiers before the debug output
104     buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
105                             "pr:%%p lb:%%%s ub:%%%s st:%%%s "
106                             "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
107                             traits_t<T>::spec, traits_t<T>::spec,
108                             traits_t<ST>::spec, traits_t<ST>::spec,
109                             traits_t<T>::spec, traits_t<T>::spec);
110     KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
111     __kmp_str_free(&buff);
112   }
113 #endif
114   /* setup data */
115   th = __kmp_threads[gtid];
116   team = th->th.th_team;
117   active = !team->t.t_serialized;
118 
119 #if USE_ITT_BUILD
120   int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
121                                     __kmp_forkjoin_frames_mode == 3 &&
122                                     KMP_MASTER_GTID(gtid) &&
123 #if OMP_40_ENABLED
124                                     th->th.th_teams_microtask == NULL &&
125 #endif
126                                     team->t.t_active_level == 1;
127 #endif
128 #if (KMP_STATIC_STEAL_ENABLED)
129   if (SCHEDULE_HAS_NONMONOTONIC(schedule))
130     // AC: we now have only one implementation of stealing, so use it
131     schedule = kmp_sch_static_steal;
132   else
133 #endif
134     schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
135 
136   /* Pick up the nomerge/ordered bits from the scheduling type */
137   if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
138     pr->flags.nomerge = TRUE;
139     schedule =
140         (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
141   } else {
142     pr->flags.nomerge = FALSE;
143   }
144   pr->type_size = traits_t<T>::type_size; // remember the size of variables
145   if (kmp_ord_lower & schedule) {
146     pr->flags.ordered = TRUE;
147     schedule =
148         (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
149   } else {
150     pr->flags.ordered = FALSE;
151   }
152 
153   if (schedule == kmp_sch_static) {
154     schedule = __kmp_static;
155   } else {
156     if (schedule == kmp_sch_runtime) {
157       // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
158       // not specified)
159       schedule = team->t.t_sched.r_sched_type;
160       // Detail the schedule if needed (global controls are differentiated
161       // appropriately)
162       if (schedule == kmp_sch_guided_chunked) {
163         schedule = __kmp_guided;
164       } else if (schedule == kmp_sch_static) {
165         schedule = __kmp_static;
166       }
167       // Use the chunk size specified by OMP_SCHEDULE (or default if not
168       // specified)
169       chunk = team->t.t_sched.chunk;
170 #if USE_ITT_BUILD
171       if (cur_chunk)
172         *cur_chunk = chunk;
173 #endif
174 #ifdef KMP_DEBUG
175       {
176         char *buff;
177         // create format specifiers before the debug output
178         buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
179                                 "schedule:%%d chunk:%%%s\n",
180                                 traits_t<ST>::spec);
181         KD_TRACE(10, (buff, gtid, schedule, chunk));
182         __kmp_str_free(&buff);
183       }
184 #endif
185     } else {
186       if (schedule == kmp_sch_guided_chunked) {
187         schedule = __kmp_guided;
188       }
189       if (chunk <= 0) {
190         chunk = KMP_DEFAULT_CHUNK;
191       }
192     }
193 
194     if (schedule == kmp_sch_auto) {
195       // mapping and differentiation: in the __kmp_do_serial_initialize()
196       schedule = __kmp_auto;
197 #ifdef KMP_DEBUG
198       {
199         char *buff;
200         // create format specifiers before the debug output
201         buff = __kmp_str_format(
202             "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
203             "schedule:%%d chunk:%%%s\n",
204             traits_t<ST>::spec);
205         KD_TRACE(10, (buff, gtid, schedule, chunk));
206         __kmp_str_free(&buff);
207       }
208 #endif
209     }
210 
211     /* guided analytical not safe for too many threads */
212     if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
213       schedule = kmp_sch_guided_iterative_chunked;
214       KMP_WARNING(DispatchManyThreads);
215     }
216 #if OMP_45_ENABLED
217     if (schedule == kmp_sch_runtime_simd) {
218       // compiler provides simd_width in the chunk parameter
219       schedule = team->t.t_sched.r_sched_type;
220       // Detail the schedule if needed (global controls are differentiated
221       // appropriately)
222       if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
223           schedule == __kmp_static) {
224         schedule = kmp_sch_static_balanced_chunked;
225       } else {
226         if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
227           schedule = kmp_sch_guided_simd;
228         }
229         chunk = team->t.t_sched.chunk * chunk;
230       }
231 #if USE_ITT_BUILD
232       if (cur_chunk)
233         *cur_chunk = chunk;
234 #endif
235 #ifdef KMP_DEBUG
236       {
237         char *buff;
238         // create format specifiers before the debug output
239         buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d"
240                                 " chunk:%%%s\n",
241                                 traits_t<ST>::spec);
242         KD_TRACE(10, (buff, gtid, schedule, chunk));
243         __kmp_str_free(&buff);
244       }
245 #endif
246     }
247 #endif // OMP_45_ENABLED
248     pr->u.p.parm1 = chunk;
249   }
250   KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
251               "unknown scheduling type");
252 
253   pr->u.p.count = 0;
254 
255   if (__kmp_env_consistency_check) {
256     if (st == 0) {
257       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
258                             (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
259     }
260   }
261   // compute trip count
262   if (st == 1) { // most common case
263     if (ub >= lb) {
264       tc = ub - lb + 1;
265     } else { // ub < lb
266       tc = 0; // zero-trip
267     }
268   } else if (st < 0) {
269     if (lb >= ub) {
270       // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
271       // where the division needs to be unsigned regardless of the result type
272       tc = (UT)(lb - ub) / (-st) + 1;
273     } else { // lb < ub
274       tc = 0; // zero-trip
275     }
276   } else { // st > 0
277     if (ub >= lb) {
278       // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
279       // where the division needs to be unsigned regardless of the result type
280       tc = (UT)(ub - lb) / st + 1;
281     } else { // ub < lb
282       tc = 0; // zero-trip
283     }
284   }
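  // For illustration of the formulas above (example values only): with lb=0,
  // ub=9, st=2 the loop runs over 0,2,4,6,8, so tc = (9-0)/2 + 1 = 5; with
  // lb=10, ub=1, st=-3 it runs over 10,7,4,1, so tc = (10-1)/3 + 1 = 4. The
  // unsigned cast matters when ub - lb does not fit in the signed type T.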
285 
286   pr->u.p.lb = lb;
287   pr->u.p.ub = ub;
288   pr->u.p.st = st;
289   pr->u.p.tc = tc;
290 
291 #if KMP_OS_WINDOWS
292   pr->u.p.last_upper = ub + st;
293 #endif /* KMP_OS_WINDOWS */
294 
/* NOTE: only the active parallel region(s) have active ordered sections */
296 
297   if (active) {
298     if (pr->flags.ordered) {
299       pr->ordered_bumped = 0;
300       pr->u.p.ordered_lower = 1;
301       pr->u.p.ordered_upper = 0;
302     }
303   }
304 
305   switch (schedule) {
306 #if (KMP_STATIC_STEAL_ENABLED)
307   case kmp_sch_static_steal: {
308     T ntc, init;
309 
310     KD_TRACE(100,
311              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
312               gtid));
313 
314     ntc = (tc % chunk ? 1 : 0) + tc / chunk;
315     if (nproc > 1 && ntc >= nproc) {
316       KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
317       T id = tid;
318       T small_chunk, extras;
319 
320       small_chunk = ntc / nproc;
321       extras = ntc % nproc;
322 
323       init = id * small_chunk + (id < extras ? id : extras);
324       pr->u.p.count = init;
325       pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
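      // Sketch of the partitioning above (example values only): with tc=100
      // and chunk=7 there are ntc=15 chunks; for nproc=4 this gives
      // small_chunk=3 and extras=3, so threads 0..3 initially own the chunk
      // ranges [0,4), [4,8), [8,12) and [12,15) via (count, ub).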
326 
327       pr->u.p.parm2 = lb;
328       // pr->pfields.parm3 = 0; // it's not used in static_steal
329       pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
330       pr->u.p.st = st;
331       if (traits_t<T>::type_size > 4) {
332         // AC: TODO: check if 16-byte CAS available and use it to
333         // improve performance (probably wait for explicit request
334         // before spending time on this).
335         // For now use dynamically allocated per-thread lock,
336         // free memory in __kmp_dispatch_next when status==0.
337         KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
338         th->th.th_dispatch->th_steal_lock =
339             (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
340         __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
341       }
342       break;
343     } else {
344       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
345                      "kmp_sch_static_balanced\n",
346                      gtid));
347       schedule = kmp_sch_static_balanced;
348       /* too few iterations: fall-through to kmp_sch_static_balanced */
349     } // if
350     /* FALL-THROUGH to static balanced */
351     KMP_FALLTHROUGH();
352   } // case
353 #endif
354   case kmp_sch_static_balanced: {
355     T init, limit;
356 
357     KD_TRACE(
358         100,
359         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
360          gtid));
361 
362     if (nproc > 1) {
363       T id = tid;
364 
365       if (tc < nproc) {
366         if (id < tc) {
367           init = id;
368           limit = id;
369           pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
370         } else {
371           pr->u.p.count = 1; /* means no more chunks to execute */
372           pr->u.p.parm1 = FALSE;
373           break;
374         }
375       } else {
376         T small_chunk = tc / nproc;
377         T extras = tc % nproc;
378         init = id * small_chunk + (id < extras ? id : extras);
379         limit = init + small_chunk - (id < extras ? 0 : 1);
380         pr->u.p.parm1 = (id == nproc - 1);
381       }
382     } else {
383       if (tc > 0) {
384         init = 0;
385         limit = tc - 1;
386         pr->u.p.parm1 = TRUE;
387       } else {
388         // zero trip count
389         pr->u.p.count = 1; /* means no more chunks to execute */
390         pr->u.p.parm1 = FALSE;
391         break;
392       }
393     }
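    // Worked example of the split above (illustrative numbers): with tc=10
    // and nproc=4, small_chunk=2 and extras=2, so threads 0..3 get the
    // iteration ranges [0,2], [3,5], [6,7] and [8,9]; only thread 3 gets
    // parm1 == TRUE (the *plastiter flag).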
394 #if USE_ITT_BUILD
395     // Calculate chunk for metadata report
396     if (itt_need_metadata_reporting)
397       if (cur_chunk)
398         *cur_chunk = limit - init + 1;
399 #endif
400     if (st == 1) {
401       pr->u.p.lb = lb + init;
402       pr->u.p.ub = lb + limit;
403     } else {
      // ub_tmp is the computed upper bound; "ub" is the user-specified one
405       T ub_tmp = lb + limit * st;
406       pr->u.p.lb = lb + init * st;
407       // adjust upper bound to "ub" if needed, so that MS lastprivate will match
408       // it exactly
409       if (st > 0) {
410         pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
411       } else {
412         pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
413       }
414     }
415     if (pr->flags.ordered) {
416       pr->u.p.ordered_lower = init;
417       pr->u.p.ordered_upper = limit;
418     }
419     break;
420   } // case
421 #if OMP_45_ENABLED
422   case kmp_sch_static_balanced_chunked: {
423     // similar to balanced, but chunk adjusted to multiple of simd width
424     T nth = nproc;
425     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
426                    " -> falling-through to static_greedy\n",
427                    gtid));
428     schedule = kmp_sch_static_greedy;
429     if (nth > 1)
430       pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
431     else
432       pr->u.p.parm1 = tc;
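    // The expression above rounds each thread's share up to a multiple of
    // the simd width passed in 'chunk' (assumed to be a power of two), e.g.
    // tc=100, nth=4, chunk=8: ceil(100/4) = 25 is rounded up to 32.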
433     break;
434   } // case
435   case kmp_sch_guided_simd:
436 #endif // OMP_45_ENABLED
437   case kmp_sch_guided_iterative_chunked: {
438     KD_TRACE(
439         100,
440         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
441          " case\n",
442          gtid));
443 
444     if (nproc > 1) {
445       if ((2L * chunk + 1) * nproc >= tc) {
446         /* chunk size too large, switch to dynamic */
447         schedule = kmp_sch_dynamic_chunked;
448       } else {
        // when remaining iterations drop below parm2, switch to dynamic
450         pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
451         *(double *)&pr->u.p.parm3 =
452             guided_flt_param / nproc; // may occupy parm3 and parm4
453       }
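      // Rough illustration (example values, assuming the defaults
      // guided_int_param == 2 and guided_flt_param == 0.5): for a large trip
      // count with nproc=8 and chunk=1, parm2 = 2*8*2 = 32 and
      // parm3 = 0.5/8 = 0.0625, so each dispatch grabs about 1/16 of the
      // remaining iterations until fewer than 32 remain, after which plain
      // dynamic chunks of size 1 are handed out.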
454     } else {
455       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
456                      "kmp_sch_static_greedy\n",
457                      gtid));
458       schedule = kmp_sch_static_greedy;
459       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
460       KD_TRACE(
461           100,
462           ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
463            gtid));
464       pr->u.p.parm1 = tc;
465     } // if
466   } // case
467   break;
468   case kmp_sch_guided_analytical_chunked: {
469     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
470                    "kmp_sch_guided_analytical_chunked case\n",
471                    gtid));
472 
473     if (nproc > 1) {
474       if ((2L * chunk + 1) * nproc >= tc) {
475         /* chunk size too large, switch to dynamic */
476         schedule = kmp_sch_dynamic_chunked;
477       } else {
478         /* commonly used term: (2 nproc - 1)/(2 nproc) */
479         DBL x;
480 
481 #if KMP_USE_X87CONTROL
482         /* Linux* OS already has 64-bit computation by default for long double,
483            and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
484            Windows* OS on IA-32 architecture, we need to set precision to 64-bit
485            instead of the default 53-bit. Even though long double doesn't work
486            on Windows* OS on Intel(R) 64, the resulting lack of precision is not
487            expected to impact the correctness of the algorithm, but this has not
488            been mathematically proven. */
489         // save original FPCW and set precision to 64-bit, as
490         // Windows* OS on IA-32 architecture defaults to 53-bit
491         unsigned int oldFpcw = _control87(0, 0);
492         _control87(_PC_64, _MCW_PC); // 0,0x30000
493 #endif
494         /* value used for comparison in solver for cross-over point */
495         long double target = ((long double)chunk * 2 + 1) * nproc / tc;
496 
497         /* crossover point--chunk indexes equal to or greater than
498            this point switch to dynamic-style scheduling */
499         UT cross;
500 
501         /* commonly used term: (2 nproc - 1)/(2 nproc) */
502         x = (long double)1.0 - (long double)0.5 / nproc;
503 
504 #ifdef KMP_DEBUG
505         { // test natural alignment
506           struct _test_a {
507             char a;
508             union {
509               char b;
510               DBL d;
511             };
512           } t;
513           ptrdiff_t natural_alignment =
514               (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
515           //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
516           // long)natural_alignment );
517           KMP_DEBUG_ASSERT(
518               (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
519         }
520 #endif // KMP_DEBUG
521 
522         /* save the term in thread private dispatch structure */
523         *(DBL *)&pr->u.p.parm3 = x;
524 
525         /* solve for the crossover point to the nearest integer i for which C_i
526            <= chunk */
527         {
528           UT left, right, mid;
529           long double p;
530 
531           /* estimate initial upper and lower bound */
532 
533           /* doesn't matter what value right is as long as it is positive, but
534              it affects performance of the solver */
535           right = 229;
536           p = __kmp_pow<UT>(x, right);
537           if (p > target) {
538             do {
539               p *= p;
540               right <<= 1;
541             } while (p > target && right < (1 << 27));
542             /* lower bound is previous (failed) estimate of upper bound */
543             left = right >> 1;
544           } else {
545             left = 0;
546           }
547 
548           /* bisection root-finding method */
549           while (left + 1 < right) {
550             mid = (left + right) / 2;
551             if (__kmp_pow<UT>(x, mid) > target) {
552               left = mid;
553             } else {
554               right = mid;
555             }
556           } // while
557           cross = right;
558         }
559         /* assert sanity of computed crossover point */
560         KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
561                    __kmp_pow<UT>(x, cross) <= target);
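        // Rough numeric illustration (example values only): for nproc=4
        // (x = 0.875), chunk=100 and tc=100000, target is about 0.008 and
        // the bisection finds cross near 37, i.e. after roughly 37 guided
        // chunks the rest is handed out in plain chunks of 100 iterations.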
562 
563         /* save the crossover point in thread private dispatch structure */
564         pr->u.p.parm2 = cross;
565 
566 // C75803
567 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
568 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
569 #else
570 #define GUIDED_ANALYTICAL_WORKAROUND (x)
571 #endif
572         /* dynamic-style scheduling offset */
573         pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
574                                  tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
575                         cross * chunk;
576 #if KMP_USE_X87CONTROL
577         // restore FPCW
578         _control87(oldFpcw, _MCW_PC);
579 #endif
580       } // if
581     } else {
582       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
583                      "kmp_sch_static_greedy\n",
584                      gtid));
585       schedule = kmp_sch_static_greedy;
586       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
587       pr->u.p.parm1 = tc;
588     } // if
589   } // case
590   break;
591   case kmp_sch_static_greedy:
592     KD_TRACE(
593         100,
594         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
595          gtid));
596     pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
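    // i.e. one chunk of ceil(tc/nproc) iterations per thread; e.g.
    // (illustrative) tc=10, nproc=4 gives parm1=3, so threads cover 0..2,
    // 3..5, 6..8 and 9.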
597     break;
598   case kmp_sch_static_chunked:
599   case kmp_sch_dynamic_chunked:
600     if (pr->u.p.parm1 <= 0) {
601       pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
602     }
603     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
604                    "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
605                    gtid));
606     break;
607   case kmp_sch_trapezoidal: {
608     /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
609 
610     T parm1, parm2, parm3, parm4;
611     KD_TRACE(100,
612              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
613               gtid));
614 
615     parm1 = chunk;
616 
617     /* F : size of the first cycle */
618     parm2 = (tc / (2 * nproc));
619 
620     if (parm2 < 1) {
621       parm2 = 1;
622     }
623 
624     /* L : size of the last cycle.  Make sure the last cycle is not larger
625        than the first cycle. */
626     if (parm1 < 1) {
627       parm1 = 1;
628     } else if (parm1 > parm2) {
629       parm1 = parm2;
630     }
631 
632     /* N : number of cycles */
633     parm3 = (parm2 + parm1);
634     parm3 = (2 * tc + parm3 - 1) / parm3;
635 
636     if (parm3 < 2) {
637       parm3 = 2;
638     }
639 
640     /* sigma : decreasing incr of the trapezoid */
641     parm4 = (parm3 - 1);
642     parm4 = (parm2 - parm1) / parm4;
643 
644     // pointless check, because parm4 >= 0 always
645     // if ( parm4 < 0 ) {
646     //    parm4 = 0;
647     //}
648 
649     pr->u.p.parm1 = parm1;
650     pr->u.p.parm2 = parm2;
651     pr->u.p.parm3 = parm3;
652     pr->u.p.parm4 = parm4;
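    // Trapezoid sketch with example numbers: tc=200, nproc=2, chunk=4 gives
    // parm2 (first chunk) = 200/4 = 50, parm1 (last chunk) = 4, parm3
    // (number of chunks) = (400 + 53) / 54 = 8 and parm4 (decrement) =
    // (50 - 4) / 7 = 6, i.e. chunk sizes 50, 44, 38, ..., 8 covering at
    // least the 200 iterations.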
653   } // case
654   break;
655 
656   default: {
657     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
658                 KMP_HNT(GetNewerLibrary), // Hint
659                 __kmp_msg_null // Variadic argument list terminator
660                 );
661   } break;
662   } // switch
663   pr->schedule = schedule;
664 }
665 
666 #if KMP_USE_HIER_SCHED
667 template <typename T>
668 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
669                                              typename traits_t<T>::signed_t st);
670 template <>
671 inline void
672 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
673                                             kmp_int32 ub, kmp_int32 st) {
674   __kmp_dispatch_init_hierarchy<kmp_int32>(
675       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
676       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
677 }
678 template <>
679 inline void
680 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
681                                              kmp_uint32 ub, kmp_int32 st) {
682   __kmp_dispatch_init_hierarchy<kmp_uint32>(
683       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
684       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
685 }
686 template <>
687 inline void
688 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
689                                             kmp_int64 ub, kmp_int64 st) {
690   __kmp_dispatch_init_hierarchy<kmp_int64>(
691       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
692       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
693 }
694 template <>
695 inline void
696 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
697                                              kmp_uint64 ub, kmp_int64 st) {
698   __kmp_dispatch_init_hierarchy<kmp_uint64>(
699       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
700       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
701 }
702 
703 // free all the hierarchy scheduling memory associated with the team
704 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
705   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
706   for (int i = 0; i < num_disp_buff; ++i) {
707     // type does not matter here so use kmp_int32
708     auto sh =
709         reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
710             &team->t.t_disp_buffer[i]);
711     if (sh->hier) {
712       sh->hier->deallocate();
713       __kmp_free(sh->hier);
714     }
715   }
716 }
717 #endif
718 
719 // UT - unsigned flavor of T, ST - signed flavor of T,
720 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
721 template <typename T>
722 static void
723 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
724                     T ub, typename traits_t<T>::signed_t st,
725                     typename traits_t<T>::signed_t chunk, int push_ws) {
726   typedef typename traits_t<T>::unsigned_t UT;
727 
728   int active;
729   kmp_info_t *th;
730   kmp_team_t *team;
731   kmp_uint32 my_buffer_index;
732   dispatch_private_info_template<T> *pr;
733   dispatch_shared_info_template<T> volatile *sh;
734 
735   KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
736                    sizeof(dispatch_private_info));
737   KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
738                    sizeof(dispatch_shared_info));
739 
740   if (!TCR_4(__kmp_init_parallel))
741     __kmp_parallel_initialize();
742 
743 #if OMP_50_ENABLED
744   __kmp_resume_if_soft_paused();
745 #endif
746 
747 #if INCLUDE_SSC_MARKS
748   SSC_MARK_DISPATCH_INIT();
749 #endif
750 #ifdef KMP_DEBUG
751   typedef typename traits_t<T>::signed_t ST;
752   {
753     char *buff;
754     // create format specifiers before the debug output
755     buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
756                             "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
757                             traits_t<ST>::spec, traits_t<T>::spec,
758                             traits_t<T>::spec, traits_t<ST>::spec);
759     KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
760     __kmp_str_free(&buff);
761   }
762 #endif
763   /* setup data */
764   th = __kmp_threads[gtid];
765   team = th->th.th_team;
766   active = !team->t.t_serialized;
767   th->th.th_ident = loc;
768 
  // Any half-decent optimizer will remove this test when the blocks are
  // empty, since the macros expand to nothing when statistics are disabled.
772   if (schedule == __kmp_static) {
773     KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
774   } else {
775     KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
776   }
777 
778 #if KMP_USE_HIER_SCHED
  // Initialize the scheduling hierarchy if requested in the OMP_SCHEDULE
  // environment variable. Hierarchical scheduling does not work with ordered,
  // so if ordered is detected, revert to threaded scheduling.
782   bool ordered;
783   enum sched_type my_sched = schedule;
784   my_buffer_index = th->th.th_dispatch->th_disp_index;
785   pr = reinterpret_cast<dispatch_private_info_template<T> *>(
786       &th->th.th_dispatch
787            ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
788   my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
789   if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
790     my_sched =
791         (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
792   ordered = (kmp_ord_lower & my_sched);
793   if (pr->flags.use_hier) {
794     if (ordered) {
795       KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected.  "
796                      "Disabling hierarchical scheduling.\n",
797                      gtid));
798       pr->flags.use_hier = FALSE;
799     }
800   }
801   if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
802     // Don't use hierarchical for ordered parallel loops and don't
803     // use the runtime hierarchy if one was specified in the program
804     if (!ordered && !pr->flags.use_hier)
805       __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
806   }
807 #endif // KMP_USE_HIER_SCHED
808 
809 #if USE_ITT_BUILD
810   kmp_uint64 cur_chunk = chunk;
811   int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
812                                     __kmp_forkjoin_frames_mode == 3 &&
813                                     KMP_MASTER_GTID(gtid) &&
814 #if OMP_40_ENABLED
815                                     th->th.th_teams_microtask == NULL &&
816 #endif
817                                     team->t.t_active_level == 1;
818 #endif
819   if (!active) {
820     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
821         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
822   } else {
823     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
824                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
825 
826     my_buffer_index = th->th.th_dispatch->th_disp_index++;
827 
828     /* What happens when number of threads changes, need to resize buffer? */
829     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
830         &th->th.th_dispatch
831              ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
832     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
833         &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
834     KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
835                   my_buffer_index));
836   }
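  // Dispatch buffers are reused round-robin: loop number n uses slot
  // n % __kmp_dispatch_num_buffers, and the wait on sh->buffer_index below
  // keeps a slot from being reinitialized before the loop that last used it
  // has been finished by all threads.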
837 
838   __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
839 #if USE_ITT_BUILD
840                                 &cur_chunk,
841 #endif
842                                 chunk, (T)th->th.th_team_nproc,
843                                 (T)th->th.th_info.ds.ds_tid);
844   if (active) {
845     if (pr->flags.ordered == 0) {
846       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
847       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
848     } else {
849       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
850       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
851     }
852   }
853 
854   if (active) {
855     /* The name of this buffer should be my_buffer_index when it's free to use
856      * it */
857 
858     KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
859                    "sh->buffer_index:%d\n",
860                    gtid, my_buffer_index, sh->buffer_index));
861     __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
862                            __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
863     // Note: KMP_WAIT() cannot be used there: buffer index and
864     // my_buffer_index are *always* 32-bit integers.
865     KMP_MB(); /* is this necessary? */
866     KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
867                    "sh->buffer_index:%d\n",
868                    gtid, my_buffer_index, sh->buffer_index));
869 
870     th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
871     th->th.th_dispatch->th_dispatch_sh_current =
872         CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
873 #if USE_ITT_BUILD
874     if (pr->flags.ordered) {
875       __kmp_itt_ordered_init(gtid);
876     }
877     // Report loop metadata
878     if (itt_need_metadata_reporting) {
879       // Only report metadata by master of active team at level 1
880       kmp_uint64 schedtype = 0;
881       switch (schedule) {
882       case kmp_sch_static_chunked:
883       case kmp_sch_static_balanced: // Chunk is calculated in the switch above
884         break;
885       case kmp_sch_static_greedy:
886         cur_chunk = pr->u.p.parm1;
887         break;
888       case kmp_sch_dynamic_chunked:
889         schedtype = 1;
890         break;
891       case kmp_sch_guided_iterative_chunked:
892       case kmp_sch_guided_analytical_chunked:
893 #if OMP_45_ENABLED
894       case kmp_sch_guided_simd:
895 #endif
896         schedtype = 2;
897         break;
898       default:
899         // Should we put this case under "static"?
900         // case kmp_sch_static_steal:
901         schedtype = 3;
902         break;
903       }
904       __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
905     }
906 #if KMP_USE_HIER_SCHED
907     if (pr->flags.use_hier) {
908       pr->u.p.count = 0;
909       pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
910     }
#endif // KMP_USE_HIER_SCHED
912 #endif /* USE_ITT_BUILD */
913   }
914 
915 #ifdef KMP_DEBUG
916   {
917     char *buff;
918     // create format specifiers before the debug output
919     buff = __kmp_str_format(
920         "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
921         "lb:%%%s ub:%%%s"
922         " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
923         " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
924         traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
925         traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
926         traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
927         traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
928     KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
929                   pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
930                   pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
931                   pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
932     __kmp_str_free(&buff);
933   }
934 #endif
935 #if (KMP_STATIC_STEAL_ENABLED)
  // It cannot be guaranteed that after execution of a loop with some other
  // schedule kind all the parm3 variables will contain the same value. Even
  // if they did, there would still be a bad case, such as toggling between
  // 0 and 1 rather than a program-lifetime increment. So a dedicated
  // variable is required; 'static_steal_counter' serves this purpose.
941   if (schedule == kmp_sch_static_steal) {
    // Other threads will inspect this variable when searching for a victim.
    // It is a flag indicating that other threads may steal from this thread
    // from now on.
945     volatile T *p = &pr->u.p.static_steal_counter;
946     *p = *p + 1;
947   }
948 #endif // ( KMP_STATIC_STEAL_ENABLED )
949 
950 #if OMPT_SUPPORT && OMPT_OPTIONAL
951   if (ompt_enabled.ompt_callback_work) {
952     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
953     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
954     ompt_callbacks.ompt_callback(ompt_callback_work)(
955         ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
956         &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
957   }
958 #endif
959   KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
960 }
961 
962 /* For ordered loops, either __kmp_dispatch_finish() should be called after
963  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
964  * every chunk of iterations.  If the ordered section(s) were not executed
965  * for this iteration (or every iteration in this chunk), we need to set the
966  * ordered iteration counters so that the next thread can proceed. */
967 template <typename UT>
968 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
969   typedef typename traits_t<UT>::signed_t ST;
970   kmp_info_t *th = __kmp_threads[gtid];
971 
972   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
973   if (!th->th.th_team->t.t_serialized) {
974 
975     dispatch_private_info_template<UT> *pr =
976         reinterpret_cast<dispatch_private_info_template<UT> *>(
977             th->th.th_dispatch->th_dispatch_pr_current);
978     dispatch_shared_info_template<UT> volatile *sh =
979         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
980             th->th.th_dispatch->th_dispatch_sh_current);
981     KMP_DEBUG_ASSERT(pr);
982     KMP_DEBUG_ASSERT(sh);
983     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
984                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
985 
986     if (pr->ordered_bumped) {
987       KD_TRACE(
988           1000,
989           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
990            gtid));
991       pr->ordered_bumped = 0;
992     } else {
993       UT lower = pr->u.p.ordered_lower;
994 
995 #ifdef KMP_DEBUG
996       {
997         char *buff;
998         // create format specifiers before the debug output
999         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1000                                 "ordered_iteration:%%%s lower:%%%s\n",
1001                                 traits_t<UT>::spec, traits_t<UT>::spec);
1002         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1003         __kmp_str_free(&buff);
1004       }
1005 #endif
1006 
1007       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1008                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1009       KMP_MB(); /* is this necessary? */
1010 #ifdef KMP_DEBUG
1011       {
1012         char *buff;
1013         // create format specifiers before the debug output
1014         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1015                                 "ordered_iteration:%%%s lower:%%%s\n",
1016                                 traits_t<UT>::spec, traits_t<UT>::spec);
1017         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1018         __kmp_str_free(&buff);
1019       }
1020 #endif
1021 
1022       test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1023     } // if
1024   } // if
1025   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1026 }
1027 
1028 #ifdef KMP_GOMP_COMPAT
1029 
1030 template <typename UT>
1031 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1032   typedef typename traits_t<UT>::signed_t ST;
1033   kmp_info_t *th = __kmp_threads[gtid];
1034 
1035   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1036   if (!th->th.th_team->t.t_serialized) {
1037     //        int cid;
1038     dispatch_private_info_template<UT> *pr =
1039         reinterpret_cast<dispatch_private_info_template<UT> *>(
1040             th->th.th_dispatch->th_dispatch_pr_current);
1041     dispatch_shared_info_template<UT> volatile *sh =
1042         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1043             th->th.th_dispatch->th_dispatch_sh_current);
1044     KMP_DEBUG_ASSERT(pr);
1045     KMP_DEBUG_ASSERT(sh);
1046     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1047                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1048 
1049     //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1050     UT lower = pr->u.p.ordered_lower;
1051     UT upper = pr->u.p.ordered_upper;
1052     UT inc = upper - lower + 1;
1053 
1054     if (pr->ordered_bumped == inc) {
1055       KD_TRACE(
1056           1000,
1057           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1058            gtid));
1059       pr->ordered_bumped = 0;
1060     } else {
1061       inc -= pr->ordered_bumped;
1062 
1063 #ifdef KMP_DEBUG
1064       {
1065         char *buff;
1066         // create format specifiers before the debug output
1067         buff = __kmp_str_format(
1068             "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1069             "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1070             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1071         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1072         __kmp_str_free(&buff);
1073       }
1074 #endif
1075 
1076       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1077                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1078 
1079       KMP_MB(); /* is this necessary? */
1080       KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1081                       "ordered_bumped to zero\n",
1082                       gtid));
1083       pr->ordered_bumped = 0;
// TODO: check whether inc should be unsigned or signed
1085 #ifdef KMP_DEBUG
1086       {
1087         char *buff;
1088         // create format specifiers before the debug output
1089         buff = __kmp_str_format(
1090             "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1091             "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1092             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1093             traits_t<UT>::spec);
1094         KD_TRACE(1000,
1095                  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1096         __kmp_str_free(&buff);
1097       }
1098 #endif
1099 
1100       test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1101     }
1102     //        }
1103   }
1104   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1105 }
1106 
1107 #endif /* KMP_GOMP_COMPAT */
1108 
1109 template <typename T>
1110 int __kmp_dispatch_next_algorithm(int gtid,
1111                                   dispatch_private_info_template<T> *pr,
1112                                   dispatch_shared_info_template<T> volatile *sh,
1113                                   kmp_int32 *p_last, T *p_lb, T *p_ub,
1114                                   typename traits_t<T>::signed_t *p_st, T nproc,
1115                                   T tid) {
1116   typedef typename traits_t<T>::unsigned_t UT;
1117   typedef typename traits_t<T>::signed_t ST;
1118   typedef typename traits_t<T>::floating_t DBL;
1119   int status = 0;
1120   kmp_int32 last = 0;
1121   T start;
1122   ST incr;
1123   UT limit, trip, init;
1124   kmp_info_t *th = __kmp_threads[gtid];
1125   kmp_team_t *team = th->th.th_team;
1126 
1127   KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1128                    &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1129   KMP_DEBUG_ASSERT(pr);
1130   KMP_DEBUG_ASSERT(sh);
1131   KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1132 #ifdef KMP_DEBUG
1133   {
1134     char *buff;
1135     // create format specifiers before the debug output
1136     buff =
1137         __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1138                          "sh:%%p nproc:%%%s tid:%%%s\n",
1139                          traits_t<T>::spec, traits_t<T>::spec);
1140     KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1141     __kmp_str_free(&buff);
1142   }
1143 #endif
1144 
1145   // zero trip count
1146   if (pr->u.p.tc == 0) {
1147     KD_TRACE(10,
1148              ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1149               "zero status:%d\n",
1150               gtid, status));
1151     return 0;
1152   }
1153 
1154   switch (pr->schedule) {
1155 #if (KMP_STATIC_STEAL_ENABLED)
1156   case kmp_sch_static_steal: {
1157     T chunk = pr->u.p.parm1;
1158 
1159     KD_TRACE(100,
1160              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1161               gtid));
1162 
1163     trip = pr->u.p.tc - 1;
1164 
1165     if (traits_t<T>::type_size > 4) {
1166       // use lock for 8-byte and CAS for 4-byte induction
1167       // variable. TODO (optional): check and use 16-byte CAS
1168       kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
1169       KMP_DEBUG_ASSERT(lck != NULL);
1170       if (pr->u.p.count < (UT)pr->u.p.ub) {
1171         __kmp_acquire_lock(lck, gtid);
1172         // try to get own chunk of iterations
1173         init = (pr->u.p.count)++;
1174         status = (init < (UT)pr->u.p.ub);
1175         __kmp_release_lock(lck, gtid);
1176       } else {
1177         status = 0; // no own chunks
1178       }
1179       if (!status) { // try to steal
1180         kmp_info_t **other_threads = team->t.t_threads;
1181         int while_limit = nproc; // nproc attempts to find a victim
1182         int while_index = 0;
1183         // TODO: algorithm of searching for a victim
1184         // should be cleaned up and measured
1185         while ((!status) && (while_limit != ++while_index)) {
1186           T remaining;
1187           T victimIdx = pr->u.p.parm4;
1188           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1189           dispatch_private_info_template<T> *victim =
1190               reinterpret_cast<dispatch_private_info_template<T> *>(
1191                   other_threads[victimIdx]
1192                       ->th.th_dispatch->th_dispatch_pr_current);
1193           while ((victim == NULL || victim == pr ||
1194                   (*(volatile T *)&victim->u.p.static_steal_counter !=
1195                    *(volatile T *)&pr->u.p.static_steal_counter)) &&
1196                  oldVictimIdx != victimIdx) {
1197             victimIdx = (victimIdx + 1) % nproc;
1198             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1199                 other_threads[victimIdx]
1200                     ->th.th_dispatch->th_dispatch_pr_current);
1201           }
1202           if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1203                           *(volatile T *)&pr->u.p.static_steal_counter)) {
1204             continue; // try once more (nproc attempts in total)
1205             // no victim is ready yet to participate in stealing
1206             // because all victims are still in kmp_init_dispatch
1207           }
1208           if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1209             pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1210             continue; // not enough chunks to steal, goto next victim
1211           }
1212 
1213           lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1214           KMP_ASSERT(lck != NULL);
1215           __kmp_acquire_lock(lck, gtid);
1216           limit = victim->u.p.ub; // keep initial ub
1217           if (victim->u.p.count >= limit ||
1218               (remaining = limit - victim->u.p.count) < 2) {
1219             __kmp_release_lock(lck, gtid);
1220             pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1221             continue; // not enough chunks to steal
1222           }
          // stealing succeeded: reduce the victim's ub by 1/4 of the
          // remaining chunks, or by 1
1225           if (remaining > 3) {
1226             // steal 1/4 of remaining
1227             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1228             init = (victim->u.p.ub -= (remaining >> 2));
1229           } else {
1230             // steal 1 chunk of 2 or 3 remaining
1231             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1232             init = (victim->u.p.ub -= 1);
1233           }
1234           __kmp_release_lock(lck, gtid);
1235 
1236           KMP_DEBUG_ASSERT(init + 1 <= limit);
1237           pr->u.p.parm4 = victimIdx; // remember victim to steal from
1238           status = 1;
1239           while_index = 0;
          // now update own count and ub with the stolen range, excluding the
          // init chunk claimed by this thread
1241           __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1242           pr->u.p.count = init + 1;
1243           pr->u.p.ub = limit;
1244           __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1245         } // while (search for victim)
1246       } // if (try to find victim and steal)
1247     } else {
1248       // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1249       typedef union {
1250         struct {
1251           UT count;
1252           T ub;
1253         } p;
1254         kmp_int64 b;
1255       } union_i4;
1256       // All operations on 'count' or 'ub' must be combined atomically
1257       // together.
1258       {
1259         union_i4 vold, vnew;
1260         vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1261         vnew = vold;
1262         vnew.p.count++;
1263         while (!KMP_COMPARE_AND_STORE_ACQ64(
1264             (volatile kmp_int64 *)&pr->u.p.count,
1265             *VOLATILE_CAST(kmp_int64 *) & vold.b,
1266             *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1267           KMP_CPU_PAUSE();
1268           vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1269           vnew = vold;
1270           vnew.p.count++;
1271         }
1272         vnew = vold;
1273         init = vnew.p.count;
1274         status = (init < (UT)vnew.p.ub);
1275       }
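      // Packing (count, ub) into one 64-bit word lets the owner bump 'count'
      // and a thief lower 'ub' with a single CAS each, so neither side can
      // observe a torn (count, ub) pair; e.g. an owner moving count 5->6 and
      // a thief moving ub 10->8 serialize on the same word.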
1276 
1277       if (!status) {
1278         kmp_info_t **other_threads = team->t.t_threads;
1279         int while_limit = nproc; // nproc attempts to find a victim
1280         int while_index = 0;
1281 
1282         // TODO: algorithm of searching for a victim
1283         // should be cleaned up and measured
1284         while ((!status) && (while_limit != ++while_index)) {
1285           union_i4 vold, vnew;
1286           kmp_int32 remaining;
1287           T victimIdx = pr->u.p.parm4;
1288           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1289           dispatch_private_info_template<T> *victim =
1290               reinterpret_cast<dispatch_private_info_template<T> *>(
1291                   other_threads[victimIdx]
1292                       ->th.th_dispatch->th_dispatch_pr_current);
1293           while ((victim == NULL || victim == pr ||
1294                   (*(volatile T *)&victim->u.p.static_steal_counter !=
1295                    *(volatile T *)&pr->u.p.static_steal_counter)) &&
1296                  oldVictimIdx != victimIdx) {
1297             victimIdx = (victimIdx + 1) % nproc;
1298             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1299                 other_threads[victimIdx]
1300                     ->th.th_dispatch->th_dispatch_pr_current);
1301           }
1302           if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1303                           *(volatile T *)&pr->u.p.static_steal_counter)) {
1304             continue; // try once more (nproc attempts in total)
1305             // no victim is ready yet to participate in stealing
1306             // because all victims are still in kmp_init_dispatch
1307           }
1308           pr->u.p.parm4 = victimIdx; // new victim found
1309           while (1) { // CAS loop if victim has enough chunks to steal
1310             vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1311             vnew = vold;
1312 
1313             KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1314             if (vnew.p.count >= (UT)vnew.p.ub ||
1315                 (remaining = vnew.p.ub - vnew.p.count) < 2) {
1316               pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1317               break; // not enough chunks to steal, goto next victim
1318             }
1319             if (remaining > 3) {
1320               vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
1321             } else {
1322               vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1323             }
1324             KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1325             // TODO: Should this be acquire or release?
1326             if (KMP_COMPARE_AND_STORE_ACQ64(
1327                     (volatile kmp_int64 *)&victim->u.p.count,
1328                     *VOLATILE_CAST(kmp_int64 *) & vold.b,
1329                     *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
              // stealing succeeded
1331               KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1332                                         vold.p.ub - vnew.p.ub);
1333               status = 1;
1334               while_index = 0;
1335               // now update own count and ub
1336               init = vnew.p.ub;
1337               vold.p.count = init + 1;
1338 #if KMP_ARCH_X86
1339               KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1340 #else
1341               *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1342 #endif
1343               break;
1344             } // if (check CAS result)
            KMP_CPU_PAUSE(); // CAS failed, repeat the attempt
1346           } // while (try to steal from particular victim)
1347         } // while (search for victim)
1348       } // if (try to find victim and steal)
1349     } // if (4-byte induction variable)
1350     if (!status) {
1351       *p_lb = 0;
1352       *p_ub = 0;
1353       if (p_st != NULL)
1354         *p_st = 0;
1355     } else {
1356       start = pr->u.p.parm2;
1357       init *= chunk;
1358       limit = chunk + init - 1;
1359       incr = pr->u.p.st;
1360       KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1361 
1362       KMP_DEBUG_ASSERT(init <= trip);
1363       if ((last = (limit >= trip)) != 0)
1364         limit = trip;
1365       if (p_st != NULL)
1366         *p_st = incr;
1367 
1368       if (incr == 1) {
1369         *p_lb = start + init;
1370         *p_ub = start + limit;
1371       } else {
1372         *p_lb = start + init * incr;
1373         *p_ub = start + limit * incr;
1374       }
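      // 'init' and 'limit' are chunk indexes converted to iteration numbers
      // here; e.g. (illustrative) chunk=7 and chunk index 3 give iterations
      // 21..27 relative to the original lower bound saved in parm2.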
1375 
1376       if (pr->flags.ordered) {
1377         pr->u.p.ordered_lower = init;
1378         pr->u.p.ordered_upper = limit;
1379       } // if
1380     } // if
1381     break;
1382   } // case
1383 #endif // ( KMP_STATIC_STEAL_ENABLED )
1384   case kmp_sch_static_balanced: {
1385     KD_TRACE(
1386         10,
1387         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1388          gtid));
1389     /* check if thread has any iteration to do */
1390     if ((status = !pr->u.p.count) != 0) {
1391       pr->u.p.count = 1;
1392       *p_lb = pr->u.p.lb;
1393       *p_ub = pr->u.p.ub;
1394       last = pr->u.p.parm1;
1395       if (p_st != NULL)
1396         *p_st = pr->u.p.st;
1397     } else { /* no iterations to do */
1398       pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1399     }
1400   } // case
1401   break;
1402   case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1403                                  merged here */
1404   case kmp_sch_static_chunked: {
1405     T parm1;
1406 
1407     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_static_[greedy|chunked] case\n",
1409                    gtid));
1410     parm1 = pr->u.p.parm1;
1411 
1412     trip = pr->u.p.tc - 1;
1413     init = parm1 * (pr->u.p.count + tid);
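    // Chunks are assigned to threads cyclically, e.g. (example values) with
    // nproc=4, parm1=5 and tid=2 the first call yields iterations 10..14,
    // and after count += nproc the next call yields 30..34.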
1414 
1415     if ((status = (init <= trip)) != 0) {
1416       start = pr->u.p.lb;
1417       incr = pr->u.p.st;
1418       limit = parm1 + init - 1;
1419 
1420       if ((last = (limit >= trip)) != 0)
1421         limit = trip;
1422 
1423       if (p_st != NULL)
1424         *p_st = incr;
1425 
1426       pr->u.p.count += nproc;
1427 
1428       if (incr == 1) {
1429         *p_lb = start + init;
1430         *p_ub = start + limit;
1431       } else {
1432         *p_lb = start + init * incr;
1433         *p_ub = start + limit * incr;
1434       }
1435 
1436       if (pr->flags.ordered) {
1437         pr->u.p.ordered_lower = init;
1438         pr->u.p.ordered_upper = limit;
1439       } // if
1440     } // if
1441   } // case
1442   break;
1443 
1444   case kmp_sch_dynamic_chunked: {
1445     T chunk = pr->u.p.parm1;
1446 
1447     KD_TRACE(
1448         100,
1449         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1450          gtid));
1451 
1452     init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
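         // test_then_inc_acq atomically fetches-and-increments the shared chunk
         // counter, so every call obtains a distinct chunk number; scaling by the
         // chunk size turns it into the chunk's starting iteration.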
1453     trip = pr->u.p.tc - 1;
1454 
1455     if ((status = (init <= trip)) == 0) {
1456       *p_lb = 0;
1457       *p_ub = 0;
1458       if (p_st != NULL)
1459         *p_st = 0;
1460     } else {
1461       start = pr->u.p.lb;
1462       limit = chunk + init - 1;
1463       incr = pr->u.p.st;
1464 
1465       if ((last = (limit >= trip)) != 0)
1466         limit = trip;
1467 
1468       if (p_st != NULL)
1469         *p_st = incr;
1470 
1471       if (incr == 1) {
1472         *p_lb = start + init;
1473         *p_ub = start + limit;
1474       } else {
1475         *p_lb = start + init * incr;
1476         *p_ub = start + limit * incr;
1477       }
1478 
1479       if (pr->flags.ordered) {
1480         pr->u.p.ordered_lower = init;
1481         pr->u.p.ordered_upper = limit;
1482       } // if
1483     } // if
1484   } // case
1485   break;
1486 
1487   case kmp_sch_guided_iterative_chunked: {
1488     T chunkspec = pr->u.p.parm1;
1489     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1490                    "iterative case\n",
1491                    gtid));
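         // Guided (iterative) scheme: each call tries to claim a fraction of the
         // remaining iterations via CAS on the shared iteration counter; once the
         // remainder falls below the parm2 threshold, it switches to plain
         // dynamic-style chunks of size chunkspec.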
1492     trip = pr->u.p.tc;
1493     // Start atomic part of calculations
1494     while (1) {
1495       ST remaining; // signed, because can be < 0
1496       init = sh->u.s.iteration; // shared value
1497       remaining = trip - init;
1498       if (remaining <= 0) { // AC: need to compare with 0 first
1499         // nothing to do, don't try atomic op
1500         status = 0;
1501         break;
1502       }
1503       if ((T)remaining <
1504           pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1505         // use dynamic-style schedule
1506         // atomically increment iterations, get old value
1507         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1508                                  (ST)chunkspec);
1509         remaining = trip - init;
1510         if (remaining <= 0) {
1511           status = 0; // all iterations got by other threads
1512         } else {
1513           // got some iterations to work on
1514           status = 1;
1515           if ((T)remaining > chunkspec) {
1516             limit = init + chunkspec - 1;
1517           } else {
1518             last = 1; // the last chunk
1519             limit = init + remaining - 1;
1520           } // if
1521         } // if
1522         break;
1523       } // if
1524       limit = init +
1525               (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
1526       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1527                                (ST)init, (ST)limit)) {
1528         // CAS was successful, chunk obtained
1529         status = 1;
1530         --limit;
1531         break;
1532       } // if
1533     } // while
1534     if (status != 0) {
1535       start = pr->u.p.lb;
1536       incr = pr->u.p.st;
1537       if (p_st != NULL)
1538         *p_st = incr;
1539       *p_lb = start + init * incr;
1540       *p_ub = start + limit * incr;
1541       if (pr->flags.ordered) {
1542         pr->u.p.ordered_lower = init;
1543         pr->u.p.ordered_upper = limit;
1544       } // if
1545     } else {
1546       *p_lb = 0;
1547       *p_ub = 0;
1548       if (p_st != NULL)
1549         *p_st = 0;
1550     } // if
1551   } // case
1552   break;
1553 
1554 #if OMP_45_ENABLED
1555   case kmp_sch_guided_simd: {
1556     // same as the iterative case, but the current chunk is adjusted to be a
1557     // multiple of the given chunk
1558     T chunk = pr->u.p.parm1;
1559     KD_TRACE(100,
1560              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1561               gtid));
1562     trip = pr->u.p.tc;
1563     // Start atomic part of calculations
1564     while (1) {
1565       ST remaining; // signed, because can be < 0
1566       init = sh->u.s.iteration; // shared value
1567       remaining = trip - init;
1568       if (remaining <= 0) { // AC: need to compare with 0 first
1569         status = 0; // nothing to do, don't try atomic op
1570         break;
1571       }
1572       KMP_DEBUG_ASSERT(init % chunk == 0);
1573       // compare with K*nproc*(chunk+1), K=2 by default
1574       if ((T)remaining < pr->u.p.parm2) {
1575         // use dynamic-style schedule
1576         // atomically increment iterations, get old value
1577         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1578                                  (ST)chunk);
1579         remaining = trip - init;
1580         if (remaining <= 0) {
1581           status = 0; // all iterations got by other threads
1582         } else {
1583           // got some iterations to work on
1584           status = 1;
1585           if ((T)remaining > chunk) {
1586             limit = init + chunk - 1;
1587           } else {
1588             last = 1; // the last chunk
1589             limit = init + remaining - 1;
1590           } // if
1591         } // if
1592         break;
1593       } // if
1594       // divide by K*nproc
1595       UT span = remaining * (*(double *)&pr->u.p.parm3);
1596       UT rem = span % chunk;
1597       if (rem) // adjust so that span%chunk == 0
1598         span += chunk - rem;
1599       limit = init + span;
1600       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1601                                (ST)init, (ST)limit)) {
1602         // CAS was successful, chunk obtained
1603         status = 1;
1604         --limit;
1605         break;
1606       } // if
1607     } // while
1608     if (status != 0) {
1609       start = pr->u.p.lb;
1610       incr = pr->u.p.st;
1611       if (p_st != NULL)
1612         *p_st = incr;
1613       *p_lb = start + init * incr;
1614       *p_ub = start + limit * incr;
1615       if (pr->flags.ordered) {
1616         pr->u.p.ordered_lower = init;
1617         pr->u.p.ordered_upper = limit;
1618       } // if
1619     } else {
1620       *p_lb = 0;
1621       *p_ub = 0;
1622       if (p_st != NULL)
1623         *p_st = 0;
1624     } // if
1625   } // case
1626   break;
1627 #endif // OMP_45_ENABLED
1628 
1629   case kmp_sch_guided_analytical_chunked: {
1630     T chunkspec = pr->u.p.parm1;
1631     UT chunkIdx;
1632 #if KMP_USE_X87CONTROL
1633     /* for storing original FPCW value for Windows* OS on
1634        IA-32 architecture 8-byte version */
1635     unsigned int oldFpcw;
1636     unsigned int fpcwSet = 0;
1637 #endif
1638     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1639                    "kmp_sch_guided_analytical_chunked case\n",
1640                    gtid));
1641 
1642     trip = pr->u.p.tc;
1643 
1644     KMP_DEBUG_ASSERT(nproc > 1);
1645     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1646 
1647     while (1) { /* this while loop is a safeguard against unexpected zero
1648                    chunk sizes */
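           // pr->u.p.parm2 is the crossover chunk index: chunks before it use the
           // analytically pre-computed, exponentially decreasing sizes, while
           // chunks at or beyond it fall back to dynamic-style chunks of size
           // chunkspec.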
1649       chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1650       if (chunkIdx >= (UT)pr->u.p.parm2) {
1651         --trip;
1652         /* use dynamic-style scheduling */
1653         init = chunkIdx * chunkspec + pr->u.p.count;
1654         /* need to verify init > 0 in case of overflow in the above
1655          * calculation */
1656         if ((status = (init > 0 && init <= trip)) != 0) {
1657           limit = init + chunkspec - 1;
1658 
1659           if ((last = (limit >= trip)) != 0)
1660             limit = trip;
1661         }
1662         break;
1663       } else {
1664 /* use exponential-style scheduling */
1665 /* The following check is to work around the lack of long double precision on
1666    Windows* OS.
1667    This check works around the possible effect that init != 0 for chunkIdx == 0.
1668  */
1669 #if KMP_USE_X87CONTROL
1670         /* If we haven't already done so, save original
1671            FPCW and set precision to 64-bit, as Windows* OS
1672            on IA-32 architecture defaults to 53-bit */
1673         if (!fpcwSet) {
1674           oldFpcw = _control87(0, 0);
1675           _control87(_PC_64, _MCW_PC);
1676           fpcwSet = 0x30000;
1677         }
1678 #endif
1679         if (chunkIdx) {
1680           init = __kmp_dispatch_guided_remaining<T>(
1681               trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1682           KMP_DEBUG_ASSERT(init);
1683           init = trip - init;
1684         } else
1685           init = 0;
1686         limit = trip - __kmp_dispatch_guided_remaining<T>(
1687                            trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1688         KMP_ASSERT(init <= limit);
1689         if (init < limit) {
1690           KMP_DEBUG_ASSERT(limit <= trip);
1691           --limit;
1692           status = 1;
1693           break;
1694         } // if
1695       } // if
1696     } // while (1)
1697 #if KMP_USE_X87CONTROL
1698     /* restore FPCW if necessary
1699        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1700     */
1701     if (fpcwSet && (oldFpcw & fpcwSet))
1702       _control87(oldFpcw, _MCW_PC);
1703 #endif
1704     if (status != 0) {
1705       start = pr->u.p.lb;
1706       incr = pr->u.p.st;
1707       if (p_st != NULL)
1708         *p_st = incr;
1709       *p_lb = start + init * incr;
1710       *p_ub = start + limit * incr;
1711       if (pr->flags.ordered) {
1712         pr->u.p.ordered_lower = init;
1713         pr->u.p.ordered_upper = limit;
1714       }
1715     } else {
1716       *p_lb = 0;
1717       *p_ub = 0;
1718       if (p_st != NULL)
1719         *p_st = 0;
1720     }
1721   } // case
1722   break;
1723 
1724   case kmp_sch_trapezoidal: {
1725     UT index;
1726     T parm2 = pr->u.p.parm2;
1727     T parm3 = pr->u.p.parm3;
1728     T parm4 = pr->u.p.parm4;
1729     KD_TRACE(100,
1730              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1731               gtid));
1732 
1733     index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1734 
1735     init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
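         // init is the number of iterations covered by the first `index` chunks:
         // the sum of an arithmetic series whose first term is parm2 (the first
         // chunk size) and whose common difference is -parm4 (the per-chunk
         // decrement); parm3 is the total number of chunks, all set up in
         // __kmp_dispatch_init_algorithm.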
1736     trip = pr->u.p.tc - 1;
1737 
1738     if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1739       *p_lb = 0;
1740       *p_ub = 0;
1741       if (p_st != NULL)
1742         *p_st = 0;
1743     } else {
1744       start = pr->u.p.lb;
1745       limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1746       incr = pr->u.p.st;
1747 
1748       if ((last = (limit >= trip)) != 0)
1749         limit = trip;
1750 
1751       if (p_st != NULL)
1752         *p_st = incr;
1753 
1754       if (incr == 1) {
1755         *p_lb = start + init;
1756         *p_ub = start + limit;
1757       } else {
1758         *p_lb = start + init * incr;
1759         *p_ub = start + limit * incr;
1760       }
1761 
1762       if (pr->flags.ordered) {
1763         pr->u.p.ordered_lower = init;
1764         pr->u.p.ordered_upper = limit;
1765       } // if
1766     } // if
1767   } // case
1768   break;
1769   default: {
1770     status = 0; // to avoid complaints on uninitialized variable use
1771     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1772                 KMP_HNT(GetNewerLibrary), // Hint
1773                 __kmp_msg_null // Variadic argument list terminator
1774                 );
1775   } break;
1776   } // switch
1777   if (p_last)
1778     *p_last = last;
1779 #ifdef KMP_DEBUG
1780   if (pr->flags.ordered) {
1781     char *buff;
1782     // create format specifiers before the debug output
1783     buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1784                             "ordered_lower:%%%s ordered_upper:%%%s\n",
1785                             traits_t<UT>::spec, traits_t<UT>::spec);
1786     KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1787     __kmp_str_free(&buff);
1788   }
1789   {
1790     char *buff;
1791     // create format specifiers before the debug output
1792     buff = __kmp_str_format(
1793         "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1794         "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1795         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1796     KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1797     __kmp_str_free(&buff);
1798   }
1799 #endif
1800   return status;
1801 }
1802 
1803 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1804    work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1805    is not called. */
1806 #if OMPT_SUPPORT && OMPT_OPTIONAL
1807 #define OMPT_LOOP_END                                                          \
1808   if (status == 0) {                                                           \
1809     if (ompt_enabled.ompt_callback_work) {                                     \
1810       ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
1811       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
1812       ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
1813           ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
1814           &(task_info->task_data), 0, codeptr);                                \
1815     }                                                                          \
1816   }
1817 // TODO: implement count
1818 #else
1819 #define OMPT_LOOP_END // no-op
1820 #endif
1821 
1822 #if KMP_STATS_ENABLED
1823 #define KMP_STATS_LOOP_END                                                     \
1824   {                                                                            \
1825     kmp_int64 u, l, t, i;                                                      \
1826     l = (kmp_int64)(*p_lb);                                                    \
1827     u = (kmp_int64)(*p_ub);                                                    \
1828     i = (kmp_int64)(pr->u.p.st);                                               \
1829     if (status == 0) {                                                         \
1830       t = 0;                                                                   \
1831       KMP_POP_PARTITIONED_TIMER();                                             \
1832     } else if (i == 1) {                                                       \
1833       if (u >= l)                                                              \
1834         t = u - l + 1;                                                         \
1835       else                                                                     \
1836         t = 0;                                                                 \
1837     } else if (i < 0) {                                                        \
1838       if (l >= u)                                                              \
1839         t = (l - u) / (-i) + 1;                                                \
1840       else                                                                     \
1841         t = 0;                                                                 \
1842     } else {                                                                   \
1843       if (u >= l)                                                              \
1844         t = (u - l) / i + 1;                                                   \
1845       else                                                                     \
1846         t = 0;                                                                 \
1847     }                                                                          \
1848     KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
1849   }
1850 #else
1851 #define KMP_STATS_LOOP_END /* Nothing */
1852 #endif
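     // Example: for a chunk with *p_lb == 0, *p_ub == 9 and stride
     // pr->u.p.st == 2, the chunk covers iterations {0, 2, 4, 6, 8}, so the
     // macro records t == (9 - 0) / 2 + 1 == 5.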
1853 
1854 template <typename T>
1855 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1856                                T *p_lb, T *p_ub,
1857                                typename traits_t<T>::signed_t *p_st
1858 #if OMPT_SUPPORT && OMPT_OPTIONAL
1859                                ,
1860                                void *codeptr
1861 #endif
1862                                ) {
1863 
1864   typedef typename traits_t<T>::unsigned_t UT;
1865   typedef typename traits_t<T>::signed_t ST;
1866   // This is potentially slightly misleading: schedule(runtime) will appear here
1867   // even if the actual runtime schedule is static. (Which points out a
1868   // disadvantage of schedule(runtime): even when static scheduling is used, it
1869   // costs more than a compile-time choice to use static scheduling would.)
1870   KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
1871 
1872   int status;
1873   dispatch_private_info_template<T> *pr;
1874   kmp_info_t *th = __kmp_threads[gtid];
1875   kmp_team_t *team = th->th.th_team;
1876 
1877   KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1878   KD_TRACE(
1879       1000,
1880       ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
1881        gtid, p_lb, p_ub, p_st, p_last));
1882 
1883   if (team->t.t_serialized) {
1884     /* NOTE: serialize this dispatch because we are not at the active level */
1885     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1886         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1887     KMP_DEBUG_ASSERT(pr);
1888 
1889     if ((status = (pr->u.p.tc != 0)) == 0) {
1890       *p_lb = 0;
1891       *p_ub = 0;
1892       //            if ( p_last != NULL )
1893       //                *p_last = 0;
1894       if (p_st != NULL)
1895         *p_st = 0;
1896       if (__kmp_env_consistency_check) {
1897         if (pr->pushed_ws != ct_none) {
1898           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1899         }
1900       }
1901     } else if (pr->flags.nomerge) {
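           // nomerge schedule on a serialized team: iterations are still handed
           // out chunk by chunk from the private descriptor (no shared counter is
           // needed), rather than returning the whole remaining range at once as
           // the else-branch below does.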
1902       kmp_int32 last;
1903       T start;
1904       UT limit, trip, init;
1905       ST incr;
1906       T chunk = pr->u.p.parm1;
1907 
1908       KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1909                      gtid));
1910 
1911       init = chunk * pr->u.p.count++;
1912       trip = pr->u.p.tc - 1;
1913 
1914       if ((status = (init <= trip)) == 0) {
1915         *p_lb = 0;
1916         *p_ub = 0;
1917         //                if ( p_last != NULL )
1918         //                    *p_last = 0;
1919         if (p_st != NULL)
1920           *p_st = 0;
1921         if (__kmp_env_consistency_check) {
1922           if (pr->pushed_ws != ct_none) {
1923             pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1924           }
1925         }
1926       } else {
1927         start = pr->u.p.lb;
1928         limit = chunk + init - 1;
1929         incr = pr->u.p.st;
1930 
1931         if ((last = (limit >= trip)) != 0) {
1932           limit = trip;
1933 #if KMP_OS_WINDOWS
1934           pr->u.p.last_upper = pr->u.p.ub;
1935 #endif /* KMP_OS_WINDOWS */
1936         }
1937         if (p_last != NULL)
1938           *p_last = last;
1939         if (p_st != NULL)
1940           *p_st = incr;
1941         if (incr == 1) {
1942           *p_lb = start + init;
1943           *p_ub = start + limit;
1944         } else {
1945           *p_lb = start + init * incr;
1946           *p_ub = start + limit * incr;
1947         }
1948 
1949         if (pr->flags.ordered) {
1950           pr->u.p.ordered_lower = init;
1951           pr->u.p.ordered_upper = limit;
1952 #ifdef KMP_DEBUG
1953           {
1954             char *buff;
1955             // create format specifiers before the debug output
1956             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1957                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
1958                                     traits_t<UT>::spec, traits_t<UT>::spec);
1959             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1960                             pr->u.p.ordered_upper));
1961             __kmp_str_free(&buff);
1962           }
1963 #endif
1964         } // if
1965       } // if
1966     } else {
1967       pr->u.p.tc = 0;
1968       *p_lb = pr->u.p.lb;
1969       *p_ub = pr->u.p.ub;
1970 #if KMP_OS_WINDOWS
1971       pr->u.p.last_upper = *p_ub;
1972 #endif /* KMP_OS_WINDOWS */
1973       if (p_last != NULL)
1974         *p_last = TRUE;
1975       if (p_st != NULL)
1976         *p_st = pr->u.p.st;
1977     } // if
1978 #ifdef KMP_DEBUG
1979     {
1980       char *buff;
1981       // create format specifiers before the debug output
1982       buff = __kmp_str_format(
1983           "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
1984           "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
1985           traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1986       KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
1987       __kmp_str_free(&buff);
1988     }
1989 #endif
1990 #if INCLUDE_SSC_MARKS
1991     SSC_MARK_DISPATCH_NEXT();
1992 #endif
1993     OMPT_LOOP_END;
1994     KMP_STATS_LOOP_END;
1995     return status;
1996   } else {
1997     kmp_int32 last = 0;
1998     dispatch_shared_info_template<T> volatile *sh;
1999 
2000     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2001                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2002 
2003     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2004         th->th.th_dispatch->th_dispatch_pr_current);
2005     KMP_DEBUG_ASSERT(pr);
2006     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2007         th->th.th_dispatch->th_dispatch_sh_current);
2008     KMP_DEBUG_ASSERT(sh);
2009 
2010 #if KMP_USE_HIER_SCHED
2011     if (pr->flags.use_hier)
2012       status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2013     else
2014 #endif // KMP_USE_HIER_SCHED
2015       status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2016                                                 p_st, th->th.th_team_nproc,
2017                                                 th->th.th_info.ds.ds_tid);
2018     // status == 0: no more iterations to execute
2019     if (status == 0) {
2020       UT num_done;
2021 
2022       num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2023 #ifdef KMP_DEBUG
2024       {
2025         char *buff;
2026         // create format specifiers before the debug output
2027         buff = __kmp_str_format(
2028             "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2029             traits_t<UT>::spec);
2030         KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2031         __kmp_str_free(&buff);
2032       }
2033 #endif
2034 
2035 #if KMP_USE_HIER_SCHED
2036       pr->flags.use_hier = FALSE;
2037 #endif
2038       if ((ST)num_done == th->th.th_team_nproc - 1) {
2039 #if (KMP_STATIC_STEAL_ENABLED)
2040         if (pr->schedule == kmp_sch_static_steal &&
2041             traits_t<T>::type_size > 4) {
2042           int i;
2043           kmp_info_t **other_threads = team->t.t_threads;
2044           // loop complete, safe to destroy locks used for stealing
2045           for (i = 0; i < th->th.th_team_nproc; ++i) {
2046             kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2047             KMP_ASSERT(lck != NULL);
2048             __kmp_destroy_lock(lck);
2049             __kmp_free(lck);
2050             other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2051           }
2052         }
2053 #endif
2054         /* NOTE: release this buffer to be reused */
2055 
2056         KMP_MB(); /* Flush all pending memory write invalidates.  */
2057 
2058         sh->u.s.num_done = 0;
2059         sh->u.s.iteration = 0;
2060 
2061         /* TODO replace with general release procedure? */
2062         if (pr->flags.ordered) {
2063           sh->u.s.ordered_iteration = 0;
2064         }
2065 
2066         KMP_MB(); /* Flush all pending memory write invalidates.  */
2067 
2068         sh->buffer_index += __kmp_dispatch_num_buffers;
2069         KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2070                        gtid, sh->buffer_index));
2071 
2072         KMP_MB(); /* Flush all pending memory write invalidates.  */
2073 
2074       } // if
2075       if (__kmp_env_consistency_check) {
2076         if (pr->pushed_ws != ct_none) {
2077           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2078         }
2079       }
2080 
2081       th->th.th_dispatch->th_deo_fcn = NULL;
2082       th->th.th_dispatch->th_dxo_fcn = NULL;
2083       th->th.th_dispatch->th_dispatch_sh_current = NULL;
2084       th->th.th_dispatch->th_dispatch_pr_current = NULL;
2085     } // if (status == 0)
2086 #if KMP_OS_WINDOWS
2087     else if (last) {
2088       pr->u.p.last_upper = pr->u.p.ub;
2089     }
2090 #endif /* KMP_OS_WINDOWS */
2091     if (p_last != NULL && status != 0)
2092       *p_last = last;
2093   } // if
2094 
2095 #ifdef KMP_DEBUG
2096   {
2097     char *buff;
2098     // create format specifiers before the debug output
2099     buff = __kmp_str_format(
2100         "__kmp_dispatch_next: T#%%d normal case: "
2101         "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2102         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2103     KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2104                   (p_last ? *p_last : 0), status));
2105     __kmp_str_free(&buff);
2106   }
2107 #endif
2108 #if INCLUDE_SSC_MARKS
2109   SSC_MARK_DISPATCH_NEXT();
2110 #endif
2111   OMPT_LOOP_END;
2112   KMP_STATS_LOOP_END;
2113   return status;
2114 }
2115 
2116 template <typename T>
2117 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2118                                   kmp_int32 *plastiter, T *plower, T *pupper,
2119                                   typename traits_t<T>::signed_t incr) {
2120   typedef typename traits_t<T>::unsigned_t UT;
2121   kmp_uint32 team_id;
2122   kmp_uint32 nteams;
2123   UT trip_count;
2124   kmp_team_t *team;
2125   kmp_info_t *th;
2126 
2127   KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2128   KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2129 #ifdef KMP_DEBUG
2130   typedef typename traits_t<T>::signed_t ST;
2131   {
2132     char *buff;
2133     // create format specifiers before the debug output
2134     buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2135                             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2136                             traits_t<T>::spec, traits_t<T>::spec,
2137                             traits_t<ST>::spec, traits_t<T>::spec);
2138     KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2139     __kmp_str_free(&buff);
2140   }
2141 #endif
2142 
2143   if (__kmp_env_consistency_check) {
2144     if (incr == 0) {
2145       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2146                             loc);
2147     }
2148     if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2149       // The loop is illegal.
2150       // Some zero-trip loops are maintained by the compiler, e.g.:
2151       //   for(i=10;i<0;++i) // lower >= upper - run-time check
2152       //   for(i=0;i>10;--i) // lower <= upper - run-time check
2153       //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2154       //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2155       // The compiler does not check the following illegal loops:
2156       //   for(i=0;i<10;i+=incr) // where incr<0
2157       //   for(i=10;i>0;i-=incr) // where incr<0
2158       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2159     }
2160   }
2161   th = __kmp_threads[gtid];
2162   team = th->th.th_team;
2163 #if OMP_40_ENABLED
2164   KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2165   nteams = th->th.th_teams_size.nteams;
2166 #endif
2167   team_id = team->t.t_master_tid;
2168   KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2169 
2170   // compute global trip count
2171   if (incr == 1) {
2172     trip_count = *pupper - *plower + 1;
2173   } else if (incr == -1) {
2174     trip_count = *plower - *pupper + 1;
2175   } else if (incr > 0) {
2176     // upper-lower can exceed the limit of signed type
2177     trip_count = (UT)(*pupper - *plower) / incr + 1;
2178   } else {
2179     trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2180   }
2181 
2182   if (trip_count <= nteams) {
2183     KMP_DEBUG_ASSERT(
2184         __kmp_static == kmp_sch_static_greedy ||
2185         __kmp_static ==
2186             kmp_sch_static_balanced); // Unknown static scheduling type.
2187     // only some teams get a single iteration, the others get nothing
2188     if (team_id < trip_count) {
2189       *pupper = *plower = *plower + team_id * incr;
2190     } else {
2191       *plower = *pupper + incr; // zero-trip loop
2192     }
2193     if (plastiter != NULL)
2194       *plastiter = (team_id == trip_count - 1);
2195   } else {
2196     if (__kmp_static == kmp_sch_static_balanced) {
2197       UT chunk = trip_count / nteams;
2198       UT extras = trip_count % nteams;
2199       *plower +=
2200           incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2201       *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2202       if (plastiter != NULL)
2203         *plastiter = (team_id == nteams - 1);
2204     } else {
2205       T chunk_inc_count =
2206           (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2207       T upper = *pupper;
2208       KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2209       // Unknown static scheduling type.
2210       *plower += team_id * chunk_inc_count;
2211       *pupper = *plower + chunk_inc_count - incr;
2212       // Check/correct bounds if needed
2213       if (incr > 0) {
2214         if (*pupper < *plower)
2215           *pupper = traits_t<T>::max_value;
2216         if (plastiter != NULL)
2217           *plastiter = *plower <= upper && *pupper > upper - incr;
2218         if (*pupper > upper)
2219           *pupper = upper; // tracker C73258
2220       } else {
2221         if (*pupper > *plower)
2222           *pupper = traits_t<T>::min_value;
2223         if (plastiter != NULL)
2224           *plastiter = *plower >= upper && *pupper < upper - incr;
2225         if (*pupper < upper)
2226           *pupper = upper; // tracker C73258
2227       }
2228     }
2229   }
2230 }
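     /* Worked example for the kmp_sch_static_balanced branch above (values are
        illustrative): with *plower == 0, *pupper == 9, incr == 1 and nteams == 3,
        trip_count == 10, chunk == 3 and extras == 1, so team 0 gets [0, 3]
        (4 iterations) while teams 1 and 2 get [4, 6] and [7, 9] (3 iterations
        each). */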
2231 
2232 //-----------------------------------------------------------------------------
2233 // Dispatch routines
2234 //    Transfer call to template< type T >
2235 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2236 //                         T lb, T ub, ST st, ST chunk )
2237 extern "C" {
2238 
2239 /*!
2240 @ingroup WORK_SHARING
2241 @{
2242 @param loc Source location
2243 @param gtid Global thread id
2244 @param schedule Schedule type
2245 @param lb  Lower bound
2246 @param ub  Upper bound
2247 @param st  Step (or increment if you prefer)
2248 @param chunk The chunk size to block with
2249 
2250 This function prepares the runtime to start a dynamically scheduled for loop,
2251 saving the loop arguments.
2252 These functions are all identical apart from the types of the arguments.
2253 */
2254 
2255 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2256                             enum sched_type schedule, kmp_int32 lb,
2257                             kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2258   KMP_DEBUG_ASSERT(__kmp_init_serial);
2259 #if OMPT_SUPPORT && OMPT_OPTIONAL
2260   OMPT_STORE_RETURN_ADDRESS(gtid);
2261 #endif
2262   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2263 }
2264 /*!
2265 See @ref __kmpc_dispatch_init_4
2266 */
2267 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2268                              enum sched_type schedule, kmp_uint32 lb,
2269                              kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2270   KMP_DEBUG_ASSERT(__kmp_init_serial);
2271 #if OMPT_SUPPORT && OMPT_OPTIONAL
2272   OMPT_STORE_RETURN_ADDRESS(gtid);
2273 #endif
2274   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2275 }
2276 
2277 /*!
2278 See @ref __kmpc_dispatch_init_4
2279 */
2280 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2281                             enum sched_type schedule, kmp_int64 lb,
2282                             kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2283   KMP_DEBUG_ASSERT(__kmp_init_serial);
2284 #if OMPT_SUPPORT && OMPT_OPTIONAL
2285   OMPT_STORE_RETURN_ADDRESS(gtid);
2286 #endif
2287   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2288 }
2289 
2290 /*!
2291 See @ref __kmpc_dispatch_init_4
2292 */
2293 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2294                              enum sched_type schedule, kmp_uint64 lb,
2295                              kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2296   KMP_DEBUG_ASSERT(__kmp_init_serial);
2297 #if OMPT_SUPPORT && OMPT_OPTIONAL
2298   OMPT_STORE_RETURN_ADDRESS(gtid);
2299 #endif
2300   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2301 }
2302 
2303 /*!
2304 See @ref __kmpc_dispatch_init_4
2305 
2306 These differ from the __kmpc_dispatch_init set of functions in that they are
2307 called for the composite distribute parallel for construct. Thus, before
2308 dispatching the regular iterations, the per-team iteration space must be computed.
2309 
2310 These functions are all identical apart from the types of the arguments.
2311 */
2312 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2313                                  enum sched_type schedule, kmp_int32 *p_last,
2314                                  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2315                                  kmp_int32 chunk) {
2316   KMP_DEBUG_ASSERT(__kmp_init_serial);
2317 #if OMPT_SUPPORT && OMPT_OPTIONAL
2318   OMPT_STORE_RETURN_ADDRESS(gtid);
2319 #endif
2320   __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2321   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2322 }
2323 
2324 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2325                                   enum sched_type schedule, kmp_int32 *p_last,
2326                                   kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2327                                   kmp_int32 chunk) {
2328   KMP_DEBUG_ASSERT(__kmp_init_serial);
2329 #if OMPT_SUPPORT && OMPT_OPTIONAL
2330   OMPT_STORE_RETURN_ADDRESS(gtid);
2331 #endif
2332   __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2333   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2334 }
2335 
2336 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2337                                  enum sched_type schedule, kmp_int32 *p_last,
2338                                  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2339                                  kmp_int64 chunk) {
2340   KMP_DEBUG_ASSERT(__kmp_init_serial);
2341 #if OMPT_SUPPORT && OMPT_OPTIONAL
2342   OMPT_STORE_RETURN_ADDRESS(gtid);
2343 #endif
2344   __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2345   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2346 }
2347 
2348 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2349                                   enum sched_type schedule, kmp_int32 *p_last,
2350                                   kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2351                                   kmp_int64 chunk) {
2352   KMP_DEBUG_ASSERT(__kmp_init_serial);
2353 #if OMPT_SUPPORT && OMPT_OPTIONAL
2354   OMPT_STORE_RETURN_ADDRESS(gtid);
2355 #endif
2356   __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2357   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2358 }
2359 
2360 /*!
2361 @param loc Source code location
2362 @param gtid Global thread id
2363 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2364 otherwise
2365 @param p_lb   Pointer to the lower bound for the next chunk of work
2366 @param p_ub   Pointer to the upper bound for the next chunk of work
2367 @param p_st   Pointer to the stride for the next chunk of work
2368 @return one if there is work to be done, zero otherwise
2369 
2370 Get the next dynamically allocated chunk of work for this thread.
2371 If there is no more work, then lb, ub and stride need not be modified (a sketch of the typical call sequence follows this function).
2372 */
2373 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2374                            kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2375 #if OMPT_SUPPORT && OMPT_OPTIONAL
2376   OMPT_STORE_RETURN_ADDRESS(gtid);
2377 #endif
2378   return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2379 #if OMPT_SUPPORT && OMPT_OPTIONAL
2380                                         ,
2381                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2382 #endif
2383                                             );
2384 }
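     /* Illustrative only: a sketch of the call sequence a compiler typically
        emits for a loop such as "#pragma omp for schedule(dynamic, 4)" over
        [0, N); N, loc, gtid and the loop body are placeholders, not part of the
        runtime:

          kmp_int32 lb = 0, ub = N - 1, st = 1, last = 0;
          __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, lb, ub, st, 4);
          while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
            for (kmp_int32 i = lb; i <= ub; i += st) {
              // ... loop body for iteration i ...
            }
          }
     */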
2385 
2386 /*!
2387 See @ref __kmpc_dispatch_next_4
2388 */
2389 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2390                             kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2391                             kmp_int32 *p_st) {
2392 #if OMPT_SUPPORT && OMPT_OPTIONAL
2393   OMPT_STORE_RETURN_ADDRESS(gtid);
2394 #endif
2395   return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2396 #if OMPT_SUPPORT && OMPT_OPTIONAL
2397                                          ,
2398                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2399 #endif
2400                                              );
2401 }
2402 
2403 /*!
2404 See @ref __kmpc_dispatch_next_4
2405 */
2406 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2407                            kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2408 #if OMPT_SUPPORT && OMPT_OPTIONAL
2409   OMPT_STORE_RETURN_ADDRESS(gtid);
2410 #endif
2411   return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2412 #if OMPT_SUPPORT && OMPT_OPTIONAL
2413                                         ,
2414                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2415 #endif
2416                                             );
2417 }
2418 
2419 /*!
2420 See @ref __kmpc_dispatch_next_4
2421 */
2422 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2423                             kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2424                             kmp_int64 *p_st) {
2425 #if OMPT_SUPPORT && OMPT_OPTIONAL
2426   OMPT_STORE_RETURN_ADDRESS(gtid);
2427 #endif
2428   return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2429 #if OMPT_SUPPORT && OMPT_OPTIONAL
2430                                          ,
2431                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2432 #endif
2433                                              );
2434 }
2435 
2436 /*!
2437 @param loc Source code location
2438 @param gtid Global thread id
2439 
2440 Mark the end of a dynamic loop.
2441 */
2442 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2443   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2444 }
2445 
2446 /*!
2447 See @ref __kmpc_dispatch_fini_4
2448 */
2449 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2450   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2451 }
2452 
2453 /*!
2454 See @ref __kmpc_dispatch_fini_4
2455 */
2456 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2457   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2458 }
2459 
2460 /*!
2461 See @ref __kmpc_dispatch_fini_4
2462 */
2463 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2464   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2465 }
2466 /*! @} */
2467 
2468 //-----------------------------------------------------------------------------
2469 // Non-template routines from kmp_dispatch.cpp used in other sources
2470 
2471 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2472   return value == checker;
2473 }
2474 
2475 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2476   return value != checker;
2477 }
2478 
2479 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2480   return value < checker;
2481 }
2482 
2483 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2484   return value >= checker;
2485 }
2486 
2487 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2488   return value <= checker;
2489 }
2490 
2491 kmp_uint32
2492 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2493              kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2494              void *obj // Higher-level synchronization object, or NULL.
2495              ) {
2496   // note: we may not belong to a team at this point
2497   volatile kmp_uint32 *spin = spinner;
2498   kmp_uint32 check = checker;
2499   kmp_uint32 spins;
2500   kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2501   kmp_uint32 r;
2502 
2503   KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2504   KMP_INIT_YIELD(spins);
2505   // main wait spin loop
2506   while (!f(r = TCR_4(*spin), check)) {
2507     KMP_FSYNC_SPIN_PREPARE(obj);
2508     /* GEH - remove this since it was accidentally introduced when kmp_wait was
2509        split. It causes problems with infinite recursion because of exit lock */
2510     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2511         __kmp_abort_thread(); */
2512     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2513   }
2514   KMP_FSYNC_SPIN_ACQUIRED(obj);
2515   return r;
2516 }
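     /* For example, with a hypothetical flag, __kmp_wait_4(&flag, 1, __kmp_eq_4,
        NULL) spins until flag becomes 1 and returns the observed value; the
        __kmp_*_4 comparators above can serve as the predicate argument. */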
2517 
2518 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
2519                       kmp_uint32 (*pred)(void *, kmp_uint32),
2520                       void *obj // Higher-level synchronization object, or NULL.
2521                       ) {
2522   // note: we may not belong to a team at this point
2523   void *spin = spinner;
2524   kmp_uint32 check = checker;
2525   kmp_uint32 spins;
2526   kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2527 
2528   KMP_FSYNC_SPIN_INIT(obj, spin);
2529   KMP_INIT_YIELD(spins);
2530   // main wait spin loop
2531   while (!f(spin, check)) {
2532     KMP_FSYNC_SPIN_PREPARE(obj);
2533     /* if we have waited a bit, or are oversubscribed, yield */
2534     /* pause is in the following code */
2535     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2536   }
2537   KMP_FSYNC_SPIN_ACQUIRED(obj);
2538 }
2539 
2540 } // extern "C"
2541 
2542 #ifdef KMP_GOMP_COMPAT
2543 
2544 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2545                                enum sched_type schedule, kmp_int32 lb,
2546                                kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2547                                int push_ws) {
2548   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2549                                  push_ws);
2550 }
2551 
2552 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2553                                 enum sched_type schedule, kmp_uint32 lb,
2554                                 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2555                                 int push_ws) {
2556   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2557                                   push_ws);
2558 }
2559 
2560 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2561                                enum sched_type schedule, kmp_int64 lb,
2562                                kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2563                                int push_ws) {
2564   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2565                                  push_ws);
2566 }
2567 
2568 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2569                                 enum sched_type schedule, kmp_uint64 lb,
2570                                 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2571                                 int push_ws) {
2572   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2573                                   push_ws);
2574 }
2575 
2576 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2577   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2578 }
2579 
2580 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2581   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2582 }
2583 
2584 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2585   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2586 }
2587 
2588 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2589   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2590 }
2591 
2592 #endif /* KMP_GOMP_COMPAT */
2593 
2594 /* ------------------------------------------------------------------------ */
2595