1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 //                     The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 /* Dynamic scheduling initialization and dispatch.
15  *
16  * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
17  *       it may change values between parallel regions.  __kmp_max_nth
18  *       is the largest value __kmp_nth may take, 1 is the smallest.
19  */
20 
21 // Need to raise Win version from XP to Vista here for support of
22 // InterlockedExchange64
23 #if defined(_WIN32_WINNT) && defined(_M_IX86)
24 #undef _WIN32_WINNT
25 #define _WIN32_WINNT 0x0502
26 #endif
27 
28 #include "kmp.h"
29 #include "kmp_error.h"
30 #include "kmp_i18n.h"
31 #include "kmp_itt.h"
32 #include "kmp_stats.h"
33 #include "kmp_str.h"
34 #if KMP_OS_WINDOWS && KMP_ARCH_X86
35 #include <float.h>
36 #endif
37 #include "kmp_lock.h"
38 #include "kmp_dispatch.h"
39 #if KMP_USE_HIER_SCHED
40 #include "kmp_dispatch_hier.h"
41 #endif
42 
43 #if OMPT_SUPPORT
44 #include "ompt-specific.h"
45 #endif
46 
47 /* ------------------------------------------------------------------------ */
48 /* ------------------------------------------------------------------------ */
49 
50 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
51   kmp_info_t *th;
52 
53   KMP_DEBUG_ASSERT(gtid_ref);
54 
55   if (__kmp_env_consistency_check) {
56     th = __kmp_threads[*gtid_ref];
57     if (th->th.th_root->r.r_active &&
58         (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
59 #if KMP_USE_DYNAMIC_LOCK
60       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
61 #else
62       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
63 #endif
64     }
65   }
66 }
67 
68 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
69   kmp_info_t *th;
70 
71   if (__kmp_env_consistency_check) {
72     th = __kmp_threads[*gtid_ref];
73     if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
74       __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
75     }
76   }
77 }
78 
// Initialize a dispatch_private_info_template<T> buffer for a particular
// type of schedule and chunk.  The loop description is found in lb (lower
// bound), ub (upper bound), and st (stride).  nproc is the number of threads
// relevant to the scheduling (often the number of threads in a team, but not
// always if hierarchical scheduling is used).  tid is the id of the thread
// calling the function within the group of nproc threads.  It will have a
// value between 0 and nproc - 1.  This is often just the thread id within a
// team, but is not necessarily the case when using hierarchical scheduling.
// loc is the source file location of the corresponding loop.
// gtid is the global thread id.
89 template <typename T>
90 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
91                                    dispatch_private_info_template<T> *pr,
92                                    enum sched_type schedule, T lb, T ub,
93                                    typename traits_t<T>::signed_t st,
94 #if USE_ITT_BUILD
95                                    kmp_uint64 *cur_chunk,
96 #endif
97                                    typename traits_t<T>::signed_t chunk,
98                                    T nproc, T tid) {
99   typedef typename traits_t<T>::unsigned_t UT;
100   typedef typename traits_t<T>::signed_t ST;
101   typedef typename traits_t<T>::floating_t DBL;
102 
103   int active;
104   T tc;
105   kmp_info_t *th;
106   kmp_team_t *team;
107 
108 #ifdef KMP_DEBUG
109   {
110     char *buff;
111     // create format specifiers before the debug output
112     buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
113                             "pr:%%p lb:%%%s ub:%%%s st:%%%s "
114                             "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
115                             traits_t<T>::spec, traits_t<T>::spec,
116                             traits_t<ST>::spec, traits_t<ST>::spec,
117                             traits_t<T>::spec, traits_t<T>::spec);
118     KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
119     __kmp_str_free(&buff);
120   }
121 #endif
122   /* setup data */
123   th = __kmp_threads[gtid];
124   team = th->th.th_team;
125   active = !team->t.t_serialized;
126 
127 #if USE_ITT_BUILD
128   int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
129                                     __kmp_forkjoin_frames_mode == 3 &&
130                                     KMP_MASTER_GTID(gtid) &&
131 #if OMP_40_ENABLED
132                                     th->th.th_teams_microtask == NULL &&
133 #endif
134                                     team->t.t_active_level == 1;
135 #endif
136 #if (KMP_STATIC_STEAL_ENABLED)
137   if (SCHEDULE_HAS_NONMONOTONIC(schedule))
138     // AC: we now have only one implementation of stealing, so use it
139     schedule = kmp_sch_static_steal;
140   else
141 #endif
142     schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
143 
144   /* Pick up the nomerge/ordered bits from the scheduling type */
145   if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
146     pr->flags.nomerge = TRUE;
147     schedule =
148         (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
149   } else {
150     pr->flags.nomerge = FALSE;
151   }
152   pr->type_size = traits_t<T>::type_size; // remember the size of variables
153   if (kmp_ord_lower & schedule) {
154     pr->flags.ordered = TRUE;
155     schedule =
156         (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
157   } else {
158     pr->flags.ordered = FALSE;
159   }
160 
161   if (schedule == kmp_sch_static) {
162     schedule = __kmp_static;
163   } else {
164     if (schedule == kmp_sch_runtime) {
165       // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
166       // not specified)
167       schedule = team->t.t_sched.r_sched_type;
168       // Detail the schedule if needed (global controls are differentiated
169       // appropriately)
170       if (schedule == kmp_sch_guided_chunked) {
171         schedule = __kmp_guided;
172       } else if (schedule == kmp_sch_static) {
173         schedule = __kmp_static;
174       }
175       // Use the chunk size specified by OMP_SCHEDULE (or default if not
176       // specified)
177       chunk = team->t.t_sched.chunk;
178 #if USE_ITT_BUILD
179       if (cur_chunk)
180         *cur_chunk = chunk;
181 #endif
182 #ifdef KMP_DEBUG
183       {
184         char *buff;
185         // create format specifiers before the debug output
186         buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
187                                 "schedule:%%d chunk:%%%s\n",
188                                 traits_t<ST>::spec);
189         KD_TRACE(10, (buff, gtid, schedule, chunk));
190         __kmp_str_free(&buff);
191       }
192 #endif
193     } else {
194       if (schedule == kmp_sch_guided_chunked) {
195         schedule = __kmp_guided;
196       }
197       if (chunk <= 0) {
198         chunk = KMP_DEFAULT_CHUNK;
199       }
200     }
201 
202     if (schedule == kmp_sch_auto) {
      // the mapping and differentiation are done in __kmp_do_serial_initialize()
204       schedule = __kmp_auto;
205 #ifdef KMP_DEBUG
206       {
207         char *buff;
208         // create format specifiers before the debug output
209         buff = __kmp_str_format(
210             "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
211             "schedule:%%d chunk:%%%s\n",
212             traits_t<ST>::spec);
213         KD_TRACE(10, (buff, gtid, schedule, chunk));
214         __kmp_str_free(&buff);
215       }
216 #endif
217     }
218 
    /* guided analytical is not safe for too many threads */
220     if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
221       schedule = kmp_sch_guided_iterative_chunked;
222       KMP_WARNING(DispatchManyThreads);
223     }
224 #if OMP_45_ENABLED
225     if (schedule == kmp_sch_runtime_simd) {
226       // compiler provides simd_width in the chunk parameter
227       schedule = team->t.t_sched.r_sched_type;
228       // Detail the schedule if needed (global controls are differentiated
229       // appropriately)
230       if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
231           schedule == __kmp_static) {
232         schedule = kmp_sch_static_balanced_chunked;
233       } else {
234         if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
235           schedule = kmp_sch_guided_simd;
236         }
237         chunk = team->t.t_sched.chunk * chunk;
238       }
239 #if USE_ITT_BUILD
240       if (cur_chunk)
241         *cur_chunk = chunk;
242 #endif
243 #ifdef KMP_DEBUG
244       {
245         char *buff;
246         // create format specifiers before the debug output
247         buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d"
248                                 " chunk:%%%s\n",
249                                 traits_t<ST>::spec);
250         KD_TRACE(10, (buff, gtid, schedule, chunk));
251         __kmp_str_free(&buff);
252       }
253 #endif
254     }
255 #endif // OMP_45_ENABLED
256     pr->u.p.parm1 = chunk;
257   }
258   KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
259               "unknown scheduling type");
260 
261   pr->u.p.count = 0;
262 
263   if (__kmp_env_consistency_check) {
264     if (st == 0) {
265       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
266                             (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
267     }
268   }
269   // compute trip count
270   if (st == 1) { // most common case
271     if (ub >= lb) {
272       tc = ub - lb + 1;
273     } else { // ub < lb
274       tc = 0; // zero-trip
275     }
276   } else if (st < 0) {
277     if (lb >= ub) {
278       // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
279       // where the division needs to be unsigned regardless of the result type
280       tc = (UT)(lb - ub) / (-st) + 1;
281     } else { // lb < ub
282       tc = 0; // zero-trip
283     }
284   } else { // st > 0
285     if (ub >= lb) {
286       // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
287       // where the division needs to be unsigned regardless of the result type
288       tc = (UT)(ub - lb) / st + 1;
289     } else { // ub < lb
290       tc = 0; // zero-trip
291     }
292   }
293 
294   pr->u.p.lb = lb;
295   pr->u.p.ub = ub;
296   pr->u.p.st = st;
297   pr->u.p.tc = tc;
298 
299 #if KMP_OS_WINDOWS
300   pr->u.p.last_upper = ub + st;
301 #endif /* KMP_OS_WINDOWS */
302 
  /* NOTE: only the active parallel regions have active ordered sections */
304 
305   if (active) {
306     if (pr->flags.ordered) {
307       pr->ordered_bumped = 0;
308       pr->u.p.ordered_lower = 1;
309       pr->u.p.ordered_upper = 0;
310     }
311   }
312 
313   switch (schedule) {
314 #if (KMP_STATIC_STEAL_ENABLED)
315   case kmp_sch_static_steal: {
316     T ntc, init;
317 
318     KD_TRACE(100,
319              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
320               gtid));
321 
322     ntc = (tc % chunk ? 1 : 0) + tc / chunk;
323     if (nproc > 1 && ntc >= nproc) {
324       KMP_COUNT_BLOCK(OMP_FOR_static_steal);
325       T id = tid;
326       T small_chunk, extras;
327 
328       small_chunk = ntc / nproc;
329       extras = ntc % nproc;
330 
331       init = id * small_chunk + (id < extras ? id : extras);
332       pr->u.p.count = init;
333       pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
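      // Worked example (illustrative only): tc=100, chunk=7, nproc=4 gives
      // ntc=15 chunks; small_chunk=3, extras=3, so threads 0..3 initially
      // own chunk index ranges [0,4), [4,8), [8,12) and [12,15).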
334 
335       pr->u.p.parm2 = lb;
336       // pr->pfields.parm3 = 0; // it's not used in static_steal
337       pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
338       pr->u.p.st = st;
339       if (traits_t<T>::type_size > 4) {
340         // AC: TODO: check if 16-byte CAS available and use it to
341         // improve performance (probably wait for explicit request
342         // before spending time on this).
343         // For now use dynamically allocated per-thread lock,
344         // free memory in __kmp_dispatch_next when status==0.
345         KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
346         th->th.th_dispatch->th_steal_lock =
347             (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
348         __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
349       }
350       break;
351     } else {
352       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
353                      "kmp_sch_static_balanced\n",
354                      gtid));
355       schedule = kmp_sch_static_balanced;
356       /* too few iterations: fall-through to kmp_sch_static_balanced */
357     } // if
358     /* FALL-THROUGH to static balanced */
359   } // case
360 #endif
361   case kmp_sch_static_balanced: {
362     T init, limit;
363 
364     KD_TRACE(
365         100,
366         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
367          gtid));
368 
369     if (nproc > 1) {
370       T id = tid;
371 
372       if (tc < nproc) {
373         if (id < tc) {
374           init = id;
375           limit = id;
376           pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
377         } else {
378           pr->u.p.count = 1; /* means no more chunks to execute */
379           pr->u.p.parm1 = FALSE;
380           break;
381         }
382       } else {
383         T small_chunk = tc / nproc;
384         T extras = tc % nproc;
385         init = id * small_chunk + (id < extras ? id : extras);
386         limit = init + small_chunk - (id < extras ? 0 : 1);
387         pr->u.p.parm1 = (id == nproc - 1);
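        // Worked example (illustrative only): tc=10, nproc=4 gives
        // small_chunk=2, extras=2, so threads 0..3 get iterations 0-2, 3-5,
        // 6-7 and 8-9 (the first 'extras' threads get one extra iteration).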
388       }
389     } else {
390       if (tc > 0) {
391         init = 0;
392         limit = tc - 1;
393         pr->u.p.parm1 = TRUE;
394       } else {
395         // zero trip count
396         pr->u.p.count = 1; /* means no more chunks to execute */
397         pr->u.p.parm1 = FALSE;
398         break;
399       }
400     }
401 #if USE_ITT_BUILD
402     // Calculate chunk for metadata report
403     if (itt_need_metadata_reporting)
404       if (cur_chunk)
405         *cur_chunk = limit - init + 1;
406 #endif
407     if (st == 1) {
408       pr->u.p.lb = lb + init;
409       pr->u.p.ub = lb + limit;
410     } else {
      // ub_tmp is the calculated upper bound; "ub" is the user-defined one
412       T ub_tmp = lb + limit * st;
413       pr->u.p.lb = lb + init * st;
414       // adjust upper bound to "ub" if needed, so that MS lastprivate will match
415       // it exactly
416       if (st > 0) {
417         pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
418       } else {
419         pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
420       }
421     }
422     if (pr->flags.ordered) {
423       pr->u.p.ordered_lower = init;
424       pr->u.p.ordered_upper = limit;
425     }
426     break;
427   } // case
428 #if OMP_45_ENABLED
429   case kmp_sch_static_balanced_chunked: {
430     // similar to balanced, but chunk adjusted to multiple of simd width
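    // Worked example (illustrative only): tc=1000, nproc=8, chunk=16 gives
    // ceil(1000/8) = 125, rounded up to the next multiple of 16: parm1=128.
    // Note the & ~(chunk - 1) rounding assumes chunk is a power of two,
    // which holds for SIMD widths.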
431     T nth = nproc;
432     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
433                    " -> falling-through to static_greedy\n",
434                    gtid));
435     schedule = kmp_sch_static_greedy;
436     if (nth > 1)
437       pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
438     else
439       pr->u.p.parm1 = tc;
440     break;
441   } // case
442   case kmp_sch_guided_simd:
443 #endif // OMP_45_ENABLED
444   case kmp_sch_guided_iterative_chunked: {
445     KD_TRACE(
446         100,
447         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
448          " case\n",
449          gtid));
450 
451     if (nproc > 1) {
452       if ((2L * chunk + 1) * nproc >= tc) {
453         /* chunk size too large, switch to dynamic */
454         schedule = kmp_sch_dynamic_chunked;
455       } else {
        // when remaining iterations become less than parm2, switch to dynamic
457         pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
458         *(double *)&pr->u.p.parm3 =
459             guided_flt_param / nproc; // may occupy parm3 and parm4
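        // Illustrative numbers, assuming the kmp.h defaults
        // guided_int_param == 2 and guided_flt_param == 0.5: nproc=4,
        // chunk=1 gives parm2 = 16 (switch to dynamic below 16 remaining
        // iterations) and parm3 = 0.125, i.e. each grab takes roughly
        // remaining / (2 * nproc) iterations.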
460       }
461     } else {
462       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
463                      "kmp_sch_static_greedy\n",
464                      gtid));
465       schedule = kmp_sch_static_greedy;
466       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
467       KD_TRACE(
468           100,
469           ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
470            gtid));
471       pr->u.p.parm1 = tc;
472     } // if
473   } // case
474   break;
475   case kmp_sch_guided_analytical_chunked: {
476     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
477                    "kmp_sch_guided_analytical_chunked case\n",
478                    gtid));
479 
480     if (nproc > 1) {
481       if ((2L * chunk + 1) * nproc >= tc) {
482         /* chunk size too large, switch to dynamic */
483         schedule = kmp_sch_dynamic_chunked;
484       } else {
485         /* commonly used term: (2 nproc - 1)/(2 nproc) */
486         DBL x;
487 
488 #if KMP_OS_WINDOWS && KMP_ARCH_X86
489         /* Linux* OS already has 64-bit computation by default for long double,
490            and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
491            Windows* OS on IA-32 architecture, we need to set precision to 64-bit
492            instead of the default 53-bit. Even though long double doesn't work
493            on Windows* OS on Intel(R) 64, the resulting lack of precision is not
494            expected to impact the correctness of the algorithm, but this has not
495            been mathematically proven. */
496         // save original FPCW and set precision to 64-bit, as
497         // Windows* OS on IA-32 architecture defaults to 53-bit
498         unsigned int oldFpcw = _control87(0, 0);
499         _control87(_PC_64, _MCW_PC); // 0,0x30000
500 #endif
501         /* value used for comparison in solver for cross-over point */
502         long double target = ((long double)chunk * 2 + 1) * nproc / tc;
503 
504         /* crossover point--chunk indexes equal to or greater than
505            this point switch to dynamic-style scheduling */
506         UT cross;
507 
508         /* commonly used term: (2 nproc - 1)/(2 nproc) */
509         x = (long double)1.0 - (long double)0.5 / nproc;
510 
511 #ifdef KMP_DEBUG
512         { // test natural alignment
513           struct _test_a {
514             char a;
515             union {
516               char b;
517               DBL d;
518             };
519           } t;
520           ptrdiff_t natural_alignment =
521               (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
522           //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
523           // long)natural_alignment );
524           KMP_DEBUG_ASSERT(
525               (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
526         }
527 #endif // KMP_DEBUG
528 
529         /* save the term in thread private dispatch structure */
530         *(DBL *)&pr->u.p.parm3 = x;
531 
532         /* solve for the crossover point to the nearest integer i for which C_i
533            <= chunk */
534         {
535           UT left, right, mid;
536           long double p;
537 
538           /* estimate initial upper and lower bound */
539 
540           /* doesn't matter what value right is as long as it is positive, but
541              it affects performance of the solver */
542           right = 229;
543           p = __kmp_pow<UT>(x, right);
544           if (p > target) {
545             do {
546               p *= p;
547               right <<= 1;
548             } while (p > target && right < (1 << 27));
549             /* lower bound is previous (failed) estimate of upper bound */
550             left = right >> 1;
551           } else {
552             left = 0;
553           }
554 
555           /* bisection root-finding method */
556           while (left + 1 < right) {
557             mid = (left + right) / 2;
558             if (__kmp_pow<UT>(x, mid) > target) {
559               left = mid;
560             } else {
561               right = mid;
562             }
563           } // while
564           cross = right;
565         }
566         /* assert sanity of computed crossover point */
567         KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
568                    __kmp_pow<UT>(x, cross) <= target);
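        // Worked example (illustrative only): nproc=2, chunk=4, tc=1000
        // gives x = 0.75 and target = 9 * 2 / 1000 = 0.018; the solver
        // yields cross = 14 since 0.75^13 > 0.018 >= 0.75^14, so chunk
        // indexes >= 14 use dynamic-style scheduling.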
569 
570         /* save the crossover point in thread private dispatch structure */
571         pr->u.p.parm2 = cross;
572 
573 // C75803
574 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
575 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
576 #else
577 #define GUIDED_ANALYTICAL_WORKAROUND (x)
578 #endif
579         /* dynamic-style scheduling offset */
580         pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
581                                  tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
582                         cross * chunk;
583 #if KMP_OS_WINDOWS && KMP_ARCH_X86
584         // restore FPCW
585         _control87(oldFpcw, _MCW_PC);
586 #endif
587       } // if
588     } else {
589       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
590                      "kmp_sch_static_greedy\n",
591                      gtid));
592       schedule = kmp_sch_static_greedy;
593       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
594       pr->u.p.parm1 = tc;
595     } // if
596   } // case
597   break;
598   case kmp_sch_static_greedy:
599     KD_TRACE(
600         100,
601         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
602          gtid));
603     pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
604     break;
605   case kmp_sch_static_chunked:
606   case kmp_sch_dynamic_chunked:
607     if (pr->u.p.parm1 <= 0) {
608       pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
609     }
610     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
611                    "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
612                    gtid));
613     break;
614   case kmp_sch_trapezoidal: {
615     /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
616 
617     T parm1, parm2, parm3, parm4;
618     KD_TRACE(100,
619              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
620               gtid));
621 
622     parm1 = chunk;
623 
624     /* F : size of the first cycle */
625     parm2 = (tc / (2 * nproc));
626 
627     if (parm2 < 1) {
628       parm2 = 1;
629     }
630 
631     /* L : size of the last cycle.  Make sure the last cycle is not larger
632        than the first cycle. */
633     if (parm1 < 1) {
634       parm1 = 1;
635     } else if (parm1 > parm2) {
636       parm1 = parm2;
637     }
638 
639     /* N : number of cycles */
640     parm3 = (parm2 + parm1);
641     parm3 = (2 * tc + parm3 - 1) / parm3;
642 
643     if (parm3 < 2) {
644       parm3 = 2;
645     }
646 
    /* sigma : per-cycle decrement of the trapezoid chunk size */
648     parm4 = (parm3 - 1);
649     parm4 = (parm2 - parm1) / parm4;
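    // Worked example (illustrative only): tc=128, nproc=4, chunk=4 gives
    // parm2=16 (first cycle), parm1=4 (minimum cycle), parm3=13 (number of
    // cycles) and parm4=1 (decrement), so successive chunks are 16,15,14,...
    // with the final chunk clipped to the remaining trip count.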
650 
651     // pointless check, because parm4 >= 0 always
652     // if ( parm4 < 0 ) {
653     //    parm4 = 0;
654     //}
655 
656     pr->u.p.parm1 = parm1;
657     pr->u.p.parm2 = parm2;
658     pr->u.p.parm3 = parm3;
659     pr->u.p.parm4 = parm4;
660   } // case
661   break;
662 
663   default: {
664     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
665                 KMP_HNT(GetNewerLibrary), // Hint
666                 __kmp_msg_null // Variadic argument list terminator
667                 );
668   } break;
669   } // switch
670   pr->schedule = schedule;
671 }
672 
673 #if KMP_USE_HIER_SCHED
674 template <typename T>
675 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
676                                              typename traits_t<T>::signed_t st);
677 template <>
678 inline void
679 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
680                                             kmp_int32 ub, kmp_int32 st) {
681   __kmp_dispatch_init_hierarchy<kmp_int32>(
682       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
683       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
684 }
685 template <>
686 inline void
687 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
688                                              kmp_uint32 ub, kmp_int32 st) {
689   __kmp_dispatch_init_hierarchy<kmp_uint32>(
690       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
691       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
692 }
693 template <>
694 inline void
695 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
696                                             kmp_int64 ub, kmp_int64 st) {
697   __kmp_dispatch_init_hierarchy<kmp_int64>(
698       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
699       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
700 }
701 template <>
702 inline void
703 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
704                                              kmp_uint64 ub, kmp_int64 st) {
705   __kmp_dispatch_init_hierarchy<kmp_uint64>(
706       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
707       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
708 }
709 
710 // free all the hierarchy scheduling memory associated with the team
711 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
712   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
713   for (int i = 0; i < num_disp_buff; ++i) {
714     // type does not matter here so use kmp_int32
715     auto sh =
716         reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
717             &team->t.t_disp_buffer[i]);
718     if (sh->hier) {
719       sh->hier->deallocate();
720       __kmp_free(sh->hier);
721     }
722   }
723 }
724 #endif
725 
726 // UT - unsigned flavor of T, ST - signed flavor of T,
727 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
728 template <typename T>
729 static void
730 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
731                     T ub, typename traits_t<T>::signed_t st,
732                     typename traits_t<T>::signed_t chunk, int push_ws) {
733   typedef typename traits_t<T>::unsigned_t UT;
734   typedef typename traits_t<T>::signed_t ST;
735   typedef typename traits_t<T>::floating_t DBL;
736 
737   int active;
738   kmp_info_t *th;
739   kmp_team_t *team;
740   kmp_uint32 my_buffer_index;
741   dispatch_private_info_template<T> *pr;
742   dispatch_shared_info_template<T> volatile *sh;
743 
744   KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
745                    sizeof(dispatch_private_info));
746   KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
747                    sizeof(dispatch_shared_info));
748 
749   if (!TCR_4(__kmp_init_parallel))
750     __kmp_parallel_initialize();
751 
752 #if INCLUDE_SSC_MARKS
753   SSC_MARK_DISPATCH_INIT();
754 #endif
755 #ifdef KMP_DEBUG
756   {
757     char *buff;
758     // create format specifiers before the debug output
759     buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
760                             "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
761                             traits_t<ST>::spec, traits_t<T>::spec,
762                             traits_t<T>::spec, traits_t<ST>::spec);
763     KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
764     __kmp_str_free(&buff);
765   }
766 #endif
767   /* setup data */
768   th = __kmp_threads[gtid];
769   team = th->th.th_team;
770   active = !team->t.t_serialized;
771   th->th.th_ident = loc;
772 
773 #if KMP_USE_HIER_SCHED
  // Initialize the scheduling hierarchy if requested in the OMP_SCHEDULE
  // environment variable.  Hierarchical scheduling does not work with
  // ordered, so if ordered is detected, revert to threaded scheduling.
777   bool ordered;
778   enum sched_type my_sched = schedule;
779   my_buffer_index = th->th.th_dispatch->th_disp_index;
780   pr = reinterpret_cast<dispatch_private_info_template<T> *>(
781       &th->th.th_dispatch
782            ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
783   my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
784   if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
785     my_sched =
786         (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
787   ordered = (kmp_ord_lower & my_sched);
788   if (pr->flags.use_hier) {
789     if (ordered) {
790       KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected.  "
791                      "Disabling hierarchical scheduling.\n",
792                      gtid));
793       pr->flags.use_hier = FALSE;
794     }
795   }
796   if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
797     // Don't use hierarchical for ordered parallel loops and don't
798     // use the runtime hierarchy if one was specified in the program
799     if (!ordered && !pr->flags.use_hier)
800       __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
801   }
802 #endif // KMP_USE_HIER_SCHED
803 
804 #if USE_ITT_BUILD
805   kmp_uint64 cur_chunk = chunk;
806   int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
807                                     __kmp_forkjoin_frames_mode == 3 &&
808                                     KMP_MASTER_GTID(gtid) &&
809 #if OMP_40_ENABLED
810                                     th->th.th_teams_microtask == NULL &&
811 #endif
812                                     team->t.t_active_level == 1;
813 #endif
814   if (!active) {
815     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
816         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
817   } else {
818     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
819                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
820 
821     my_buffer_index = th->th.th_dispatch->th_disp_index++;
822 
823     /* What happens when number of threads changes, need to resize buffer? */
824     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
825         &th->th.th_dispatch
826              ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
827     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
828         &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
829     KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
830                   my_buffer_index));
831   }
832 
833   __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
834 #if USE_ITT_BUILD
835                                 &cur_chunk,
836 #endif
837                                 chunk, (T)th->th.th_team_nproc,
838                                 (T)th->th.th_info.ds.ds_tid);
839   if (active) {
840     if (pr->flags.ordered == 0) {
841       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
842       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
843     } else {
844       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
845       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
846     }
847   }
848 
  // Any half-decent optimizer will remove this test when the blocks are empty
  // since the macros expand to nothing when statistics are disabled.
852   if (schedule == __kmp_static) {
853     KMP_COUNT_BLOCK(OMP_FOR_static);
854     KMP_COUNT_VALUE(FOR_static_iterations, pr->u.p.tc);
855   } else {
856     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
857     KMP_COUNT_VALUE(FOR_dynamic_iterations, pr->u.p.tc);
858   }
859 
860   if (active) {
    /* The buffer is free to use once the shared buffer_index reaches
     * my_buffer_index */
863 
864     KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
865                    "sh->buffer_index:%d\n",
866                    gtid, my_buffer_index, sh->buffer_index));
867     __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index,
868                                  __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    // Note: KMP_WAIT_YIELD() cannot be used here: buffer index and
    // my_buffer_index are *always* 32-bit integers.
871     KMP_MB(); /* is this necessary? */
872     KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
873                    "sh->buffer_index:%d\n",
874                    gtid, my_buffer_index, sh->buffer_index));
875 
876     th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
877     th->th.th_dispatch->th_dispatch_sh_current =
878         CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
879 #if USE_ITT_BUILD
880     if (pr->flags.ordered) {
881       __kmp_itt_ordered_init(gtid);
882     }
883     // Report loop metadata
884     if (itt_need_metadata_reporting) {
885       // Only report metadata by master of active team at level 1
886       kmp_uint64 schedtype = 0;
887       switch (schedule) {
888       case kmp_sch_static_chunked:
889       case kmp_sch_static_balanced: // Chunk is calculated in the switch above
890         break;
891       case kmp_sch_static_greedy:
892         cur_chunk = pr->u.p.parm1;
893         break;
894       case kmp_sch_dynamic_chunked:
895         schedtype = 1;
896         break;
897       case kmp_sch_guided_iterative_chunked:
898       case kmp_sch_guided_analytical_chunked:
899 #if OMP_45_ENABLED
900       case kmp_sch_guided_simd:
901 #endif
902         schedtype = 2;
903         break;
904       default:
905         // Should we put this case under "static"?
906         // case kmp_sch_static_steal:
907         schedtype = 3;
908         break;
909       }
910       __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
911     }
912 #if KMP_USE_HIER_SCHED
913     if (pr->flags.use_hier) {
914       pr->u.p.count = 0;
915       pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
916     }
#endif // KMP_USE_HIER_SCHED
918 #endif /* USE_ITT_BUILD */
919   }
920 
921 #ifdef KMP_DEBUG
922   {
923     char *buff;
924     // create format specifiers before the debug output
925     buff = __kmp_str_format(
926         "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
927         "lb:%%%s ub:%%%s"
928         " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
929         " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
930         traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
931         traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
932         traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
933         traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
934     KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
935                   pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
936                   pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
937                   pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
938     __kmp_str_free(&buff);
939   }
940 #endif
941 #if (KMP_STATIC_STEAL_ENABLED)
  // It cannot be guaranteed that after execution of a loop with some other
  // schedule kind all the parm3 variables will contain the same value. Even if
  // all parm3 were the same, a bad case could still exist, such as the value
  // toggling between 0 and 1 rather than incrementing over the program's
  // lifetime. So a dedicated variable is required; 'static_steal_counter' is
  // used for this purpose.
947   if (schedule == kmp_sch_static_steal) {
    // Other threads will inspect this variable when searching for a victim.
    // Incrementing it flags that other threads may steal from this thread
    // from now on.
951     volatile T *p = &pr->u.p.static_steal_counter;
952     *p = *p + 1;
953   }
954 #endif // ( KMP_STATIC_STEAL_ENABLED )
955 
956 #if OMPT_SUPPORT && OMPT_OPTIONAL
957   if (ompt_enabled.ompt_callback_work) {
958     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
959     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
960     ompt_callbacks.ompt_callback(ompt_callback_work)(
961         ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
962         &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
963   }
964 #endif
965 }
966 
967 /* For ordered loops, either __kmp_dispatch_finish() should be called after
968  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
969  * every chunk of iterations.  If the ordered section(s) were not executed
970  * for this iteration (or every iteration in this chunk), we need to set the
971  * ordered iteration counters so that the next thread can proceed. */
972 template <typename UT>
973 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
974   typedef typename traits_t<UT>::signed_t ST;
975   kmp_info_t *th = __kmp_threads[gtid];
976 
977   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
978   if (!th->th.th_team->t.t_serialized) {
979 
980     dispatch_private_info_template<UT> *pr =
981         reinterpret_cast<dispatch_private_info_template<UT> *>(
982             th->th.th_dispatch->th_dispatch_pr_current);
983     dispatch_shared_info_template<UT> volatile *sh =
984         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
985             th->th.th_dispatch->th_dispatch_sh_current);
986     KMP_DEBUG_ASSERT(pr);
987     KMP_DEBUG_ASSERT(sh);
988     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
989                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
990 
991     if (pr->ordered_bumped) {
992       KD_TRACE(
993           1000,
994           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
995            gtid));
996       pr->ordered_bumped = 0;
997     } else {
998       UT lower = pr->u.p.ordered_lower;
999 
1000 #ifdef KMP_DEBUG
1001       {
1002         char *buff;
1003         // create format specifiers before the debug output
1004         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1005                                 "ordered_iteration:%%%s lower:%%%s\n",
1006                                 traits_t<UT>::spec, traits_t<UT>::spec);
1007         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1008         __kmp_str_free(&buff);
1009       }
1010 #endif
1011 
1012       __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1013                            __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1014       KMP_MB(); /* is this necessary? */
1015 #ifdef KMP_DEBUG
1016       {
1017         char *buff;
1018         // create format specifiers before the debug output
1019         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1020                                 "ordered_iteration:%%%s lower:%%%s\n",
1021                                 traits_t<UT>::spec, traits_t<UT>::spec);
1022         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1023         __kmp_str_free(&buff);
1024       }
1025 #endif
1026 
1027       test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1028     } // if
1029   } // if
1030   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1031 }
1032 
1033 #ifdef KMP_GOMP_COMPAT
1034 
1035 template <typename UT>
1036 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1037   typedef typename traits_t<UT>::signed_t ST;
1038   kmp_info_t *th = __kmp_threads[gtid];
1039 
1040   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1041   if (!th->th.th_team->t.t_serialized) {
1042     //        int cid;
1043     dispatch_private_info_template<UT> *pr =
1044         reinterpret_cast<dispatch_private_info_template<UT> *>(
1045             th->th.th_dispatch->th_dispatch_pr_current);
1046     dispatch_shared_info_template<UT> volatile *sh =
1047         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1048             th->th.th_dispatch->th_dispatch_sh_current);
1049     KMP_DEBUG_ASSERT(pr);
1050     KMP_DEBUG_ASSERT(sh);
1051     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1052                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1053 
1054     //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1055     UT lower = pr->u.p.ordered_lower;
1056     UT upper = pr->u.p.ordered_upper;
1057     UT inc = upper - lower + 1;
1058 
1059     if (pr->ordered_bumped == inc) {
1060       KD_TRACE(
1061           1000,
1062           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1063            gtid));
1064       pr->ordered_bumped = 0;
1065     } else {
1066       inc -= pr->ordered_bumped;
1067 
1068 #ifdef KMP_DEBUG
1069       {
1070         char *buff;
1071         // create format specifiers before the debug output
1072         buff = __kmp_str_format(
1073             "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1074             "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1075             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1076         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1077         __kmp_str_free(&buff);
1078       }
1079 #endif
1080 
1081       __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1082                            __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1083 
1084       KMP_MB(); /* is this necessary? */
1085       KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1086                       "ordered_bumped to zero\n",
1087                       gtid));
1088       pr->ordered_bumped = 0;
1089 //!!!!! TODO check if the inc should be unsigned, or signed???
1090 #ifdef KMP_DEBUG
1091       {
1092         char *buff;
1093         // create format specifiers before the debug output
1094         buff = __kmp_str_format(
1095             "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1096             "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1097             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1098             traits_t<UT>::spec);
1099         KD_TRACE(1000,
1100                  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1101         __kmp_str_free(&buff);
1102       }
1103 #endif
1104 
1105       test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1106     }
1107     //        }
1108   }
1109   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1110 }
1111 
1112 #endif /* KMP_GOMP_COMPAT */
1113 
1114 template <typename T>
1115 int __kmp_dispatch_next_algorithm(int gtid,
1116                                   dispatch_private_info_template<T> *pr,
1117                                   dispatch_shared_info_template<T> volatile *sh,
1118                                   kmp_int32 *p_last, T *p_lb, T *p_ub,
1119                                   typename traits_t<T>::signed_t *p_st, T nproc,
1120                                   T tid) {
1121   typedef typename traits_t<T>::unsigned_t UT;
1122   typedef typename traits_t<T>::signed_t ST;
1123   typedef typename traits_t<T>::floating_t DBL;
1124   int status = 0;
1125   kmp_int32 last = 0;
1126   T start;
1127   ST incr;
1128   UT limit, trip, init;
1129   kmp_info_t *th = __kmp_threads[gtid];
1130   kmp_team_t *team = th->th.th_team;
1131 
1132   KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1133                    &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1134   KMP_DEBUG_ASSERT(pr);
1135   KMP_DEBUG_ASSERT(sh);
1136   KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1137 #ifdef KMP_DEBUG
1138   {
1139     char *buff;
1140     // create format specifiers before the debug output
1141     buff =
1142         __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1143                          "sh:%%p nproc:%%%s tid:%%%s\n",
1144                          traits_t<T>::spec, traits_t<T>::spec);
1145     KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1146     __kmp_str_free(&buff);
1147   }
1148 #endif
1149 
1150   // zero trip count
1151   if (pr->u.p.tc == 0) {
1152     KD_TRACE(10,
1153              ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1154               "zero status:%d\n",
1155               gtid, status));
1156     return 0;
1157   }
1158 
1159   switch (pr->schedule) {
1160 #if (KMP_STATIC_STEAL_ENABLED)
1161   case kmp_sch_static_steal: {
1162     T chunk = pr->u.p.parm1;
1163 
1164     KD_TRACE(100,
1165              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1166               gtid));
1167 
1168     trip = pr->u.p.tc - 1;
1169 
1170     if (traits_t<T>::type_size > 4) {
1171       // use lock for 8-byte and CAS for 4-byte induction
1172       // variable. TODO (optional): check and use 16-byte CAS
1173       kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
1174       KMP_DEBUG_ASSERT(lck != NULL);
1175       if (pr->u.p.count < (UT)pr->u.p.ub) {
1176         __kmp_acquire_lock(lck, gtid);
1177         // try to get own chunk of iterations
1178         init = (pr->u.p.count)++;
1179         status = (init < (UT)pr->u.p.ub);
1180         __kmp_release_lock(lck, gtid);
1181       } else {
1182         status = 0; // no own chunks
1183       }
1184       if (!status) { // try to steal
1185         kmp_info_t **other_threads = team->t.t_threads;
1186         int while_limit = nproc; // nproc attempts to find a victim
1187         int while_index = 0;
        // TODO: the victim-search algorithm should be cleaned up and measured
1190         while ((!status) && (while_limit != ++while_index)) {
1191           T remaining;
1192           T victimIdx = pr->u.p.parm4;
1193           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1194           dispatch_private_info_template<T> *victim =
1195               reinterpret_cast<dispatch_private_info_template<T> *>(
1196                   other_threads[victimIdx]
1197                       ->th.th_dispatch->th_dispatch_pr_current);
1198           while ((victim == NULL || victim == pr ||
1199                   (*(volatile T *)&victim->u.p.static_steal_counter !=
1200                    *(volatile T *)&pr->u.p.static_steal_counter)) &&
1201                  oldVictimIdx != victimIdx) {
1202             victimIdx = (victimIdx + 1) % nproc;
1203             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1204                 other_threads[victimIdx]
1205                     ->th.th_dispatch->th_dispatch_pr_current);
1206           }
1207           if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1208                           *(volatile T *)&pr->u.p.static_steal_counter)) {
1209             continue; // try once more (nproc attempts in total)
1210             // no victim is ready yet to participate in stealing
1211             // because all victims are still in kmp_init_dispatch
1212           }
1213           if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1214             pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1215             continue; // not enough chunks to steal, goto next victim
1216           }
1217 
1218           lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1219           KMP_ASSERT(lck != NULL);
1220           __kmp_acquire_lock(lck, gtid);
1221           limit = victim->u.p.ub; // keep initial ub
1222           if (victim->u.p.count >= limit ||
1223               (remaining = limit - victim->u.p.count) < 2) {
1224             __kmp_release_lock(lck, gtid);
1225             pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1226             continue; // not enough chunks to steal
1227           }
          // stealing succeeded; reduce victim's ub by 1/4 of the undone
          // chunks or by 1
1230           if (remaining > 3) {
1231             // steal 1/4 of remaining
1232             KMP_COUNT_VALUE(FOR_static_steal_stolen, remaining >> 2);
1233             init = (victim->u.p.ub -= (remaining >> 2));
1234           } else {
1235             // steal 1 chunk of 2 or 3 remaining
1236             KMP_COUNT_VALUE(FOR_static_steal_stolen, 1);
1237             init = (victim->u.p.ub -= 1);
1238           }
1239           __kmp_release_lock(lck, gtid);
1240 
1241           KMP_DEBUG_ASSERT(init + 1 <= limit);
1242           pr->u.p.parm4 = victimIdx; // remember victim to steal from
1243           status = 1;
1244           while_index = 0;
          // now update own count and ub with the stolen range; the init
          // chunk is consumed by this dispatch
1246           __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1247           pr->u.p.count = init + 1;
1248           pr->u.p.ub = limit;
1249           __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1250         } // while (search for victim)
1251       } // if (try to find victim and steal)
1252     } else {
1253       // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1254       typedef union {
1255         struct {
1256           UT count;
1257           T ub;
1258         } p;
1259         kmp_int64 b;
1260       } union_i4;
      // All operations on 'count' or 'ub' must be applied atomically to the
      // pair as a whole.
1263       {
1264         union_i4 vold, vnew;
1265         vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1266         vnew = vold;
1267         vnew.p.count++;
1268         while (!KMP_COMPARE_AND_STORE_ACQ64(
1269             (volatile kmp_int64 *)&pr->u.p.count,
1270             *VOLATILE_CAST(kmp_int64 *) & vold.b,
1271             *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1272           KMP_CPU_PAUSE();
1273           vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1274           vnew = vold;
1275           vnew.p.count++;
1276         }
1277         vnew = vold;
1278         init = vnew.p.count;
1279         status = (init < (UT)vnew.p.ub);
1280       }
1281 
1282       if (!status) {
1283         kmp_info_t **other_threads = team->t.t_threads;
1284         int while_limit = nproc; // nproc attempts to find a victim
1285         int while_index = 0;
1286 
        // TODO: the victim-search algorithm should be cleaned up and measured
1289         while ((!status) && (while_limit != ++while_index)) {
1290           union_i4 vold, vnew;
1291           kmp_int32 remaining;
1292           T victimIdx = pr->u.p.parm4;
1293           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1294           dispatch_private_info_template<T> *victim =
1295               reinterpret_cast<dispatch_private_info_template<T> *>(
1296                   other_threads[victimIdx]
1297                       ->th.th_dispatch->th_dispatch_pr_current);
1298           while ((victim == NULL || victim == pr ||
1299                   (*(volatile T *)&victim->u.p.static_steal_counter !=
1300                    *(volatile T *)&pr->u.p.static_steal_counter)) &&
1301                  oldVictimIdx != victimIdx) {
1302             victimIdx = (victimIdx + 1) % nproc;
1303             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1304                 other_threads[victimIdx]
1305                     ->th.th_dispatch->th_dispatch_pr_current);
1306           }
1307           if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1308                           *(volatile T *)&pr->u.p.static_steal_counter)) {
1309             continue; // try once more (nproc attempts in total)
1310             // no victim is ready yet to participate in stealing
1311             // because all victims are still in kmp_init_dispatch
1312           }
1313           pr->u.p.parm4 = victimIdx; // new victim found
1314           while (1) { // CAS loop if victim has enough chunks to steal
1315             vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1316             vnew = vold;
1317 
1318             KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1319             if (vnew.p.count >= (UT)vnew.p.ub ||
1320                 (remaining = vnew.p.ub - vnew.p.count) < 2) {
1321               pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1322               break; // not enough chunks to steal, goto next victim
1323             }
1324             if (remaining > 3) {
1325               vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
1326             } else {
1327               vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1328             }
1329             KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1330             // TODO: Should this be acquire or release?
1331             if (KMP_COMPARE_AND_STORE_ACQ64(
1332                     (volatile kmp_int64 *)&victim->u.p.count,
1333                     *VOLATILE_CAST(kmp_int64 *) & vold.b,
1334                     *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
              // stealing succeeded
1336               KMP_COUNT_VALUE(FOR_static_steal_stolen, vold.p.ub - vnew.p.ub);
1337               status = 1;
1338               while_index = 0;
1339               // now update own count and ub
1340               init = vnew.p.ub;
1341               vold.p.count = init + 1;
1342 #if KMP_ARCH_X86
1343               KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1344 #else
1345               *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1346 #endif
1347               break;
1348             } // if (check CAS result)
            KMP_CPU_PAUSE(); // CAS failed, repeat the attempt
1350           } // while (try to steal from particular victim)
1351         } // while (search for victim)
1352       } // if (try to find victim and steal)
1353     } // if (4-byte induction variable)
1354     if (!status) {
1355       *p_lb = 0;
1356       *p_ub = 0;
1357       if (p_st != NULL)
1358         *p_st = 0;
1359     } else {
1360       start = pr->u.p.parm2;
1361       init *= chunk;
1362       limit = chunk + init - 1;
1363       incr = pr->u.p.st;
1364       KMP_COUNT_VALUE(FOR_static_steal_chunks, 1);
1365 
1366       KMP_DEBUG_ASSERT(init <= trip);
1367       if ((last = (limit >= trip)) != 0)
1368         limit = trip;
1369       if (p_st != NULL)
1370         *p_st = incr;
1371 
1372       if (incr == 1) {
1373         *p_lb = start + init;
1374         *p_ub = start + limit;
1375       } else {
1376         *p_lb = start + init * incr;
1377         *p_ub = start + limit * incr;
1378       }
1379 
1380       if (pr->flags.ordered) {
1381         pr->u.p.ordered_lower = init;
1382         pr->u.p.ordered_upper = limit;
1383       } // if
1384     } // if
1385     break;
1386   } // case
1387 #endif // ( KMP_STATIC_STEAL_ENABLED )
1388   case kmp_sch_static_balanced: {
1389     KD_TRACE(
1390         10,
1391         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1392          gtid));
1393     /* check if thread has any iteration to do */
1394     if ((status = !pr->u.p.count) != 0) {
1395       pr->u.p.count = 1;
1396       *p_lb = pr->u.p.lb;
1397       *p_ub = pr->u.p.ub;
1398       last = pr->u.p.parm1;
1399       if (p_st != NULL)
1400         *p_st = pr->u.p.st;
1401     } else { /* no iterations to do */
1402       pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1403     }
1404   } // case
1405   break;
1406   case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1407                                  merged here */
1408   case kmp_sch_static_chunked: {
1409     T parm1;
1410 
1411     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1412                    "kmp_sch_static_[affinity|chunked] case\n",
1413                    gtid));
1414     parm1 = pr->u.p.parm1;
1415 
1416     trip = pr->u.p.tc - 1;
1417     init = parm1 * (pr->u.p.count + tid);
1418 
1419     if ((status = (init <= trip)) != 0) {
1420       start = pr->u.p.lb;
1421       incr = pr->u.p.st;
1422       limit = parm1 + init - 1;
1423 
1424       if ((last = (limit >= trip)) != 0)
1425         limit = trip;
1426 
1427       if (p_st != NULL)
1428         *p_st = incr;
1429 
1430       pr->u.p.count += nproc;
1431 
1432       if (incr == 1) {
1433         *p_lb = start + init;
1434         *p_ub = start + limit;
1435       } else {
1436         *p_lb = start + init * incr;
1437         *p_ub = start + limit * incr;
1438       }
1439 
1440       if (pr->flags.ordered) {
1441         pr->u.p.ordered_lower = init;
1442         pr->u.p.ordered_upper = limit;
1443       } // if
1444     } // if
1445   } // case
1446   break;
1447 
1448   case kmp_sch_dynamic_chunked: {
1449     T chunk = pr->u.p.parm1;
1450 
1451     KD_TRACE(
1452         100,
1453         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1454          gtid));
1455 
1456     init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1457     trip = pr->u.p.tc - 1;
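    // Each fetch-and-increment of the shared iteration counter claims one
    // chunk.  E.g., with chunk == 3 a returned counter value of 2 yields
    // init == 6 and iterations 6..8 of the canonicalized [0, tc) space.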
1458 
1459     if ((status = (init <= trip)) == 0) {
1460       *p_lb = 0;
1461       *p_ub = 0;
1462       if (p_st != NULL)
1463         *p_st = 0;
1464     } else {
1465       start = pr->u.p.lb;
1466       limit = chunk + init - 1;
1467       incr = pr->u.p.st;
1468 
1469       if ((last = (limit >= trip)) != 0)
1470         limit = trip;
1471 
1472       if (p_st != NULL)
1473         *p_st = incr;
1474 
1475       if (incr == 1) {
1476         *p_lb = start + init;
1477         *p_ub = start + limit;
1478       } else {
1479         *p_lb = start + init * incr;
1480         *p_ub = start + limit * incr;
1481       }
1482 
1483       if (pr->flags.ordered) {
1484         pr->u.p.ordered_lower = init;
1485         pr->u.p.ordered_upper = limit;
1486       } // if
1487     } // if
1488   } // case
1489   break;
1490 
1491   case kmp_sch_guided_iterative_chunked: {
1492     T chunkspec = pr->u.p.parm1;
1493     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1494                    "iterative case\n",
1495                    gtid));
1496     trip = pr->u.p.tc;
1497     // Start atomic part of calculations
1498     while (1) {
1499       ST remaining; // signed, because can be < 0
1500       init = sh->u.s.iteration; // shared value
1501       remaining = trip - init;
1502       if (remaining <= 0) { // AC: need to compare with 0 first
1503         // nothing to do, don't try atomic op
1504         status = 0;
1505         break;
1506       }
1507       if ((T)remaining <
1508           pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
        // use dynamic-style schedule
        // atomically increment iterations, get old value
1511         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1512                                  (ST)chunkspec);
1513         remaining = trip - init;
1514         if (remaining <= 0) {
          status = 0; // all iterations were claimed by other threads
1516         } else {
1517           // got some iterations to work on
1518           status = 1;
1519           if ((T)remaining > chunkspec) {
1520             limit = init + chunkspec - 1;
1521           } else {
1522             last = 1; // the last chunk
1523             limit = init + remaining - 1;
1524           } // if
1525         } // if
1526         break;
1527       } // if
1528       limit = init +
1529               (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
1530       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1531                                (ST)init, (ST)limit)) {
1532         // CAS was successful, chunk obtained
1533         status = 1;
1534         --limit;
1535         break;
1536       } // if
1537     } // while
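    // Illustrative decay (assuming the default K == 2 mentioned above): with
    // nproc == 8, parm3 holds ~1/16, so remaining == 1600 makes the CAS path
    // claim ~100 iterations; successive grabs shrink geometrically until
    // remaining falls below parm2 and the dynamic-style branch takes over.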
1538     if (status != 0) {
1539       start = pr->u.p.lb;
1540       incr = pr->u.p.st;
1541       if (p_st != NULL)
1542         *p_st = incr;
1543       *p_lb = start + init * incr;
1544       *p_ub = start + limit * incr;
1545       if (pr->flags.ordered) {
1546         pr->u.p.ordered_lower = init;
1547         pr->u.p.ordered_upper = limit;
1548       } // if
1549     } else {
1550       *p_lb = 0;
1551       *p_ub = 0;
1552       if (p_st != NULL)
1553         *p_st = 0;
1554     } // if
1555   } // case
1556   break;
1557 
1558 #if OMP_45_ENABLED
1559   case kmp_sch_guided_simd: {
    // same as iterative, but the current chunk is adjusted to be a
    // multiple of the given chunk
1562     T chunk = pr->u.p.parm1;
1563     KD_TRACE(100,
1564              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1565               gtid));
1566     trip = pr->u.p.tc;
1567     // Start atomic part of calculations
1568     while (1) {
1569       ST remaining; // signed, because can be < 0
1570       init = sh->u.s.iteration; // shared value
1571       remaining = trip - init;
1572       if (remaining <= 0) { // AC: need to compare with 0 first
1573         status = 0; // nothing to do, don't try atomic op
1574         break;
1575       }
1576       KMP_DEBUG_ASSERT(init % chunk == 0);
1577       // compare with K*nproc*(chunk+1), K=2 by default
1578       if ((T)remaining < pr->u.p.parm2) {
        // use dynamic-style schedule
        // atomically increment iterations, get old value
1581         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1582                                  (ST)chunk);
1583         remaining = trip - init;
1584         if (remaining <= 0) {
          status = 0; // all iterations were claimed by other threads
1586         } else {
1587           // got some iterations to work on
1588           status = 1;
1589           if ((T)remaining > chunk) {
1590             limit = init + chunk - 1;
1591           } else {
1592             last = 1; // the last chunk
1593             limit = init + remaining - 1;
1594           } // if
1595         } // if
1596         break;
1597       } // if
1598       // divide by K*nproc
1599       UT span = remaining * (*(double *)&pr->u.p.parm3);
1600       UT rem = span % chunk;
1601       if (rem) // adjust so that span%chunk == 0
1602         span += chunk - rem;
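      // e.g., chunk == 8 and span == 52 gives rem == 4, so span is rounded
      // up to 56, keeping the shared counter a multiple of chunk (see the
      // assertion on init above)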
1603       limit = init + span;
1604       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1605                                (ST)init, (ST)limit)) {
1606         // CAS was successful, chunk obtained
1607         status = 1;
1608         --limit;
1609         break;
1610       } // if
1611     } // while
1612     if (status != 0) {
1613       start = pr->u.p.lb;
1614       incr = pr->u.p.st;
1615       if (p_st != NULL)
1616         *p_st = incr;
1617       *p_lb = start + init * incr;
1618       *p_ub = start + limit * incr;
1619       if (pr->flags.ordered) {
1620         pr->u.p.ordered_lower = init;
1621         pr->u.p.ordered_upper = limit;
1622       } // if
1623     } else {
1624       *p_lb = 0;
1625       *p_ub = 0;
1626       if (p_st != NULL)
1627         *p_st = 0;
1628     } // if
1629   } // case
1630   break;
1631 #endif // OMP_45_ENABLED
1632 
1633   case kmp_sch_guided_analytical_chunked: {
1634     T chunkspec = pr->u.p.parm1;
1635     UT chunkIdx;
1636 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1637     /* for storing original FPCW value for Windows* OS on
1638        IA-32 architecture 8-byte version */
1639     unsigned int oldFpcw;
1640     unsigned int fpcwSet = 0;
1641 #endif
1642     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1643                    "kmp_sch_guided_analytical_chunked case\n",
1644                    gtid));
1645 
1646     trip = pr->u.p.tc;
1647 
1648     KMP_DEBUG_ASSERT(nproc > 1);
1649     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1650 
1651     while (1) { /* this while loop is a safeguard against unexpected zero
1652                    chunk sizes */
1653       chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1654       if (chunkIdx >= (UT)pr->u.p.parm2) {
1655         --trip;
1656         /* use dynamic-style scheduling */
1657         init = chunkIdx * chunkspec + pr->u.p.count;
1658         /* need to verify init > 0 in case of overflow in the above
1659          * calculation */
1660         if ((status = (init > 0 && init <= trip)) != 0) {
1661           limit = init + chunkspec - 1;
1662 
1663           if ((last = (limit >= trip)) != 0)
1664             limit = trip;
1665         }
1666         break;
1667       } else {
/* use exponential-style scheduling */
/* The following check works around the lack of long double precision on
   Windows* OS, which can otherwise cause init != 0 for chunkIdx == 0. */
1673 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1674         /* If we haven't already done so, save original
1675            FPCW and set precision to 64-bit, as Windows* OS
1676            on IA-32 architecture defaults to 53-bit */
1677         if (!fpcwSet) {
1678           oldFpcw = _control87(0, 0);
1679           _control87(_PC_64, _MCW_PC);
1680           fpcwSet = 0x30000;
1681         }
1682 #endif
1683         if (chunkIdx) {
1684           init = __kmp_dispatch_guided_remaining<T>(
1685               trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1686           KMP_DEBUG_ASSERT(init);
1687           init = trip - init;
1688         } else
1689           init = 0;
1690         limit = trip - __kmp_dispatch_guided_remaining<T>(
1691                            trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
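        /* init and limit come from the same closed form: chunk k starts at
           trip - __kmp_dispatch_guided_remaining(trip, parm3, k), and the
           --limit below makes it end one iteration before the start of
           chunk k + 1, so consecutive chunks tile the iteration space. */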
1692         KMP_ASSERT(init <= limit);
1693         if (init < limit) {
1694           KMP_DEBUG_ASSERT(limit <= trip);
1695           --limit;
1696           status = 1;
1697           break;
1698         } // if
1699       } // if
1700     } // while (1)
1701 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1702     /* restore FPCW if necessary
1703        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1704     */
1705     if (fpcwSet && (oldFpcw & fpcwSet))
1706       _control87(oldFpcw, _MCW_PC);
1707 #endif
1708     if (status != 0) {
1709       start = pr->u.p.lb;
1710       incr = pr->u.p.st;
1711       if (p_st != NULL)
1712         *p_st = incr;
1713       *p_lb = start + init * incr;
1714       *p_ub = start + limit * incr;
1715       if (pr->flags.ordered) {
1716         pr->u.p.ordered_lower = init;
1717         pr->u.p.ordered_upper = limit;
1718       }
1719     } else {
1720       *p_lb = 0;
1721       *p_ub = 0;
1722       if (p_st != NULL)
1723         *p_st = 0;
1724     }
1725   } // case
1726   break;
1727 
1728   case kmp_sch_trapezoidal: {
1729     UT index;
1730     T parm2 = pr->u.p.parm2;
1731     T parm3 = pr->u.p.parm3;
1732     T parm4 = pr->u.p.parm4;
1733     KD_TRACE(100,
1734              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1735               gtid));
1736 
1737     index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1738 
1739     init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
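    // init is the closed-form prefix sum of the linearly shrinking chunk
    // sizes parm2, parm2 - parm4, parm2 - 2*parm4, ...:
    //   sum_{k=0}^{index-1} (parm2 - k*parm4)
    //     == index * (2*parm2 - (index - 1)*parm4) / 2
    // so chunk number "index" starts right after all earlier chunks.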
1740     trip = pr->u.p.tc - 1;
1741 
1742     if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1743       *p_lb = 0;
1744       *p_ub = 0;
1745       if (p_st != NULL)
1746         *p_st = 0;
1747     } else {
1748       start = pr->u.p.lb;
1749       limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1750       incr = pr->u.p.st;
1751 
1752       if ((last = (limit >= trip)) != 0)
1753         limit = trip;
1754 
1755       if (p_st != NULL)
1756         *p_st = incr;
1757 
1758       if (incr == 1) {
1759         *p_lb = start + init;
1760         *p_ub = start + limit;
1761       } else {
1762         *p_lb = start + init * incr;
1763         *p_ub = start + limit * incr;
1764       }
1765 
1766       if (pr->flags.ordered) {
1767         pr->u.p.ordered_lower = init;
1768         pr->u.p.ordered_upper = limit;
1769       } // if
1770     } // if
1771   } // case
1772   break;
1773   default: {
    status = 0; // avoid complaints about uninitialized variable use
1775     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1776                 KMP_HNT(GetNewerLibrary), // Hint
1777                 __kmp_msg_null // Variadic argument list terminator
1778                 );
1779   } break;
1780   } // switch
1781   if (p_last)
1782     *p_last = last;
1783 #ifdef KMP_DEBUG
1784   if (pr->flags.ordered) {
1785     char *buff;
1786     // create format specifiers before the debug output
1787     buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1788                             "ordered_lower:%%%s ordered_upper:%%%s\n",
1789                             traits_t<UT>::spec, traits_t<UT>::spec);
1790     KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1791     __kmp_str_free(&buff);
1792   }
1793   {
1794     char *buff;
1795     // create format specifiers before the debug output
1796     buff = __kmp_str_format(
1797         "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1798         "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1799         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1800     KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1801     __kmp_str_free(&buff);
1802   }
1803 #endif
1804   return status;
1805 }
1806 
1807 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1808    work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1809    is not called. */
1810 #if OMPT_SUPPORT && OMPT_OPTIONAL
1811 #define OMPT_LOOP_END                                                          \
1812   if (status == 0) {                                                           \
1813     if (ompt_enabled.ompt_callback_work) {                                     \
1814       ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
1815       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
1816       ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
1817           ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
1818           &(task_info->task_data), 0, codeptr);                                \
1819     }                                                                          \
1820   }
1821 // TODO: implement count
1822 #else
1823 #define OMPT_LOOP_END // no-op
1824 #endif
1825 
1826 template <typename T>
1827 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1828                                T *p_lb, T *p_ub,
1829                                typename traits_t<T>::signed_t *p_st
1830 #if OMPT_SUPPORT && OMPT_OPTIONAL
1831                                ,
1832                                void *codeptr
1833 #endif
1834                                ) {
1835 
1836   typedef typename traits_t<T>::unsigned_t UT;
1837   typedef typename traits_t<T>::signed_t ST;
1838   typedef typename traits_t<T>::floating_t DBL;
  // This is potentially slightly misleading: schedule(runtime) will appear
  // here even if the actual runtime schedule is static. (Which points out a
  // disadvantage of schedule(runtime): even when static scheduling is used,
  // it costs more than a compile-time choice to use static scheduling would.)
1843   KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling);
1844 
1845   int status;
1846   dispatch_private_info_template<T> *pr;
1847   kmp_info_t *th = __kmp_threads[gtid];
1848   kmp_team_t *team = th->th.th_team;
1849 
1850   KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1851   KD_TRACE(
1852       1000,
1853       ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
1854        gtid, p_lb, p_ub, p_st, p_last));
1855 
1856   if (team->t.t_serialized) {
    /* NOTE: serialize this dispatch because we are not at the active level */
1858     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1859         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1860     KMP_DEBUG_ASSERT(pr);
1861 
1862     if ((status = (pr->u.p.tc != 0)) == 0) {
1863       *p_lb = 0;
1864       *p_ub = 0;
1865       //            if ( p_last != NULL )
1866       //                *p_last = 0;
1867       if (p_st != NULL)
1868         *p_st = 0;
1869       if (__kmp_env_consistency_check) {
1870         if (pr->pushed_ws != ct_none) {
1871           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1872         }
1873       }
1874     } else if (pr->flags.nomerge) {
1875       kmp_int32 last;
1876       T start;
1877       UT limit, trip, init;
1878       ST incr;
1879       T chunk = pr->u.p.parm1;
1880 
1881       KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1882                      gtid));
1883 
1884       init = chunk * pr->u.p.count++;
1885       trip = pr->u.p.tc - 1;
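      // With a serialized team the private count alone hands out chunks in
      // order; e.g., chunk == 4 yields iterations 0..3 on the first call and
      // 4..7 on the next (before scaling by lb and st).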
1886 
1887       if ((status = (init <= trip)) == 0) {
1888         *p_lb = 0;
1889         *p_ub = 0;
1890         //                if ( p_last != NULL )
1891         //                    *p_last = 0;
1892         if (p_st != NULL)
1893           *p_st = 0;
1894         if (__kmp_env_consistency_check) {
1895           if (pr->pushed_ws != ct_none) {
1896             pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1897           }
1898         }
1899       } else {
1900         start = pr->u.p.lb;
1901         limit = chunk + init - 1;
1902         incr = pr->u.p.st;
1903 
1904         if ((last = (limit >= trip)) != 0) {
1905           limit = trip;
1906 #if KMP_OS_WINDOWS
1907           pr->u.p.last_upper = pr->u.p.ub;
1908 #endif /* KMP_OS_WINDOWS */
1909         }
1910         if (p_last != NULL)
1911           *p_last = last;
1912         if (p_st != NULL)
1913           *p_st = incr;
1914         if (incr == 1) {
1915           *p_lb = start + init;
1916           *p_ub = start + limit;
1917         } else {
1918           *p_lb = start + init * incr;
1919           *p_ub = start + limit * incr;
1920         }
1921 
1922         if (pr->flags.ordered) {
1923           pr->u.p.ordered_lower = init;
1924           pr->u.p.ordered_upper = limit;
1925 #ifdef KMP_DEBUG
1926           {
1927             char *buff;
1928             // create format specifiers before the debug output
1929             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1930                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
1931                                     traits_t<UT>::spec, traits_t<UT>::spec);
1932             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1933                             pr->u.p.ordered_upper));
1934             __kmp_str_free(&buff);
1935           }
1936 #endif
1937         } // if
1938       } // if
1939     } else {
1940       pr->u.p.tc = 0;
1941       *p_lb = pr->u.p.lb;
1942       *p_ub = pr->u.p.ub;
1943 #if KMP_OS_WINDOWS
1944       pr->u.p.last_upper = *p_ub;
1945 #endif /* KMP_OS_WINDOWS */
1946       if (p_last != NULL)
1947         *p_last = TRUE;
1948       if (p_st != NULL)
1949         *p_st = pr->u.p.st;
1950     } // if
1951 #ifdef KMP_DEBUG
1952     {
1953       char *buff;
1954       // create format specifiers before the debug output
1955       buff = __kmp_str_format(
1956           "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
1957           "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
1958           traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1959       KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
1960       __kmp_str_free(&buff);
1961     }
1962 #endif
1963 #if INCLUDE_SSC_MARKS
1964     SSC_MARK_DISPATCH_NEXT();
1965 #endif
1966     OMPT_LOOP_END;
1967     return status;
1968   } else {
1969     kmp_int32 last = 0;
1970     dispatch_shared_info_template<T> volatile *sh;
1971 
1972     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1973                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1974 
1975     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1976         th->th.th_dispatch->th_dispatch_pr_current);
1977     KMP_DEBUG_ASSERT(pr);
1978     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
1979         th->th.th_dispatch->th_dispatch_sh_current);
1980     KMP_DEBUG_ASSERT(sh);
1981 
1982 #if KMP_USE_HIER_SCHED
1983     if (pr->flags.use_hier)
1984       status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
1985     else
1986 #endif // KMP_USE_HIER_SCHED
1987       status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
1988                                                 p_st, th->th.th_team_nproc,
1989                                                 th->th.th_info.ds.ds_tid);
1990     // status == 0: no more iterations to execute
1991     if (status == 0) {
1992       UT num_done;
1993 
1994       num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
1995 #ifdef KMP_DEBUG
1996       {
1997         char *buff;
1998         // create format specifiers before the debug output
1999         buff = __kmp_str_format(
2000             "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2001             traits_t<UT>::spec);
2002         KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2003         __kmp_str_free(&buff);
2004       }
2005 #endif
2006 
2007 #if KMP_USE_HIER_SCHED
2008       pr->flags.use_hier = FALSE;
2009 #endif
2010       if ((ST)num_done == th->th.th_team_nproc - 1) {
2011 #if (KMP_STATIC_STEAL_ENABLED)
2012         if (pr->schedule == kmp_sch_static_steal &&
2013             traits_t<T>::type_size > 4) {
2014           int i;
2015           kmp_info_t **other_threads = team->t.t_threads;
2016           // loop complete, safe to destroy locks used for stealing
2017           for (i = 0; i < th->th.th_team_nproc; ++i) {
2018             kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2019             KMP_ASSERT(lck != NULL);
2020             __kmp_destroy_lock(lck);
2021             __kmp_free(lck);
2022             other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2023           }
2024         }
2025 #endif
2026         /* NOTE: release this buffer to be reused */
2027 
2028         KMP_MB(); /* Flush all pending memory write invalidates.  */
2029 
2030         sh->u.s.num_done = 0;
2031         sh->u.s.iteration = 0;
2032 
2033         /* TODO replace with general release procedure? */
2034         if (pr->flags.ordered) {
2035           sh->u.s.ordered_iteration = 0;
2036         }
2037 
2038         KMP_MB(); /* Flush all pending memory write invalidates.  */
2039 
2040         sh->buffer_index += __kmp_dispatch_num_buffers;
2041         KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2042                        gtid, sh->buffer_index));
2043 
2044         KMP_MB(); /* Flush all pending memory write invalidates.  */
2045 
2046       } // if
2047       if (__kmp_env_consistency_check) {
2048         if (pr->pushed_ws != ct_none) {
2049           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2050         }
2051       }
2052 
2053       th->th.th_dispatch->th_deo_fcn = NULL;
2054       th->th.th_dispatch->th_dxo_fcn = NULL;
2055       th->th.th_dispatch->th_dispatch_sh_current = NULL;
2056       th->th.th_dispatch->th_dispatch_pr_current = NULL;
2057     } // if (status == 0)
2058 #if KMP_OS_WINDOWS
2059     else if (last) {
2060       pr->u.p.last_upper = pr->u.p.ub;
2061     }
2062 #endif /* KMP_OS_WINDOWS */
2063     if (p_last != NULL && status != 0)
2064       *p_last = last;
2065   } // if
2066 
2067 #ifdef KMP_DEBUG
2068   {
2069     char *buff;
2070     // create format specifiers before the debug output
2071     buff = __kmp_str_format(
2072         "__kmp_dispatch_next: T#%%d normal case: "
2073         "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2074         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2075     KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2076                   (p_last ? *p_last : 0), status));
2077     __kmp_str_free(&buff);
2078   }
2079 #endif
2080 #if INCLUDE_SSC_MARKS
2081   SSC_MARK_DISPATCH_NEXT();
2082 #endif
2083   OMPT_LOOP_END;
2084   return status;
2085 }
2086 
2087 template <typename T>
2088 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2089                                   kmp_int32 *plastiter, T *plower, T *pupper,
2090                                   typename traits_t<T>::signed_t incr) {
2091   typedef typename traits_t<T>::unsigned_t UT;
2092   typedef typename traits_t<T>::signed_t ST;
2093   kmp_uint32 team_id;
2094   kmp_uint32 nteams;
2095   UT trip_count;
2096   kmp_team_t *team;
2097   kmp_info_t *th;
2098 
2099   KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2100   KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2101 #ifdef KMP_DEBUG
2102   {
2103     char *buff;
2104     // create format specifiers before the debug output
2105     buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2106                             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2107                             traits_t<T>::spec, traits_t<T>::spec,
2108                             traits_t<ST>::spec, traits_t<T>::spec);
2109     KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2110     __kmp_str_free(&buff);
2111   }
2112 #endif
2113 
2114   if (__kmp_env_consistency_check) {
2115     if (incr == 0) {
2116       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2117                             loc);
2118     }
2119     if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2120       // The loop is illegal.
      // Some zero-trip loops are maintained by the compiler, e.g.:
2122       //   for(i=10;i<0;++i) // lower >= upper - run-time check
2123       //   for(i=0;i>10;--i) // lower <= upper - run-time check
2124       //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2125       //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2126       // Compiler does not check the following illegal loops:
2127       //   for(i=0;i<10;i+=incr) // where incr<0
2128       //   for(i=10;i>0;i-=incr) // where incr<0
2129       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2130     }
2131   }
2132   th = __kmp_threads[gtid];
2133   team = th->th.th_team;
2134 #if OMP_40_ENABLED
2135   KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2136   nteams = th->th.th_teams_size.nteams;
2137 #endif
2138   team_id = team->t.t_master_tid;
2139   KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2140 
2141   // compute global trip count
2142   if (incr == 1) {
2143     trip_count = *pupper - *plower + 1;
2144   } else if (incr == -1) {
2145     trip_count = *plower - *pupper + 1;
2146   } else if (incr > 0) {
2147     // upper-lower can exceed the limit of signed type
2148     trip_count = (UT)(*pupper - *plower) / incr + 1;
2149   } else {
2150     trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2151   }
2152 
2153   if (trip_count <= nteams) {
2154     KMP_DEBUG_ASSERT(
2155         __kmp_static == kmp_sch_static_greedy ||
2156         __kmp_static ==
2157             kmp_sch_static_balanced); // Unknown static scheduling type.
    // only some teams get a single iteration; the others get none
2159     if (team_id < trip_count) {
2160       *pupper = *plower = *plower + team_id * incr;
2161     } else {
2162       *plower = *pupper + incr; // zero-trip loop
2163     }
2164     if (plastiter != NULL)
2165       *plastiter = (team_id == trip_count - 1);
2166   } else {
2167     if (__kmp_static == kmp_sch_static_balanced) {
2168       UT chunk = trip_count / nteams;
2169       UT extras = trip_count % nteams;
2170       *plower +=
2171           incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2172       *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2173       if (plastiter != NULL)
2174         *plastiter = (team_id == nteams - 1);
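      // e.g., trip_count == 10, nteams == 4: chunk == 2, extras == 2, so
      // teams 0 and 1 cover 3 iterations each and teams 2 and 3 cover 2,
      // starting (for incr == 1) at offsets 0, 3, 6 and 8.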
2175     } else {
2176       T chunk_inc_count =
2177           (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2178       T upper = *pupper;
2179       KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2180       // Unknown static scheduling type.
2181       *plower += team_id * chunk_inc_count;
2182       *pupper = *plower + chunk_inc_count - incr;
2183       // Check/correct bounds if needed
2184       if (incr > 0) {
2185         if (*pupper < *plower)
2186           *pupper = traits_t<T>::max_value;
2187         if (plastiter != NULL)
2188           *plastiter = *plower <= upper && *pupper > upper - incr;
2189         if (*pupper > upper)
2190           *pupper = upper; // tracker C73258
2191       } else {
2192         if (*pupper > *plower)
2193           *pupper = traits_t<T>::min_value;
2194         if (plastiter != NULL)
2195           *plastiter = *plower >= upper && *pupper < upper - incr;
2196         if (*pupper < upper)
2197           *pupper = upper; // tracker C73258
2198       }
2199     }
2200   }
2201 }
2202 
2203 //-----------------------------------------------------------------------------
2204 // Dispatch routines
2205 //    Transfer call to template< type T >
2206 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2207 //                         T lb, T ub, ST st, ST chunk )
2208 extern "C" {
2209 
2210 /*!
2211 @ingroup WORK_SHARING
2212 @{
2213 @param loc Source location
2214 @param gtid Global thread id
2215 @param schedule Schedule type
2216 @param lb  Lower bound
2217 @param ub  Upper bound
2218 @param st  Step (or increment if you prefer)
2219 @param chunk The chunk size to block with
2220 
2221 This function prepares the runtime to start a dynamically scheduled for loop,
2222 saving the loop arguments.
2223 These functions are all identical apart from the types of the arguments.
2224 */
2225 
2226 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2227                             enum sched_type schedule, kmp_int32 lb,
2228                             kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2229   KMP_DEBUG_ASSERT(__kmp_init_serial);
2230 #if OMPT_SUPPORT && OMPT_OPTIONAL
2231   OMPT_STORE_RETURN_ADDRESS(gtid);
2232 #endif
2233   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2234 }
2235 /*!
2236 See @ref __kmpc_dispatch_init_4
2237 */
2238 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2239                              enum sched_type schedule, kmp_uint32 lb,
2240                              kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2241   KMP_DEBUG_ASSERT(__kmp_init_serial);
2242 #if OMPT_SUPPORT && OMPT_OPTIONAL
2243   OMPT_STORE_RETURN_ADDRESS(gtid);
2244 #endif
2245   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2246 }
2247 
2248 /*!
2249 See @ref __kmpc_dispatch_init_4
2250 */
2251 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2252                             enum sched_type schedule, kmp_int64 lb,
2253                             kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2254   KMP_DEBUG_ASSERT(__kmp_init_serial);
2255 #if OMPT_SUPPORT && OMPT_OPTIONAL
2256   OMPT_STORE_RETURN_ADDRESS(gtid);
2257 #endif
2258   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2259 }
2260 
2261 /*!
2262 See @ref __kmpc_dispatch_init_4
2263 */
2264 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2265                              enum sched_type schedule, kmp_uint64 lb,
2266                              kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2267   KMP_DEBUG_ASSERT(__kmp_init_serial);
2268 #if OMPT_SUPPORT && OMPT_OPTIONAL
2269   OMPT_STORE_RETURN_ADDRESS(gtid);
2270 #endif
2271   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2272 }
2273 
2274 /*!
2275 See @ref __kmpc_dispatch_init_4
2276 
These functions differ from the __kmpc_dispatch_init set in that they are
called for the composite distribute parallel for construct. Thus, before
dispatching the regular iterations, we need to compute the per-team
iteration space.
2280 
2281 These functions are all identical apart from the types of the arguments.
2282 */
2283 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2284                                  enum sched_type schedule, kmp_int32 *p_last,
2285                                  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2286                                  kmp_int32 chunk) {
2287   KMP_DEBUG_ASSERT(__kmp_init_serial);
2288 #if OMPT_SUPPORT && OMPT_OPTIONAL
2289   OMPT_STORE_RETURN_ADDRESS(gtid);
2290 #endif
2291   __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2292   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2293 }
2294 
2295 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2296                                   enum sched_type schedule, kmp_int32 *p_last,
2297                                   kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2298                                   kmp_int32 chunk) {
2299   KMP_DEBUG_ASSERT(__kmp_init_serial);
2300 #if OMPT_SUPPORT && OMPT_OPTIONAL
2301   OMPT_STORE_RETURN_ADDRESS(gtid);
2302 #endif
2303   __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2304   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2305 }
2306 
2307 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2308                                  enum sched_type schedule, kmp_int32 *p_last,
2309                                  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2310                                  kmp_int64 chunk) {
2311   KMP_DEBUG_ASSERT(__kmp_init_serial);
2312 #if OMPT_SUPPORT && OMPT_OPTIONAL
2313   OMPT_STORE_RETURN_ADDRESS(gtid);
2314 #endif
2315   __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2316   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2317 }
2318 
2319 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2320                                   enum sched_type schedule, kmp_int32 *p_last,
2321                                   kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2322                                   kmp_int64 chunk) {
2323   KMP_DEBUG_ASSERT(__kmp_init_serial);
2324 #if OMPT_SUPPORT && OMPT_OPTIONAL
2325   OMPT_STORE_RETURN_ADDRESS(gtid);
2326 #endif
2327   __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2328   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2329 }
2330 
2331 /*!
2332 @param loc Source code location
2333 @param gtid Global thread id
2334 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2335 otherwise
2336 @param p_lb   Pointer to the lower bound for the next chunk of work
2337 @param p_ub   Pointer to the upper bound for the next chunk of work
2338 @param p_st   Pointer to the stride for the next chunk of work
2339 @return one if there is work to be done, zero otherwise
2340 
2341 Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then lb, ub and stride need not be modified.
2343 */
2344 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2345                            kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2346 #if OMPT_SUPPORT && OMPT_OPTIONAL
2347   OMPT_STORE_RETURN_ADDRESS(gtid);
2348 #endif
2349   return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2350 #if OMPT_SUPPORT && OMPT_OPTIONAL
2351                                         ,
2352                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2353 #endif
2354                                             );
2355 }
2356 
2357 /*!
2358 See @ref __kmpc_dispatch_next_4
2359 */
2360 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2361                             kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2362                             kmp_int32 *p_st) {
2363 #if OMPT_SUPPORT && OMPT_OPTIONAL
2364   OMPT_STORE_RETURN_ADDRESS(gtid);
2365 #endif
2366   return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2367 #if OMPT_SUPPORT && OMPT_OPTIONAL
2368                                          ,
2369                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2370 #endif
2371                                              );
2372 }
2373 
2374 /*!
2375 See @ref __kmpc_dispatch_next_4
2376 */
2377 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2378                            kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2379 #if OMPT_SUPPORT && OMPT_OPTIONAL
2380   OMPT_STORE_RETURN_ADDRESS(gtid);
2381 #endif
2382   return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2383 #if OMPT_SUPPORT && OMPT_OPTIONAL
2384                                         ,
2385                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2386 #endif
2387                                             );
2388 }
2389 
2390 /*!
2391 See @ref __kmpc_dispatch_next_4
2392 */
2393 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2394                             kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2395                             kmp_int64 *p_st) {
2396 #if OMPT_SUPPORT && OMPT_OPTIONAL
2397   OMPT_STORE_RETURN_ADDRESS(gtid);
2398 #endif
2399   return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2400 #if OMPT_SUPPORT && OMPT_OPTIONAL
2401                                          ,
2402                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2403 #endif
2404                                              );
2405 }
2406 
2407 /*!
2408 @param loc Source code location
2409 @param gtid Global thread id
2410 
2411 Mark the end of a dynamic loop.
2412 */
2413 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2414   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2415 }
2416 
2417 /*!
2418 See @ref __kmpc_dispatch_fini_4
2419 */
2420 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2421   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2422 }
2423 
2424 /*!
2425 See @ref __kmpc_dispatch_fini_4
2426 */
2427 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2428   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2429 }
2430 
2431 /*!
2432 See @ref __kmpc_dispatch_fini_4
2433 */
2434 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2435   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2436 }
2437 /*! @} */
2438 
2439 //-----------------------------------------------------------------------------
2440 // Non-template routines from kmp_dispatch.cpp used in other sources
2441 
2442 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2443   return value == checker;
2444 }
2445 
2446 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2447   return value != checker;
2448 }
2449 
2450 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2451   return value < checker;
2452 }
2453 
2454 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2455   return value >= checker;
2456 }
2457 
2458 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2459   return value <= checker;
2460 }
2461 
2462 kmp_uint32
2463 __kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2464                    kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2465                    void *obj // Higher-level synchronization object, or NULL.
2466                    ) {
2467   // note: we may not belong to a team at this point
2468   volatile kmp_uint32 *spin = spinner;
2469   kmp_uint32 check = checker;
2470   kmp_uint32 spins;
2471   kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2472   kmp_uint32 r;
2473 
2474   KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2475   KMP_INIT_YIELD(spins);
2476   // main wait spin loop
2477   while (!f(r = TCR_4(*spin), check)) {
2478     KMP_FSYNC_SPIN_PREPARE(obj);
2479     /* GEH - remove this since it was accidentally introduced when kmp_wait was
2480        split. It causes problems with infinite recursion because of exit lock */
2481     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2482         __kmp_abort_thread(); */
2483 
2484     /* if we have waited a bit, or are oversubscribed, yield */
2485     /* pause is in the following code */
2486     KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2487     KMP_YIELD_SPIN(spins);
2488   }
2489   KMP_FSYNC_SPIN_ACQUIRED(obj);
2490   return r;
2491 }
2492 
2493 void __kmp_wait_yield_4_ptr(
2494     void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32),
2495     void *obj // Higher-level synchronization object, or NULL.
2496     ) {
2497   // note: we may not belong to a team at this point
2498   void *spin = spinner;
2499   kmp_uint32 check = checker;
2500   kmp_uint32 spins;
2501   kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2502 
2503   KMP_FSYNC_SPIN_INIT(obj, spin);
2504   KMP_INIT_YIELD(spins);
2505   // main wait spin loop
2506   while (!f(spin, check)) {
2507     KMP_FSYNC_SPIN_PREPARE(obj);
2508     /* if we have waited a bit, or are oversubscribed, yield */
2509     /* pause is in the following code */
2510     KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2511     KMP_YIELD_SPIN(spins);
2512   }
2513   KMP_FSYNC_SPIN_ACQUIRED(obj);
2514 }
2515 
2516 } // extern "C"
2517 
2518 #ifdef KMP_GOMP_COMPAT
2519 
2520 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2521                                enum sched_type schedule, kmp_int32 lb,
2522                                kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2523                                int push_ws) {
2524   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2525                                  push_ws);
2526 }
2527 
2528 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2529                                 enum sched_type schedule, kmp_uint32 lb,
2530                                 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2531                                 int push_ws) {
2532   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2533                                   push_ws);
2534 }
2535 
2536 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2537                                enum sched_type schedule, kmp_int64 lb,
2538                                kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2539                                int push_ws) {
2540   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2541                                  push_ws);
2542 }
2543 
2544 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2545                                 enum sched_type schedule, kmp_uint64 lb,
2546                                 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2547                                 int push_ws) {
2548   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2549                                   push_ws);
2550 }
2551 
2552 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2553   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2554 }
2555 
2556 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2557   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2558 }
2559 
2560 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2561   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2562 }
2563 
2564 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2565   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2566 }
2567 
2568 #endif /* KMP_GOMP_COMPAT */
2569 
2570 /* ------------------------------------------------------------------------ */
2571