1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 //                     The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 /* Dynamic scheduling initialization and dispatch.
15  *
 * NOTE: __kmp_nth is a constant inside any dispatch loop, but it may
 *       change between parallel regions.  __kmp_max_nth is the largest
 *       value __kmp_nth may take, and 1 is the smallest.
19  */
20 
21 // Need to raise Win version from XP to Vista here for support of
22 // InterlockedExchange64
23 #if defined(_WIN32_WINNT) && defined(_M_IX86)
24 #undef _WIN32_WINNT
25 #define _WIN32_WINNT 0x0502
26 #endif
27 
28 #include "kmp.h"
29 #include "kmp_error.h"
30 #include "kmp_i18n.h"
31 #include "kmp_itt.h"
32 #include "kmp_stats.h"
33 #include "kmp_str.h"
34 #if KMP_OS_WINDOWS && KMP_ARCH_X86
35 #include <float.h>
36 #endif
37 #include "kmp_lock.h"
38 #include "kmp_dispatch.h"
39 #if KMP_USE_HIER_SCHED
40 #include "kmp_dispatch_hier.h"
41 #endif
42 
43 #if OMPT_SUPPORT
44 #include "ompt-specific.h"
45 #endif
46 
47 /* ------------------------------------------------------------------------ */
48 /* ------------------------------------------------------------------------ */
49 
50 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
51   kmp_info_t *th;
52 
53   KMP_DEBUG_ASSERT(gtid_ref);
54 
55   if (__kmp_env_consistency_check) {
56     th = __kmp_threads[*gtid_ref];
57     if (th->th.th_root->r.r_active &&
58         (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
59 #if KMP_USE_DYNAMIC_LOCK
60       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
61 #else
62       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
63 #endif
64     }
65   }
66 }
67 
68 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
69   kmp_info_t *th;
70 
71   if (__kmp_env_consistency_check) {
72     th = __kmp_threads[*gtid_ref];
73     if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
74       __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
75     }
76   }
77 }
78 
79 // Initialize a dispatch_private_info_template<T> buffer for a particular
// type of schedule and chunk.  The loop description is found in lb (lower bound),
81 // ub (upper bound), and st (stride).  nproc is the number of threads relevant
82 // to the scheduling (often the number of threads in a team, but not always if
83 // hierarchical scheduling is used).  tid is the id of the thread calling
84 // the function within the group of nproc threads.  It will have a value
85 // between 0 and nproc - 1.  This is often just the thread id within a team, but
86 // is not necessarily the case when using hierarchical scheduling.
87 // loc is the source file location of the corresponding loop
88 // gtid is the global thread id
89 template <typename T>
90 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
91                                    dispatch_private_info_template<T> *pr,
92                                    enum sched_type schedule, T lb, T ub,
93                                    typename traits_t<T>::signed_t st,
94 #if USE_ITT_BUILD
95                                    kmp_uint64 *cur_chunk,
96 #endif
97                                    typename traits_t<T>::signed_t chunk,
98                                    T nproc, T tid) {
99   typedef typename traits_t<T>::unsigned_t UT;
100   typedef typename traits_t<T>::floating_t DBL;
101 
102   int active;
103   T tc;
104   kmp_info_t *th;
105   kmp_team_t *team;
106 
107 #ifdef KMP_DEBUG
108   typedef typename traits_t<T>::signed_t ST;
109   {
110     char *buff;
111     // create format specifiers before the debug output
112     buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
113                             "pr:%%p lb:%%%s ub:%%%s st:%%%s "
114                             "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
115                             traits_t<T>::spec, traits_t<T>::spec,
116                             traits_t<ST>::spec, traits_t<ST>::spec,
117                             traits_t<T>::spec, traits_t<T>::spec);
118     KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
119     __kmp_str_free(&buff);
120   }
121 #endif
122   /* setup data */
123   th = __kmp_threads[gtid];
124   team = th->th.th_team;
125   active = !team->t.t_serialized;
126 
127 #if USE_ITT_BUILD
128   int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
129                                     __kmp_forkjoin_frames_mode == 3 &&
130                                     KMP_MASTER_GTID(gtid) &&
131 #if OMP_40_ENABLED
132                                     th->th.th_teams_microtask == NULL &&
133 #endif
134                                     team->t.t_active_level == 1;
135 #endif
136 #if (KMP_STATIC_STEAL_ENABLED)
137   if (SCHEDULE_HAS_NONMONOTONIC(schedule))
138     // AC: we now have only one implementation of stealing, so use it
139     schedule = kmp_sch_static_steal;
140   else
141 #endif
142     schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
143 
144   /* Pick up the nomerge/ordered bits from the scheduling type */
145   if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
146     pr->flags.nomerge = TRUE;
147     schedule =
148         (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
149   } else {
150     pr->flags.nomerge = FALSE;
151   }
152   pr->type_size = traits_t<T>::type_size; // remember the size of variables
153   if (kmp_ord_lower & schedule) {
154     pr->flags.ordered = TRUE;
155     schedule =
156         (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
157   } else {
158     pr->flags.ordered = FALSE;
159   }
160 
161   if (schedule == kmp_sch_static) {
162     schedule = __kmp_static;
163   } else {
164     if (schedule == kmp_sch_runtime) {
165       // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
166       // not specified)
167       schedule = team->t.t_sched.r_sched_type;
168       // Detail the schedule if needed (global controls are differentiated
169       // appropriately)
170       if (schedule == kmp_sch_guided_chunked) {
171         schedule = __kmp_guided;
172       } else if (schedule == kmp_sch_static) {
173         schedule = __kmp_static;
174       }
175       // Use the chunk size specified by OMP_SCHEDULE (or default if not
176       // specified)
177       chunk = team->t.t_sched.chunk;
178 #if USE_ITT_BUILD
179       if (cur_chunk)
180         *cur_chunk = chunk;
181 #endif
182 #ifdef KMP_DEBUG
183       {
184         char *buff;
185         // create format specifiers before the debug output
186         buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
187                                 "schedule:%%d chunk:%%%s\n",
188                                 traits_t<ST>::spec);
189         KD_TRACE(10, (buff, gtid, schedule, chunk));
190         __kmp_str_free(&buff);
191       }
192 #endif
193     } else {
194       if (schedule == kmp_sch_guided_chunked) {
195         schedule = __kmp_guided;
196       }
197       if (chunk <= 0) {
198         chunk = KMP_DEFAULT_CHUNK;
199       }
200     }
201 
202     if (schedule == kmp_sch_auto) {
      // mapping and differentiation are done in __kmp_do_serial_initialize()
204       schedule = __kmp_auto;
205 #ifdef KMP_DEBUG
206       {
207         char *buff;
208         // create format specifiers before the debug output
209         buff = __kmp_str_format(
210             "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
211             "schedule:%%d chunk:%%%s\n",
212             traits_t<ST>::spec);
213         KD_TRACE(10, (buff, gtid, schedule, chunk));
214         __kmp_str_free(&buff);
215       }
216 #endif
217     }
218 
    /* guided analytical is not safe for too many threads */
220     if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
221       schedule = kmp_sch_guided_iterative_chunked;
222       KMP_WARNING(DispatchManyThreads);
223     }
224 #if OMP_45_ENABLED
225     if (schedule == kmp_sch_runtime_simd) {
226       // compiler provides simd_width in the chunk parameter
227       schedule = team->t.t_sched.r_sched_type;
228       // Detail the schedule if needed (global controls are differentiated
229       // appropriately)
230       if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
231           schedule == __kmp_static) {
232         schedule = kmp_sch_static_balanced_chunked;
233       } else {
234         if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
235           schedule = kmp_sch_guided_simd;
236         }
237         chunk = team->t.t_sched.chunk * chunk;
238       }
239 #if USE_ITT_BUILD
240       if (cur_chunk)
241         *cur_chunk = chunk;
242 #endif
243 #ifdef KMP_DEBUG
244       {
245         char *buff;
246         // create format specifiers before the debug output
247         buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d"
248                                 " chunk:%%%s\n",
249                                 traits_t<ST>::spec);
250         KD_TRACE(10, (buff, gtid, schedule, chunk));
251         __kmp_str_free(&buff);
252       }
253 #endif
254     }
255 #endif // OMP_45_ENABLED
256     pr->u.p.parm1 = chunk;
257   }
258   KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
259               "unknown scheduling type");
260 
261   pr->u.p.count = 0;
262 
263   if (__kmp_env_consistency_check) {
264     if (st == 0) {
265       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
266                             (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
267     }
268   }
269   // compute trip count
270   if (st == 1) { // most common case
271     if (ub >= lb) {
272       tc = ub - lb + 1;
273     } else { // ub < lb
274       tc = 0; // zero-trip
275     }
276   } else if (st < 0) {
277     if (lb >= ub) {
278       // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
279       // where the division needs to be unsigned regardless of the result type
280       tc = (UT)(lb - ub) / (-st) + 1;
281     } else { // lb < ub
282       tc = 0; // zero-trip
283     }
284   } else { // st > 0
285     if (ub >= lb) {
286       // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
287       // where the division needs to be unsigned regardless of the result type
288       tc = (UT)(ub - lb) / st + 1;
289     } else { // ub < lb
290       tc = 0; // zero-trip
291     }
292   }
293 
294   pr->u.p.lb = lb;
295   pr->u.p.ub = ub;
296   pr->u.p.st = st;
297   pr->u.p.tc = tc;
298 
299 #if KMP_OS_WINDOWS
300   pr->u.p.last_upper = ub + st;
301 #endif /* KMP_OS_WINDOWS */
302 
  /* NOTE: only the active parallel region(s) have active ordered sections */
304 
305   if (active) {
306     if (pr->flags.ordered) {
307       pr->ordered_bumped = 0;
308       pr->u.p.ordered_lower = 1;
309       pr->u.p.ordered_upper = 0;
310     }
311   }
312 
313   switch (schedule) {
314 #if (KMP_STATIC_STEAL_ENABLED)
315   case kmp_sch_static_steal: {
316     T ntc, init;
317 
318     KD_TRACE(100,
319              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
320               gtid));
321 
322     ntc = (tc % chunk ? 1 : 0) + tc / chunk;
323     if (nproc > 1 && ntc >= nproc) {
324       KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
325       T id = tid;
326       T small_chunk, extras;
327 
328       small_chunk = ntc / nproc;
329       extras = ntc % nproc;
330 
331       init = id * small_chunk + (id < extras ? id : extras);
332       pr->u.p.count = init;
333       pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
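      // E.g. (illustrative) with ntc == 10 chunks and nproc == 4:
      // small_chunk == 2, extras == 2, so threads 0..3 initially own the chunk
      // ranges [0,3), [3,6), [6,8) and [8,10) respectively; count is the next
      // chunk index this thread executes and ub is the exclusive end of its
      // owned range.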
334 
335       pr->u.p.parm2 = lb;
336       // pr->pfields.parm3 = 0; // it's not used in static_steal
337       pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
338       pr->u.p.st = st;
339       if (traits_t<T>::type_size > 4) {
340         // AC: TODO: check if 16-byte CAS available and use it to
341         // improve performance (probably wait for explicit request
342         // before spending time on this).
343         // For now use dynamically allocated per-thread lock,
344         // free memory in __kmp_dispatch_next when status==0.
345         KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
346         th->th.th_dispatch->th_steal_lock =
347             (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
348         __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
349       }
350       break;
351     } else {
352       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
353                      "kmp_sch_static_balanced\n",
354                      gtid));
355       schedule = kmp_sch_static_balanced;
356       /* too few iterations: fall-through to kmp_sch_static_balanced */
357     } // if
358     /* FALL-THROUGH to static balanced */
359   } // case
360 #endif
361   case kmp_sch_static_balanced: {
362     T init, limit;
363 
364     KD_TRACE(
365         100,
366         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
367          gtid));
368 
369     if (nproc > 1) {
370       T id = tid;
371 
372       if (tc < nproc) {
373         if (id < tc) {
374           init = id;
375           limit = id;
376           pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
377         } else {
378           pr->u.p.count = 1; /* means no more chunks to execute */
379           pr->u.p.parm1 = FALSE;
380           break;
381         }
382       } else {
383         T small_chunk = tc / nproc;
384         T extras = tc % nproc;
385         init = id * small_chunk + (id < extras ? id : extras);
386         limit = init + small_chunk - (id < extras ? 0 : 1);
387         pr->u.p.parm1 = (id == nproc - 1);
388       }
389     } else {
390       if (tc > 0) {
391         init = 0;
392         limit = tc - 1;
393         pr->u.p.parm1 = TRUE;
394       } else {
395         // zero trip count
396         pr->u.p.count = 1; /* means no more chunks to execute */
397         pr->u.p.parm1 = FALSE;
398         break;
399       }
400     }
401 #if USE_ITT_BUILD
402     // Calculate chunk for metadata report
403     if (itt_need_metadata_reporting)
404       if (cur_chunk)
405         *cur_chunk = limit - init + 1;
406 #endif
407     if (st == 1) {
408       pr->u.p.lb = lb + init;
409       pr->u.p.ub = lb + limit;
410     } else {
      // ub_tmp is the calculated upper bound; "ub" is the user-defined upper
      // bound
412       T ub_tmp = lb + limit * st;
413       pr->u.p.lb = lb + init * st;
414       // adjust upper bound to "ub" if needed, so that MS lastprivate will match
415       // it exactly
416       if (st > 0) {
417         pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
418       } else {
419         pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
420       }
421     }
422     if (pr->flags.ordered) {
423       pr->u.p.ordered_lower = init;
424       pr->u.p.ordered_upper = limit;
425     }
426     break;
427   } // case
428 #if OMP_45_ENABLED
429   case kmp_sch_static_balanced_chunked: {
430     // similar to balanced, but chunk adjusted to multiple of simd width
431     T nth = nproc;
432     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
433                    " -> falling-through to static_greedy\n",
434                    gtid));
435     schedule = kmp_sch_static_greedy;
436     if (nth > 1)
437       pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
438     else
439       pr->u.p.parm1 = tc;
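    // (Illustrative) the masked expression above rounds each thread's share up
    // to a multiple of chunk; this relies on chunk (here the simd width
    // supplied by the compiler) being a power of two.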
440     break;
441   } // case
442   case kmp_sch_guided_simd:
443 #endif // OMP_45_ENABLED
444   case kmp_sch_guided_iterative_chunked: {
445     KD_TRACE(
446         100,
447         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
448          " case\n",
449          gtid));
450 
451     if (nproc > 1) {
452       if ((2L * chunk + 1) * nproc >= tc) {
453         /* chunk size too large, switch to dynamic */
454         schedule = kmp_sch_dynamic_chunked;
455       } else {
        // when the remaining iterations drop below parm2, switch to dynamic
457         pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
458         *(double *)&pr->u.p.parm3 =
459             guided_flt_param / nproc; // may occupy parm3 and parm4
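        // (Illustrative) parm2 is the remaining-iteration threshold below
        // which __kmp_dispatch_next_algorithm falls back to plain
        // dynamic-style chunking, and the double stored across parm3/parm4 is
        // the fraction of the remaining iterations handed out per grab.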
460       }
461     } else {
462       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
463                      "kmp_sch_static_greedy\n",
464                      gtid));
465       schedule = kmp_sch_static_greedy;
466       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
467       KD_TRACE(
468           100,
469           ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
470            gtid));
471       pr->u.p.parm1 = tc;
472     } // if
473   } // case
474   break;
475   case kmp_sch_guided_analytical_chunked: {
476     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
477                    "kmp_sch_guided_analytical_chunked case\n",
478                    gtid));
479 
480     if (nproc > 1) {
481       if ((2L * chunk + 1) * nproc >= tc) {
482         /* chunk size too large, switch to dynamic */
483         schedule = kmp_sch_dynamic_chunked;
484       } else {
485         /* commonly used term: (2 nproc - 1)/(2 nproc) */
486         DBL x;
487 
488 #if KMP_OS_WINDOWS && KMP_ARCH_X86
489         /* Linux* OS already has 64-bit computation by default for long double,
490            and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
491            Windows* OS on IA-32 architecture, we need to set precision to 64-bit
492            instead of the default 53-bit. Even though long double doesn't work
493            on Windows* OS on Intel(R) 64, the resulting lack of precision is not
494            expected to impact the correctness of the algorithm, but this has not
495            been mathematically proven. */
496         // save original FPCW and set precision to 64-bit, as
497         // Windows* OS on IA-32 architecture defaults to 53-bit
498         unsigned int oldFpcw = _control87(0, 0);
499         _control87(_PC_64, _MCW_PC); // 0,0x30000
500 #endif
501         /* value used for comparison in solver for cross-over point */
502         long double target = ((long double)chunk * 2 + 1) * nproc / tc;
503 
504         /* crossover point--chunk indexes equal to or greater than
505            this point switch to dynamic-style scheduling */
506         UT cross;
507 
508         /* commonly used term: (2 nproc - 1)/(2 nproc) */
509         x = (long double)1.0 - (long double)0.5 / nproc;
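        // (Illustrative) x is the expected fraction of work left after one
        // guided chunk is handed out; e.g. with nproc == 4, x == 7/8, so the
        // remaining iteration count shrinks roughly geometrically, to 7/8 of
        // its previous value per chunk.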
510 
511 #ifdef KMP_DEBUG
512         { // test natural alignment
513           struct _test_a {
514             char a;
515             union {
516               char b;
517               DBL d;
518             };
519           } t;
520           ptrdiff_t natural_alignment =
521               (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
522           //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
523           // long)natural_alignment );
524           KMP_DEBUG_ASSERT(
525               (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
526         }
527 #endif // KMP_DEBUG
528 
529         /* save the term in thread private dispatch structure */
530         *(DBL *)&pr->u.p.parm3 = x;
531 
532         /* solve for the crossover point to the nearest integer i for which C_i
533            <= chunk */
534         {
535           UT left, right, mid;
536           long double p;
537 
538           /* estimate initial upper and lower bound */
539 
540           /* doesn't matter what value right is as long as it is positive, but
541              it affects performance of the solver */
542           right = 229;
543           p = __kmp_pow<UT>(x, right);
544           if (p > target) {
545             do {
546               p *= p;
547               right <<= 1;
548             } while (p > target && right < (1 << 27));
549             /* lower bound is previous (failed) estimate of upper bound */
550             left = right >> 1;
551           } else {
552             left = 0;
553           }
554 
555           /* bisection root-finding method */
556           while (left + 1 < right) {
557             mid = (left + right) / 2;
558             if (__kmp_pow<UT>(x, mid) > target) {
559               left = mid;
560             } else {
561               right = mid;
562             }
563           } // while
564           cross = right;
565         }
566         /* assert sanity of computed crossover point */
567         KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
568                    __kmp_pow<UT>(x, cross) <= target);
569 
570         /* save the crossover point in thread private dispatch structure */
571         pr->u.p.parm2 = cross;
572 
573 // C75803
574 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
575 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
576 #else
577 #define GUIDED_ANALYTICAL_WORKAROUND (x)
578 #endif
579         /* dynamic-style scheduling offset */
580         pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
581                                  tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
582                         cross * chunk;
583 #if KMP_OS_WINDOWS && KMP_ARCH_X86
584         // restore FPCW
585         _control87(oldFpcw, _MCW_PC);
586 #endif
587       } // if
588     } else {
589       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
590                      "kmp_sch_static_greedy\n",
591                      gtid));
592       schedule = kmp_sch_static_greedy;
593       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
594       pr->u.p.parm1 = tc;
595     } // if
596   } // case
597   break;
598   case kmp_sch_static_greedy:
599     KD_TRACE(
600         100,
601         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
602          gtid));
603     pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
604     break;
605   case kmp_sch_static_chunked:
606   case kmp_sch_dynamic_chunked:
607     if (pr->u.p.parm1 <= 0) {
608       pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
609     }
610     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
611                    "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
612                    gtid));
613     break;
614   case kmp_sch_trapezoidal: {
615     /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
616 
617     T parm1, parm2, parm3, parm4;
618     KD_TRACE(100,
619              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
620               gtid));
621 
622     parm1 = chunk;
623 
624     /* F : size of the first cycle */
625     parm2 = (tc / (2 * nproc));
626 
627     if (parm2 < 1) {
628       parm2 = 1;
629     }
630 
631     /* L : size of the last cycle.  Make sure the last cycle is not larger
632        than the first cycle. */
633     if (parm1 < 1) {
634       parm1 = 1;
635     } else if (parm1 > parm2) {
636       parm1 = parm2;
637     }
638 
639     /* N : number of cycles */
640     parm3 = (parm2 + parm1);
641     parm3 = (2 * tc + parm3 - 1) / parm3;
642 
643     if (parm3 < 2) {
644       parm3 = 2;
645     }
646 
647     /* sigma : decreasing incr of the trapezoid */
648     parm4 = (parm3 - 1);
649     parm4 = (parm2 - parm1) / parm4;
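    /* Worked example (illustrative only): tc = 1000, nproc = 4, chunk = 10
       gives F = parm2 = 125, minimum chunk parm1 = 10, parm3 = 15 cycles and
       a decrement of parm4 = 8, i.e. chunk sizes of roughly
       125, 117, 109, ... shrinking toward the minimum. */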
650 
651     // pointless check, because parm4 >= 0 always
652     // if ( parm4 < 0 ) {
653     //    parm4 = 0;
654     //}
655 
656     pr->u.p.parm1 = parm1;
657     pr->u.p.parm2 = parm2;
658     pr->u.p.parm3 = parm3;
659     pr->u.p.parm4 = parm4;
660   } // case
661   break;
662 
663   default: {
664     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
665                 KMP_HNT(GetNewerLibrary), // Hint
666                 __kmp_msg_null // Variadic argument list terminator
667                 );
668   } break;
669   } // switch
670   pr->schedule = schedule;
671 }
672 
673 #if KMP_USE_HIER_SCHED
674 template <typename T>
675 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
676                                              typename traits_t<T>::signed_t st);
677 template <>
678 inline void
679 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
680                                             kmp_int32 ub, kmp_int32 st) {
681   __kmp_dispatch_init_hierarchy<kmp_int32>(
682       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
683       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
684 }
685 template <>
686 inline void
687 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
688                                              kmp_uint32 ub, kmp_int32 st) {
689   __kmp_dispatch_init_hierarchy<kmp_uint32>(
690       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
691       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
692 }
693 template <>
694 inline void
695 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
696                                             kmp_int64 ub, kmp_int64 st) {
697   __kmp_dispatch_init_hierarchy<kmp_int64>(
698       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
699       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
700 }
701 template <>
702 inline void
703 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
704                                              kmp_uint64 ub, kmp_int64 st) {
705   __kmp_dispatch_init_hierarchy<kmp_uint64>(
706       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
707       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
708 }
709 
710 // free all the hierarchy scheduling memory associated with the team
711 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
712   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
713   for (int i = 0; i < num_disp_buff; ++i) {
714     // type does not matter here so use kmp_int32
715     auto sh =
716         reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
717             &team->t.t_disp_buffer[i]);
718     if (sh->hier) {
719       sh->hier->deallocate();
720       __kmp_free(sh->hier);
721     }
722   }
723 }
724 #endif
725 
726 // UT - unsigned flavor of T, ST - signed flavor of T,
727 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
728 template <typename T>
729 static void
730 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
731                     T ub, typename traits_t<T>::signed_t st,
732                     typename traits_t<T>::signed_t chunk, int push_ws) {
733   typedef typename traits_t<T>::unsigned_t UT;
734 
735   int active;
736   kmp_info_t *th;
737   kmp_team_t *team;
738   kmp_uint32 my_buffer_index;
739   dispatch_private_info_template<T> *pr;
740   dispatch_shared_info_template<T> volatile *sh;
741 
742   KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
743                    sizeof(dispatch_private_info));
744   KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
745                    sizeof(dispatch_shared_info));
746 
747   if (!TCR_4(__kmp_init_parallel))
748     __kmp_parallel_initialize();
749 
750 #if INCLUDE_SSC_MARKS
751   SSC_MARK_DISPATCH_INIT();
752 #endif
753 #ifdef KMP_DEBUG
754   typedef typename traits_t<T>::signed_t ST;
755   {
756     char *buff;
757     // create format specifiers before the debug output
758     buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
759                             "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
760                             traits_t<ST>::spec, traits_t<T>::spec,
761                             traits_t<T>::spec, traits_t<ST>::spec);
762     KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
763     __kmp_str_free(&buff);
764   }
765 #endif
766   /* setup data */
767   th = __kmp_threads[gtid];
768   team = th->th.th_team;
769   active = !team->t.t_serialized;
770   th->th.th_ident = loc;
771 
  // Any half-decent optimizer will remove this test when the blocks are empty
  // since the macros expand to nothing when statistics are disabled.
775   if (schedule == __kmp_static) {
776     KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
777   } else {
778     KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
779   }
780 
781 #if KMP_USE_HIER_SCHED
  // Initialize the scheduling hierarchy if requested in the OMP_SCHEDULE
  // environment variable.  Hierarchical scheduling does not work with ordered,
  // so if ordered is detected, revert to threaded scheduling.
785   bool ordered;
786   enum sched_type my_sched = schedule;
787   my_buffer_index = th->th.th_dispatch->th_disp_index;
788   pr = reinterpret_cast<dispatch_private_info_template<T> *>(
789       &th->th.th_dispatch
790            ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
791   my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
792   if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
793     my_sched =
794         (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
795   ordered = (kmp_ord_lower & my_sched);
796   if (pr->flags.use_hier) {
797     if (ordered) {
798       KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected.  "
799                      "Disabling hierarchical scheduling.\n",
800                      gtid));
801       pr->flags.use_hier = FALSE;
802     }
803   }
804   if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
805     // Don't use hierarchical for ordered parallel loops and don't
806     // use the runtime hierarchy if one was specified in the program
807     if (!ordered && !pr->flags.use_hier)
808       __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
809   }
810 #endif // KMP_USE_HIER_SCHED
811 
812 #if USE_ITT_BUILD
813   kmp_uint64 cur_chunk = chunk;
814   int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
815                                     __kmp_forkjoin_frames_mode == 3 &&
816                                     KMP_MASTER_GTID(gtid) &&
817 #if OMP_40_ENABLED
818                                     th->th.th_teams_microtask == NULL &&
819 #endif
820                                     team->t.t_active_level == 1;
821 #endif
822   if (!active) {
823     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
824         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
825   } else {
826     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
827                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
828 
829     my_buffer_index = th->th.th_dispatch->th_disp_index++;
830 
831     /* What happens when number of threads changes, need to resize buffer? */
832     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
833         &th->th.th_dispatch
834              ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
835     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
836         &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
837     KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
838                   my_buffer_index));
839   }
840 
841   __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
842 #if USE_ITT_BUILD
843                                 &cur_chunk,
844 #endif
845                                 chunk, (T)th->th.th_team_nproc,
846                                 (T)th->th.th_info.ds.ds_tid);
847   if (active) {
848     if (pr->flags.ordered == 0) {
849       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
850       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
851     } else {
852       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
853       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
854     }
855   }
856 
857   if (active) {
    /* This buffer is free to use once the shared buffer_index reaches
     * my_buffer_index; wait for that below. */
860 
861     KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
862                    "sh->buffer_index:%d\n",
863                    gtid, my_buffer_index, sh->buffer_index));
864     __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index,
865                                  __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    // Note: KMP_WAIT_YIELD() cannot be used here: buffer_index and
    // my_buffer_index are *always* 32-bit integers.
868     KMP_MB(); /* is this necessary? */
869     KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
870                    "sh->buffer_index:%d\n",
871                    gtid, my_buffer_index, sh->buffer_index));
872 
873     th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
874     th->th.th_dispatch->th_dispatch_sh_current =
875         CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
876 #if USE_ITT_BUILD
877     if (pr->flags.ordered) {
878       __kmp_itt_ordered_init(gtid);
879     }
880     // Report loop metadata
881     if (itt_need_metadata_reporting) {
882       // Only report metadata by master of active team at level 1
883       kmp_uint64 schedtype = 0;
884       switch (schedule) {
885       case kmp_sch_static_chunked:
886       case kmp_sch_static_balanced: // Chunk is calculated in the switch above
887         break;
888       case kmp_sch_static_greedy:
889         cur_chunk = pr->u.p.parm1;
890         break;
891       case kmp_sch_dynamic_chunked:
892         schedtype = 1;
893         break;
894       case kmp_sch_guided_iterative_chunked:
895       case kmp_sch_guided_analytical_chunked:
896 #if OMP_45_ENABLED
897       case kmp_sch_guided_simd:
898 #endif
899         schedtype = 2;
900         break;
901       default:
902         // Should we put this case under "static"?
903         // case kmp_sch_static_steal:
904         schedtype = 3;
905         break;
906       }
907       __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
908     }
909 #if KMP_USE_HIER_SCHED
910     if (pr->flags.use_hier) {
911       pr->u.p.count = 0;
912       pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
913     }
#endif // KMP_USE_HIER_SCHED
915 #endif /* USE_ITT_BUILD */
916   }
917 
918 #ifdef KMP_DEBUG
919   {
920     char *buff;
921     // create format specifiers before the debug output
922     buff = __kmp_str_format(
923         "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
924         "lb:%%%s ub:%%%s"
925         " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
926         " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
927         traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
928         traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
929         traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
930         traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
931     KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
932                   pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
933                   pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
934                   pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
935     __kmp_str_free(&buff);
936   }
937 #endif
938 #if (KMP_STATIC_STEAL_ENABLED)
  // It cannot be guaranteed that after execution of a loop with some other
  // schedule kind all the parm3 variables will contain the same value. Even if
  // they did, a bad case would remain, e.g. toggling between 0 and 1 rather
  // than a program-lifetime increment. So a dedicated variable is required;
  // 'static_steal_counter' is used for this purpose.
944   if (schedule == kmp_sch_static_steal) {
    // Other threads will inspect this variable when searching for a victim.
    // It is a flag indicating that, from this point on, other threads may
    // steal from this thread.
948     volatile T *p = &pr->u.p.static_steal_counter;
949     *p = *p + 1;
950   }
951 #endif // ( KMP_STATIC_STEAL_ENABLED )
952 
953 #if OMPT_SUPPORT && OMPT_OPTIONAL
954   if (ompt_enabled.ompt_callback_work) {
955     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
956     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
957     ompt_callbacks.ompt_callback(ompt_callback_work)(
958         ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
959         &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
960   }
961 #endif
962   KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
963 }
964 
965 /* For ordered loops, either __kmp_dispatch_finish() should be called after
966  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
967  * every chunk of iterations.  If the ordered section(s) were not executed
968  * for this iteration (or every iteration in this chunk), we need to set the
969  * ordered iteration counters so that the next thread can proceed. */
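/* (Illustrative) the protocol below: a thread whose iteration (or chunk) did
 * not execute the ordered region waits until the shared ordered_iteration
 * counter reaches its own ordered_lower value, then bumps the counter so the
 * next thread in iteration order can proceed. */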
970 template <typename UT>
971 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
972   typedef typename traits_t<UT>::signed_t ST;
973   kmp_info_t *th = __kmp_threads[gtid];
974 
975   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
976   if (!th->th.th_team->t.t_serialized) {
977 
978     dispatch_private_info_template<UT> *pr =
979         reinterpret_cast<dispatch_private_info_template<UT> *>(
980             th->th.th_dispatch->th_dispatch_pr_current);
981     dispatch_shared_info_template<UT> volatile *sh =
982         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
983             th->th.th_dispatch->th_dispatch_sh_current);
984     KMP_DEBUG_ASSERT(pr);
985     KMP_DEBUG_ASSERT(sh);
986     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
987                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
988 
989     if (pr->ordered_bumped) {
990       KD_TRACE(
991           1000,
992           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
993            gtid));
994       pr->ordered_bumped = 0;
995     } else {
996       UT lower = pr->u.p.ordered_lower;
997 
998 #ifdef KMP_DEBUG
999       {
1000         char *buff;
1001         // create format specifiers before the debug output
1002         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1003                                 "ordered_iteration:%%%s lower:%%%s\n",
1004                                 traits_t<UT>::spec, traits_t<UT>::spec);
1005         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1006         __kmp_str_free(&buff);
1007       }
1008 #endif
1009 
1010       __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1011                            __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1012       KMP_MB(); /* is this necessary? */
1013 #ifdef KMP_DEBUG
1014       {
1015         char *buff;
1016         // create format specifiers before the debug output
1017         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1018                                 "ordered_iteration:%%%s lower:%%%s\n",
1019                                 traits_t<UT>::spec, traits_t<UT>::spec);
1020         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1021         __kmp_str_free(&buff);
1022       }
1023 #endif
1024 
1025       test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1026     } // if
1027   } // if
1028   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1029 }
1030 
1031 #ifdef KMP_GOMP_COMPAT
1032 
1033 template <typename UT>
1034 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1035   typedef typename traits_t<UT>::signed_t ST;
1036   kmp_info_t *th = __kmp_threads[gtid];
1037 
1038   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1039   if (!th->th.th_team->t.t_serialized) {
1040     //        int cid;
1041     dispatch_private_info_template<UT> *pr =
1042         reinterpret_cast<dispatch_private_info_template<UT> *>(
1043             th->th.th_dispatch->th_dispatch_pr_current);
1044     dispatch_shared_info_template<UT> volatile *sh =
1045         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1046             th->th.th_dispatch->th_dispatch_sh_current);
1047     KMP_DEBUG_ASSERT(pr);
1048     KMP_DEBUG_ASSERT(sh);
1049     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1050                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1051 
1052     //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1053     UT lower = pr->u.p.ordered_lower;
1054     UT upper = pr->u.p.ordered_upper;
1055     UT inc = upper - lower + 1;
1056 
1057     if (pr->ordered_bumped == inc) {
1058       KD_TRACE(
1059           1000,
1060           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1061            gtid));
1062       pr->ordered_bumped = 0;
1063     } else {
1064       inc -= pr->ordered_bumped;
1065 
1066 #ifdef KMP_DEBUG
1067       {
1068         char *buff;
1069         // create format specifiers before the debug output
1070         buff = __kmp_str_format(
1071             "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1072             "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1073             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1074         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1075         __kmp_str_free(&buff);
1076       }
1077 #endif
1078 
1079       __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1080                            __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1081 
1082       KMP_MB(); /* is this necessary? */
1083       KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1084                       "ordered_bumped to zero\n",
1085                       gtid));
1086       pr->ordered_bumped = 0;
      // TODO: check whether inc should be unsigned or signed
1088 #ifdef KMP_DEBUG
1089       {
1090         char *buff;
1091         // create format specifiers before the debug output
1092         buff = __kmp_str_format(
1093             "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1094             "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1095             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1096             traits_t<UT>::spec);
1097         KD_TRACE(1000,
1098                  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1099         __kmp_str_free(&buff);
1100       }
1101 #endif
1102 
1103       test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1104     }
1105     //        }
1106   }
1107   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1108 }
1109 
1110 #endif /* KMP_GOMP_COMPAT */
1111 
1112 template <typename T>
1113 int __kmp_dispatch_next_algorithm(int gtid,
1114                                   dispatch_private_info_template<T> *pr,
1115                                   dispatch_shared_info_template<T> volatile *sh,
1116                                   kmp_int32 *p_last, T *p_lb, T *p_ub,
1117                                   typename traits_t<T>::signed_t *p_st, T nproc,
1118                                   T tid) {
1119   typedef typename traits_t<T>::unsigned_t UT;
1120   typedef typename traits_t<T>::signed_t ST;
1121   typedef typename traits_t<T>::floating_t DBL;
1122   int status = 0;
1123   kmp_int32 last = 0;
1124   T start;
1125   ST incr;
1126   UT limit, trip, init;
1127   kmp_info_t *th = __kmp_threads[gtid];
1128   kmp_team_t *team = th->th.th_team;
1129 
1130   KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1131                    &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1132   KMP_DEBUG_ASSERT(pr);
1133   KMP_DEBUG_ASSERT(sh);
1134   KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1135 #ifdef KMP_DEBUG
1136   {
1137     char *buff;
1138     // create format specifiers before the debug output
1139     buff =
1140         __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1141                          "sh:%%p nproc:%%%s tid:%%%s\n",
1142                          traits_t<T>::spec, traits_t<T>::spec);
1143     KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1144     __kmp_str_free(&buff);
1145   }
1146 #endif
1147 
1148   // zero trip count
1149   if (pr->u.p.tc == 0) {
1150     KD_TRACE(10,
1151              ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1152               "zero status:%d\n",
1153               gtid, status));
1154     return 0;
1155   }
1156 
1157   switch (pr->schedule) {
1158 #if (KMP_STATIC_STEAL_ENABLED)
1159   case kmp_sch_static_steal: {
1160     T chunk = pr->u.p.parm1;
1161 
1162     KD_TRACE(100,
1163              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1164               gtid));
1165 
1166     trip = pr->u.p.tc - 1;
1167 
1168     if (traits_t<T>::type_size > 4) {
1169       // use lock for 8-byte and CAS for 4-byte induction
1170       // variable. TODO (optional): check and use 16-byte CAS
1171       kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
1172       KMP_DEBUG_ASSERT(lck != NULL);
1173       if (pr->u.p.count < (UT)pr->u.p.ub) {
1174         __kmp_acquire_lock(lck, gtid);
1175         // try to get own chunk of iterations
1176         init = (pr->u.p.count)++;
1177         status = (init < (UT)pr->u.p.ub);
1178         __kmp_release_lock(lck, gtid);
1179       } else {
1180         status = 0; // no own chunks
1181       }
1182       if (!status) { // try to steal
1183         kmp_info_t **other_threads = team->t.t_threads;
1184         int while_limit = nproc; // nproc attempts to find a victim
1185         int while_index = 0;
        // TODO: the victim-search algorithm should be cleaned up and measured
1188         while ((!status) && (while_limit != ++while_index)) {
1189           T remaining;
1190           T victimIdx = pr->u.p.parm4;
1191           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1192           dispatch_private_info_template<T> *victim =
1193               reinterpret_cast<dispatch_private_info_template<T> *>(
1194                   other_threads[victimIdx]
1195                       ->th.th_dispatch->th_dispatch_pr_current);
1196           while ((victim == NULL || victim == pr ||
1197                   (*(volatile T *)&victim->u.p.static_steal_counter !=
1198                    *(volatile T *)&pr->u.p.static_steal_counter)) &&
1199                  oldVictimIdx != victimIdx) {
1200             victimIdx = (victimIdx + 1) % nproc;
1201             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1202                 other_threads[victimIdx]
1203                     ->th.th_dispatch->th_dispatch_pr_current);
1204           }
1205           if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1206                           *(volatile T *)&pr->u.p.static_steal_counter)) {
1207             continue; // try once more (nproc attempts in total)
1208             // no victim is ready yet to participate in stealing
1209             // because all victims are still in kmp_init_dispatch
1210           }
1211           if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1212             pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1213             continue; // not enough chunks to steal, goto next victim
1214           }
1215 
1216           lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1217           KMP_ASSERT(lck != NULL);
1218           __kmp_acquire_lock(lck, gtid);
1219           limit = victim->u.p.ub; // keep initial ub
1220           if (victim->u.p.count >= limit ||
1221               (remaining = limit - victim->u.p.count) < 2) {
1222             __kmp_release_lock(lck, gtid);
1223             pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1224             continue; // not enough chunks to steal
1225           }
          // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or
          // by 1
1228           if (remaining > 3) {
1229             // steal 1/4 of remaining
1230             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1231             init = (victim->u.p.ub -= (remaining >> 2));
1232           } else {
1233             // steal 1 chunk of 2 or 3 remaining
1234             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1235             init = (victim->u.p.ub -= 1);
1236           }
1237           __kmp_release_lock(lck, gtid);
1238 
1239           KMP_DEBUG_ASSERT(init + 1 <= limit);
1240           pr->u.p.parm4 = victimIdx; // remember victim to steal from
1241           status = 1;
1242           while_index = 0;
          // now update own count and ub with the stolen range, minus the init
          // chunk returned from this call
1244           __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1245           pr->u.p.count = init + 1;
1246           pr->u.p.ub = limit;
1247           __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1248         } // while (search for victim)
1249       } // if (try to find victim and steal)
1250     } else {
1251       // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1252       typedef union {
1253         struct {
1254           UT count;
1255           T ub;
1256         } p;
1257         kmp_int64 b;
1258       } union_i4;
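      // (Illustrative) packing the 4-byte count and ub into a single 8-byte
      // word lets the owner claim its next chunk and a thief shrink ub with
      // one 64-bit CAS, so no reader can observe a half-updated (count, ub)
      // pair.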
1259       // All operations on 'count' or 'ub' must be combined atomically
1260       // together.
1261       {
1262         union_i4 vold, vnew;
1263         vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1264         vnew = vold;
1265         vnew.p.count++;
1266         while (!KMP_COMPARE_AND_STORE_ACQ64(
1267             (volatile kmp_int64 *)&pr->u.p.count,
1268             *VOLATILE_CAST(kmp_int64 *) & vold.b,
1269             *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1270           KMP_CPU_PAUSE();
1271           vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1272           vnew = vold;
1273           vnew.p.count++;
1274         }
1275         vnew = vold;
1276         init = vnew.p.count;
1277         status = (init < (UT)vnew.p.ub);
1278       }
1279 
1280       if (!status) {
1281         kmp_info_t **other_threads = team->t.t_threads;
1282         int while_limit = nproc; // nproc attempts to find a victim
1283         int while_index = 0;
1284 
        // TODO: the victim-search algorithm should be cleaned up and measured
1287         while ((!status) && (while_limit != ++while_index)) {
1288           union_i4 vold, vnew;
1289           kmp_int32 remaining;
1290           T victimIdx = pr->u.p.parm4;
1291           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1292           dispatch_private_info_template<T> *victim =
1293               reinterpret_cast<dispatch_private_info_template<T> *>(
1294                   other_threads[victimIdx]
1295                       ->th.th_dispatch->th_dispatch_pr_current);
1296           while ((victim == NULL || victim == pr ||
1297                   (*(volatile T *)&victim->u.p.static_steal_counter !=
1298                    *(volatile T *)&pr->u.p.static_steal_counter)) &&
1299                  oldVictimIdx != victimIdx) {
1300             victimIdx = (victimIdx + 1) % nproc;
1301             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1302                 other_threads[victimIdx]
1303                     ->th.th_dispatch->th_dispatch_pr_current);
1304           }
1305           if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1306                           *(volatile T *)&pr->u.p.static_steal_counter)) {
1307             continue; // try once more (nproc attempts in total)
1308             // no victim is ready yet to participate in stealing
1309             // because all victims are still in kmp_init_dispatch
1310           }
1311           pr->u.p.parm4 = victimIdx; // new victim found
1312           while (1) { // CAS loop if victim has enough chunks to steal
1313             vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1314             vnew = vold;
1315 
1316             KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1317             if (vnew.p.count >= (UT)vnew.p.ub ||
1318                 (remaining = vnew.p.ub - vnew.p.count) < 2) {
1319               pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1320               break; // not enough chunks to steal, goto next victim
1321             }
1322             if (remaining > 3) {
1323               vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
1324             } else {
1325               vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1326             }
1327             KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1328             // TODO: Should this be acquire or release?
1329             if (KMP_COMPARE_AND_STORE_ACQ64(
1330                     (volatile kmp_int64 *)&victim->u.p.count,
1331                     *VOLATILE_CAST(kmp_int64 *) & vold.b,
1332                     *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
              // stealing succeeded
1334               KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1335                                         vold.p.ub - vnew.p.ub);
1336               status = 1;
1337               while_index = 0;
1338               // now update own count and ub
1339               init = vnew.p.ub;
1340               vold.p.count = init + 1;
1341 #if KMP_ARCH_X86
1342               KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1343 #else
1344               *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1345 #endif
1346               break;
1347             } // if (check CAS result)
            KMP_CPU_PAUSE(); // CAS failed, repeat the attempt
1349           } // while (try to steal from particular victim)
1350         } // while (search for victim)
1351       } // if (try to find victim and steal)
1352     } // if (4-byte induction variable)
1353     if (!status) {
1354       *p_lb = 0;
1355       *p_ub = 0;
1356       if (p_st != NULL)
1357         *p_st = 0;
1358     } else {
1359       start = pr->u.p.parm2;
1360       init *= chunk;
1361       limit = chunk + init - 1;
1362       incr = pr->u.p.st;
1363       KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1364 
1365       KMP_DEBUG_ASSERT(init <= trip);
1366       if ((last = (limit >= trip)) != 0)
1367         limit = trip;
1368       if (p_st != NULL)
1369         *p_st = incr;
1370 
1371       if (incr == 1) {
1372         *p_lb = start + init;
1373         *p_ub = start + limit;
1374       } else {
1375         *p_lb = start + init * incr;
1376         *p_ub = start + limit * incr;
1377       }
1378 
1379       if (pr->flags.ordered) {
1380         pr->u.p.ordered_lower = init;
1381         pr->u.p.ordered_upper = limit;
1382       } // if
1383     } // if
1384     break;
1385   } // case
1386 #endif // ( KMP_STATIC_STEAL_ENABLED )
1387   case kmp_sch_static_balanced: {
1388     KD_TRACE(
1389         10,
1390         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1391          gtid));
1392     /* check if thread has any iteration to do */
1393     if ((status = !pr->u.p.count) != 0) {
1394       pr->u.p.count = 1;
1395       *p_lb = pr->u.p.lb;
1396       *p_ub = pr->u.p.ub;
1397       last = pr->u.p.parm1;
1398       if (p_st != NULL)
1399         *p_st = pr->u.p.st;
1400     } else { /* no iterations to do */
1401       pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1402     }
1403   } // case
1404   break;
1405   case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1406                                  merged here */
1407   case kmp_sch_static_chunked: {
1408     T parm1;
1409 
1410     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1411                    "kmp_sch_static_[affinity|chunked] case\n",
1412                    gtid));
1413     parm1 = pr->u.p.parm1;
1414 
1415     trip = pr->u.p.tc - 1;
1416     init = parm1 * (pr->u.p.count + tid);
1417 
1418     if ((status = (init <= trip)) != 0) {
1419       start = pr->u.p.lb;
1420       incr = pr->u.p.st;
1421       limit = parm1 + init - 1;
1422 
1423       if ((last = (limit >= trip)) != 0)
1424         limit = trip;
1425 
1426       if (p_st != NULL)
1427         *p_st = incr;
1428 
1429       pr->u.p.count += nproc;
1430 
1431       if (incr == 1) {
1432         *p_lb = start + init;
1433         *p_ub = start + limit;
1434       } else {
1435         *p_lb = start + init * incr;
1436         *p_ub = start + limit * incr;
1437       }
1438 
1439       if (pr->flags.ordered) {
1440         pr->u.p.ordered_lower = init;
1441         pr->u.p.ordered_upper = limit;
1442       } // if
1443     } // if
1444   } // case
1445   break;
1446 
1447   case kmp_sch_dynamic_chunked: {
1448     T chunk = pr->u.p.parm1;
1449 
1450     KD_TRACE(
1451         100,
1452         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1453          gtid));
1454 
1455     init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1456     trip = pr->u.p.tc - 1;
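    // The fetch-and-add above claims the next global chunk index; chunk
    // index i covers iterations [i * chunk, i * chunk + chunk - 1], clipped
    // to trip for the final chunk.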
1457 
1458     if ((status = (init <= trip)) == 0) {
1459       *p_lb = 0;
1460       *p_ub = 0;
1461       if (p_st != NULL)
1462         *p_st = 0;
1463     } else {
1464       start = pr->u.p.lb;
1465       limit = chunk + init - 1;
1466       incr = pr->u.p.st;
1467 
1468       if ((last = (limit >= trip)) != 0)
1469         limit = trip;
1470 
1471       if (p_st != NULL)
1472         *p_st = incr;
1473 
1474       if (incr == 1) {
1475         *p_lb = start + init;
1476         *p_ub = start + limit;
1477       } else {
1478         *p_lb = start + init * incr;
1479         *p_ub = start + limit * incr;
1480       }
1481 
1482       if (pr->flags.ordered) {
1483         pr->u.p.ordered_lower = init;
1484         pr->u.p.ordered_upper = limit;
1485       } // if
1486     } // if
1487   } // case
1488   break;
1489 
1490   case kmp_sch_guided_iterative_chunked: {
1491     T chunkspec = pr->u.p.parm1;
1492     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1493                    "iterative case\n",
1494                    gtid));
1495     trip = pr->u.p.tc;
1496     // Start atomic part of calculations
1497     while (1) {
1498       ST remaining; // signed, because can be < 0
1499       init = sh->u.s.iteration; // shared value
1500       remaining = trip - init;
1501       if (remaining <= 0) { // AC: need to compare with 0 first
1502         // nothing to do, don't try atomic op
1503         status = 0;
1504         break;
1505       }
1506       if ((T)remaining <
1507           pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
        // use dynamic-style schedule
        // atomically increment iterations, get old value
1510         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1511                                  (ST)chunkspec);
1512         remaining = trip - init;
1513         if (remaining <= 0) {
1514           status = 0; // all iterations got by other threads
1515         } else {
1516           // got some iterations to work on
1517           status = 1;
1518           if ((T)remaining > chunkspec) {
1519             limit = init + chunkspec - 1;
1520           } else {
1521             last = 1; // the last chunk
1522             limit = init + remaining - 1;
1523           } // if
1524         } // if
1525         break;
1526       } // if
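      // parm3 holds 1/(K*nproc) stored as a double bit pattern, so the next
      // line claims roughly remaining/(K*nproc) iterations; the CAS makes
      // the claim atomic.  E.g., remaining = 1000, nproc = 8, K = 2 gives a
      // chunk of about 62 iterations.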
1527       limit = init +
1528               (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
1529       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1530                                (ST)init, (ST)limit)) {
1531         // CAS was successful, chunk obtained
1532         status = 1;
1533         --limit;
1534         break;
1535       } // if
1536     } // while
1537     if (status != 0) {
1538       start = pr->u.p.lb;
1539       incr = pr->u.p.st;
1540       if (p_st != NULL)
1541         *p_st = incr;
1542       *p_lb = start + init * incr;
1543       *p_ub = start + limit * incr;
1544       if (pr->flags.ordered) {
1545         pr->u.p.ordered_lower = init;
1546         pr->u.p.ordered_upper = limit;
1547       } // if
1548     } else {
1549       *p_lb = 0;
1550       *p_ub = 0;
1551       if (p_st != NULL)
1552         *p_st = 0;
1553     } // if
1554   } // case
1555   break;
1556 
1557 #if OMP_45_ENABLED
1558   case kmp_sch_guided_simd: {
    // same as the iterative schedule, but the current chunk is adjusted to
    // be a multiple of the given chunk
1561     T chunk = pr->u.p.parm1;
1562     KD_TRACE(100,
1563              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1564               gtid));
1565     trip = pr->u.p.tc;
1566     // Start atomic part of calculations
1567     while (1) {
1568       ST remaining; // signed, because can be < 0
1569       init = sh->u.s.iteration; // shared value
1570       remaining = trip - init;
1571       if (remaining <= 0) { // AC: need to compare with 0 first
1572         status = 0; // nothing to do, don't try atomic op
1573         break;
1574       }
1575       KMP_DEBUG_ASSERT(init % chunk == 0);
1576       // compare with K*nproc*(chunk+1), K=2 by default
1577       if ((T)remaining < pr->u.p.parm2) {
        // use dynamic-style schedule
        // atomically increment iterations, get old value
1580         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1581                                  (ST)chunk);
1582         remaining = trip - init;
1583         if (remaining <= 0) {
1584           status = 0; // all iterations got by other threads
1585         } else {
1586           // got some iterations to work on
1587           status = 1;
1588           if ((T)remaining > chunk) {
1589             limit = init + chunk - 1;
1590           } else {
1591             last = 1; // the last chunk
1592             limit = init + remaining - 1;
1593           } // if
1594         } // if
1595         break;
1596       } // if
1597       // divide by K*nproc
1598       UT span = remaining * (*(double *)&pr->u.p.parm3);
1599       UT rem = span % chunk;
1600       if (rem) // adjust so that span%chunk == 0
1601         span += chunk - rem;
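      // span is rounded up to a multiple of chunk so each block handed out
      // stays aligned to the SIMD chunk size (hence the init % chunk == 0
      // invariant asserted above).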
1602       limit = init + span;
1603       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1604                                (ST)init, (ST)limit)) {
1605         // CAS was successful, chunk obtained
1606         status = 1;
1607         --limit;
1608         break;
1609       } // if
1610     } // while
1611     if (status != 0) {
1612       start = pr->u.p.lb;
1613       incr = pr->u.p.st;
1614       if (p_st != NULL)
1615         *p_st = incr;
1616       *p_lb = start + init * incr;
1617       *p_ub = start + limit * incr;
1618       if (pr->flags.ordered) {
1619         pr->u.p.ordered_lower = init;
1620         pr->u.p.ordered_upper = limit;
1621       } // if
1622     } else {
1623       *p_lb = 0;
1624       *p_ub = 0;
1625       if (p_st != NULL)
1626         *p_st = 0;
1627     } // if
1628   } // case
1629   break;
1630 #endif // OMP_45_ENABLED
1631 
1632   case kmp_sch_guided_analytical_chunked: {
1633     T chunkspec = pr->u.p.parm1;
1634     UT chunkIdx;
1635 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1636     /* for storing original FPCW value for Windows* OS on
1637        IA-32 architecture 8-byte version */
1638     unsigned int oldFpcw;
1639     unsigned int fpcwSet = 0;
1640 #endif
1641     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1642                    "kmp_sch_guided_analytical_chunked case\n",
1643                    gtid));
1644 
1645     trip = pr->u.p.tc;
1646 
1647     KMP_DEBUG_ASSERT(nproc > 1);
1648     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1649 
1650     while (1) { /* this while loop is a safeguard against unexpected zero
1651                    chunk sizes */
1652       chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
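      // parm2 is the precomputed crossover chunk index: chunk indices below
      // it use the analytically computed guided sizes, indices at or above
      // it fall back to plain dynamic chunks of size chunkspec.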
1653       if (chunkIdx >= (UT)pr->u.p.parm2) {
1654         --trip;
1655         /* use dynamic-style scheduling */
1656         init = chunkIdx * chunkspec + pr->u.p.count;
1657         /* need to verify init > 0 in case of overflow in the above
1658          * calculation */
1659         if ((status = (init > 0 && init <= trip)) != 0) {
1660           limit = init + chunkspec - 1;
1661 
1662           if ((last = (limit >= trip)) != 0)
1663             limit = trip;
1664         }
1665         break;
1666       } else {
1667 /* use exponential-style scheduling */
/* The following check works around the lack of long double precision on
   Windows* OS, which can otherwise have the effect that init != 0 for
   chunkIdx == 0. */
1672 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1673         /* If we haven't already done so, save original
1674            FPCW and set precision to 64-bit, as Windows* OS
1675            on IA-32 architecture defaults to 53-bit */
1676         if (!fpcwSet) {
1677           oldFpcw = _control87(0, 0);
1678           _control87(_PC_64, _MCW_PC);
1679           fpcwSet = 0x30000;
1680         }
1681 #endif
1682         if (chunkIdx) {
1683           init = __kmp_dispatch_guided_remaining<T>(
1684               trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1685           KMP_DEBUG_ASSERT(init);
1686           init = trip - init;
1687         } else
1688           init = 0;
1689         limit = trip - __kmp_dispatch_guided_remaining<T>(
1690                            trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
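        // __kmp_dispatch_guided_remaining() gives the iterations left after
        // the given number of chunks, so [init, limit) brackets the
        // chunkIdx-th guided chunk; --limit below makes it inclusive.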
1691         KMP_ASSERT(init <= limit);
1692         if (init < limit) {
1693           KMP_DEBUG_ASSERT(limit <= trip);
1694           --limit;
1695           status = 1;
1696           break;
1697         } // if
1698       } // if
1699     } // while (1)
1700 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1701     /* restore FPCW if necessary
1702        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1703     */
1704     if (fpcwSet && (oldFpcw & fpcwSet))
1705       _control87(oldFpcw, _MCW_PC);
1706 #endif
1707     if (status != 0) {
1708       start = pr->u.p.lb;
1709       incr = pr->u.p.st;
1710       if (p_st != NULL)
1711         *p_st = incr;
1712       *p_lb = start + init * incr;
1713       *p_ub = start + limit * incr;
1714       if (pr->flags.ordered) {
1715         pr->u.p.ordered_lower = init;
1716         pr->u.p.ordered_upper = limit;
1717       }
1718     } else {
1719       *p_lb = 0;
1720       *p_ub = 0;
1721       if (p_st != NULL)
1722         *p_st = 0;
1723     }
1724   } // case
1725   break;
1726 
1727   case kmp_sch_trapezoidal: {
1728     UT index;
1729     T parm2 = pr->u.p.parm2;
1730     T parm3 = pr->u.p.parm3;
1731     T parm4 = pr->u.p.parm4;
1732     KD_TRACE(100,
1733              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1734               gtid));
1735 
1736     index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1737 
1738     init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
1739     trip = pr->u.p.tc - 1;
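    // The formula above is the arithmetic-series sum: chunk sizes shrink
    // linearly from parm2 by parm4 per chunk, so chunk 'index' starts at
    // index * parm2 - parm4 * index * (index - 1) / 2.
    // E.g., parm2 = 10, parm4 = 2 gives chunk starts 0, 10, 18, 24, ...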
1740 
1741     if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1742       *p_lb = 0;
1743       *p_ub = 0;
1744       if (p_st != NULL)
1745         *p_st = 0;
1746     } else {
1747       start = pr->u.p.lb;
1748       limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1749       incr = pr->u.p.st;
1750 
1751       if ((last = (limit >= trip)) != 0)
1752         limit = trip;
1753 
1754       if (p_st != NULL)
1755         *p_st = incr;
1756 
1757       if (incr == 1) {
1758         *p_lb = start + init;
1759         *p_ub = start + limit;
1760       } else {
1761         *p_lb = start + init * incr;
1762         *p_ub = start + limit * incr;
1763       }
1764 
1765       if (pr->flags.ordered) {
1766         pr->u.p.ordered_lower = init;
1767         pr->u.p.ordered_upper = limit;
1768       } // if
1769     } // if
1770   } // case
1771   break;
1772   default: {
1773     status = 0; // to avoid complaints on uninitialized variable use
1774     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1775                 KMP_HNT(GetNewerLibrary), // Hint
1776                 __kmp_msg_null // Variadic argument list terminator
1777                 );
1778   } break;
1779   } // switch
1780   if (p_last)
1781     *p_last = last;
1782 #ifdef KMP_DEBUG
1783   if (pr->flags.ordered) {
1784     char *buff;
1785     // create format specifiers before the debug output
1786     buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1787                             "ordered_lower:%%%s ordered_upper:%%%s\n",
1788                             traits_t<UT>::spec, traits_t<UT>::spec);
1789     KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1790     __kmp_str_free(&buff);
1791   }
1792   {
1793     char *buff;
1794     // create format specifiers before the debug output
1795     buff = __kmp_str_format(
1796         "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1797         "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1798         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1799     KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1800     __kmp_str_free(&buff);
1801   }
1802 #endif
1803   return status;
1804 }
1805 
/* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
   work), then tell OMPT the loop is over. In some cases
   __kmpc_dispatch_fini() is not called. */
1809 #if OMPT_SUPPORT && OMPT_OPTIONAL
1810 #define OMPT_LOOP_END                                                          \
1811   if (status == 0) {                                                           \
1812     if (ompt_enabled.ompt_callback_work) {                                     \
1813       ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
1814       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
1815       ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
1816           ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
1817           &(task_info->task_data), 0, codeptr);                                \
1818     }                                                                          \
1819   }
1820 // TODO: implement count
1821 #else
1822 #define OMPT_LOOP_END // no-op
1823 #endif
1824 
1825 #if KMP_STATS_ENABLED
1826 #define KMP_STATS_LOOP_END                                                     \
1827   {                                                                            \
1828     kmp_int64 u, l, t, i;                                                      \
1829     l = (kmp_int64)(*p_lb);                                                    \
1830     u = (kmp_int64)(*p_ub);                                                    \
1831     i = (kmp_int64)(pr->u.p.st);                                               \
1832     if (status == 0) {                                                         \
1833       t = 0;                                                                   \
1834       KMP_POP_PARTITIONED_TIMER();                                             \
1835     } else if (i == 1) {                                                       \
1836       if (u >= l)                                                              \
1837         t = u - l + 1;                                                         \
1838       else                                                                     \
1839         t = 0;                                                                 \
1840     } else if (i < 0) {                                                        \
1841       if (l >= u)                                                              \
1842         t = (l - u) / (-i) + 1;                                                \
1843       else                                                                     \
1844         t = 0;                                                                 \
1845     } else {                                                                   \
1846       if (u >= l)                                                              \
1847         t = (u - l) / i + 1;                                                   \
1848       else                                                                     \
1849         t = 0;                                                                 \
1850     }                                                                          \
1851     KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
1852   }
1853 #else
1854 #define KMP_STATS_LOOP_END /* Nothing */
1855 #endif
1856 
1857 template <typename T>
1858 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1859                                T *p_lb, T *p_ub,
1860                                typename traits_t<T>::signed_t *p_st
1861 #if OMPT_SUPPORT && OMPT_OPTIONAL
1862                                ,
1863                                void *codeptr
1864 #endif
1865                                ) {
1866 
1867   typedef typename traits_t<T>::unsigned_t UT;
1868   typedef typename traits_t<T>::signed_t ST;
  // This is potentially slightly misleading: schedule(runtime) will appear
  // here even if the actual runtime schedule is static. (This points out a
  // disadvantage of schedule(runtime): even when static scheduling is used,
  // it costs more than a compile-time choice of static scheduling would.)
1873   KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
1874 
1875   int status;
1876   dispatch_private_info_template<T> *pr;
1877   kmp_info_t *th = __kmp_threads[gtid];
1878   kmp_team_t *team = th->th.th_team;
1879 
1880   KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1881   KD_TRACE(
1882       1000,
1883       ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
1884        gtid, p_lb, p_ub, p_st, p_last));
1885 
1886   if (team->t.t_serialized) {
    /* NOTE: serialize this dispatch because we are not at the active level */
1888     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1889         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1890     KMP_DEBUG_ASSERT(pr);
1891 
1892     if ((status = (pr->u.p.tc != 0)) == 0) {
1893       *p_lb = 0;
1894       *p_ub = 0;
1895       //            if ( p_last != NULL )
1896       //                *p_last = 0;
1897       if (p_st != NULL)
1898         *p_st = 0;
1899       if (__kmp_env_consistency_check) {
1900         if (pr->pushed_ws != ct_none) {
1901           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1902         }
1903       }
1904     } else if (pr->flags.nomerge) {
1905       kmp_int32 last;
1906       T start;
1907       UT limit, trip, init;
1908       ST incr;
1909       T chunk = pr->u.p.parm1;
1910 
1911       KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1912                      gtid));
1913 
1914       init = chunk * pr->u.p.count++;
1915       trip = pr->u.p.tc - 1;
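      // Serialized team: the chunk counter lives in this thread's private
      // buffer, so the plain post-increment above hands out chunks in order
      // without any atomics.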
1916 
1917       if ((status = (init <= trip)) == 0) {
1918         *p_lb = 0;
1919         *p_ub = 0;
1920         //                if ( p_last != NULL )
1921         //                    *p_last = 0;
1922         if (p_st != NULL)
1923           *p_st = 0;
1924         if (__kmp_env_consistency_check) {
1925           if (pr->pushed_ws != ct_none) {
1926             pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1927           }
1928         }
1929       } else {
1930         start = pr->u.p.lb;
1931         limit = chunk + init - 1;
1932         incr = pr->u.p.st;
1933 
1934         if ((last = (limit >= trip)) != 0) {
1935           limit = trip;
1936 #if KMP_OS_WINDOWS
1937           pr->u.p.last_upper = pr->u.p.ub;
1938 #endif /* KMP_OS_WINDOWS */
1939         }
1940         if (p_last != NULL)
1941           *p_last = last;
1942         if (p_st != NULL)
1943           *p_st = incr;
1944         if (incr == 1) {
1945           *p_lb = start + init;
1946           *p_ub = start + limit;
1947         } else {
1948           *p_lb = start + init * incr;
1949           *p_ub = start + limit * incr;
1950         }
1951 
1952         if (pr->flags.ordered) {
1953           pr->u.p.ordered_lower = init;
1954           pr->u.p.ordered_upper = limit;
1955 #ifdef KMP_DEBUG
1956           {
1957             char *buff;
1958             // create format specifiers before the debug output
1959             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1960                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
1961                                     traits_t<UT>::spec, traits_t<UT>::spec);
1962             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1963                             pr->u.p.ordered_upper));
1964             __kmp_str_free(&buff);
1965           }
1966 #endif
1967         } // if
1968       } // if
1969     } else {
1970       pr->u.p.tc = 0;
1971       *p_lb = pr->u.p.lb;
1972       *p_ub = pr->u.p.ub;
1973 #if KMP_OS_WINDOWS
1974       pr->u.p.last_upper = *p_ub;
1975 #endif /* KMP_OS_WINDOWS */
1976       if (p_last != NULL)
1977         *p_last = TRUE;
1978       if (p_st != NULL)
1979         *p_st = pr->u.p.st;
1980     } // if
1981 #ifdef KMP_DEBUG
1982     {
1983       char *buff;
1984       // create format specifiers before the debug output
1985       buff = __kmp_str_format(
1986           "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
1987           "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
1988           traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1989       KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
1990       __kmp_str_free(&buff);
1991     }
1992 #endif
1993 #if INCLUDE_SSC_MARKS
1994     SSC_MARK_DISPATCH_NEXT();
1995 #endif
1996     OMPT_LOOP_END;
1997     KMP_STATS_LOOP_END;
1998     return status;
1999   } else {
2000     kmp_int32 last = 0;
2001     dispatch_shared_info_template<T> volatile *sh;
2002 
2003     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2004                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2005 
2006     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2007         th->th.th_dispatch->th_dispatch_pr_current);
2008     KMP_DEBUG_ASSERT(pr);
2009     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2010         th->th.th_dispatch->th_dispatch_sh_current);
2011     KMP_DEBUG_ASSERT(sh);
2012 
2013 #if KMP_USE_HIER_SCHED
2014     if (pr->flags.use_hier)
2015       status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2016     else
2017 #endif // KMP_USE_HIER_SCHED
2018       status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2019                                                 p_st, th->th.th_team_nproc,
2020                                                 th->th.th_info.ds.ds_tid);
2021     // status == 0: no more iterations to execute
2022     if (status == 0) {
2023       UT num_done;
2024 
2025       num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2026 #ifdef KMP_DEBUG
2027       {
2028         char *buff;
2029         // create format specifiers before the debug output
2030         buff = __kmp_str_format(
2031             "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2032             traits_t<UT>::spec);
2033         KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2034         __kmp_str_free(&buff);
2035       }
2036 #endif
2037 
2038 #if KMP_USE_HIER_SCHED
2039       pr->flags.use_hier = FALSE;
2040 #endif
2041       if ((ST)num_done == th->th.th_team_nproc - 1) {
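        // num_done is the pre-increment value, so reaching nproc - 1 means
        // this thread is the last one to finish the loop and may reset the
        // shared buffer for reuse.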
2042 #if (KMP_STATIC_STEAL_ENABLED)
2043         if (pr->schedule == kmp_sch_static_steal &&
2044             traits_t<T>::type_size > 4) {
2045           int i;
2046           kmp_info_t **other_threads = team->t.t_threads;
2047           // loop complete, safe to destroy locks used for stealing
2048           for (i = 0; i < th->th.th_team_nproc; ++i) {
2049             kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2050             KMP_ASSERT(lck != NULL);
2051             __kmp_destroy_lock(lck);
2052             __kmp_free(lck);
2053             other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2054           }
2055         }
2056 #endif
2057         /* NOTE: release this buffer to be reused */
2058 
2059         KMP_MB(); /* Flush all pending memory write invalidates.  */
2060 
2061         sh->u.s.num_done = 0;
2062         sh->u.s.iteration = 0;
2063 
2064         /* TODO replace with general release procedure? */
2065         if (pr->flags.ordered) {
2066           sh->u.s.ordered_iteration = 0;
2067         }
2068 
2069         KMP_MB(); /* Flush all pending memory write invalidates.  */
2070 
2071         sh->buffer_index += __kmp_dispatch_num_buffers;
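        // Dispatch buffers are recycled round-robin; advancing buffer_index
        // by the number of buffers retires this generation so the next loop
        // that maps to this slot can acquire it.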
2072         KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2073                        gtid, sh->buffer_index));
2074 
2075         KMP_MB(); /* Flush all pending memory write invalidates.  */
2076 
2077       } // if
2078       if (__kmp_env_consistency_check) {
2079         if (pr->pushed_ws != ct_none) {
2080           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2081         }
2082       }
2083 
2084       th->th.th_dispatch->th_deo_fcn = NULL;
2085       th->th.th_dispatch->th_dxo_fcn = NULL;
2086       th->th.th_dispatch->th_dispatch_sh_current = NULL;
2087       th->th.th_dispatch->th_dispatch_pr_current = NULL;
2088     } // if (status == 0)
2089 #if KMP_OS_WINDOWS
2090     else if (last) {
2091       pr->u.p.last_upper = pr->u.p.ub;
2092     }
2093 #endif /* KMP_OS_WINDOWS */
2094     if (p_last != NULL && status != 0)
2095       *p_last = last;
2096   } // if
2097 
2098 #ifdef KMP_DEBUG
2099   {
2100     char *buff;
2101     // create format specifiers before the debug output
2102     buff = __kmp_str_format(
2103         "__kmp_dispatch_next: T#%%d normal case: "
2104         "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2105         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2106     KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2107                   (p_last ? *p_last : 0), status));
2108     __kmp_str_free(&buff);
2109   }
2110 #endif
2111 #if INCLUDE_SSC_MARKS
2112   SSC_MARK_DISPATCH_NEXT();
2113 #endif
2114   OMPT_LOOP_END;
2115   KMP_STATS_LOOP_END;
2116   return status;
2117 }
2118 
2119 template <typename T>
2120 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2121                                   kmp_int32 *plastiter, T *plower, T *pupper,
2122                                   typename traits_t<T>::signed_t incr) {
2123   typedef typename traits_t<T>::unsigned_t UT;
2124   kmp_uint32 team_id;
2125   kmp_uint32 nteams;
2126   UT trip_count;
2127   kmp_team_t *team;
2128   kmp_info_t *th;
2129 
2130   KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2131   KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2132 #ifdef KMP_DEBUG
2133   typedef typename traits_t<T>::signed_t ST;
2134   {
2135     char *buff;
2136     // create format specifiers before the debug output
2137     buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2138                             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2139                             traits_t<T>::spec, traits_t<T>::spec,
2140                             traits_t<ST>::spec, traits_t<T>::spec);
2141     KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2142     __kmp_str_free(&buff);
2143   }
2144 #endif
2145 
2146   if (__kmp_env_consistency_check) {
2147     if (incr == 0) {
2148       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2149                             loc);
2150     }
2151     if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2152       // The loop is illegal.
      // Some zero-trip loops are maintained by the compiler, e.g.:
2154       //   for(i=10;i<0;++i) // lower >= upper - run-time check
2155       //   for(i=0;i>10;--i) // lower <= upper - run-time check
2156       //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2157       //   for(i=10;i<0;--i) // incr < 0       - compile-time check
      // The compiler does not check the following illegal loops:
2159       //   for(i=0;i<10;i+=incr) // where incr<0
2160       //   for(i=10;i>0;i-=incr) // where incr<0
2161       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2162     }
2163   }
2164   th = __kmp_threads[gtid];
2165   team = th->th.th_team;
2166 #if OMP_40_ENABLED
2167   KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2168   nteams = th->th.th_teams_size.nteams;
2169 #endif
2170   team_id = team->t.t_master_tid;
2171   KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2172 
2173   // compute global trip count
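  // E.g., lb = 0, ub = 9, incr = 2 covers i = 0, 2, 4, 6, 8, so
  // trip_count = 5.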
2174   if (incr == 1) {
2175     trip_count = *pupper - *plower + 1;
2176   } else if (incr == -1) {
2177     trip_count = *plower - *pupper + 1;
2178   } else if (incr > 0) {
2179     // upper-lower can exceed the limit of signed type
2180     trip_count = (UT)(*pupper - *plower) / incr + 1;
2181   } else {
2182     trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2183   }
2184 
2185   if (trip_count <= nteams) {
2186     KMP_DEBUG_ASSERT(
2187         __kmp_static == kmp_sch_static_greedy ||
2188         __kmp_static ==
2189             kmp_sch_static_balanced); // Unknown static scheduling type.
    // only some teams get a single iteration, the others get nothing
2191     if (team_id < trip_count) {
2192       *pupper = *plower = *plower + team_id * incr;
2193     } else {
2194       *plower = *pupper + incr; // zero-trip loop
2195     }
2196     if (plastiter != NULL)
2197       *plastiter = (team_id == trip_count - 1);
2198   } else {
2199     if (__kmp_static == kmp_sch_static_balanced) {
2200       UT chunk = trip_count / nteams;
2201       UT extras = trip_count % nteams;
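      // Distribute trip_count as evenly as possible: the first 'extras'
      // teams get chunk + 1 iterations, the rest get chunk.  E.g.,
      // trip_count = 10, nteams = 4 gives per-team counts 3, 3, 2, 2.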
2202       *plower +=
2203           incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2204       *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2205       if (plastiter != NULL)
2206         *plastiter = (team_id == nteams - 1);
2207     } else {
2208       T chunk_inc_count =
2209           (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2210       T upper = *pupper;
2211       KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2212       // Unknown static scheduling type.
2213       *plower += team_id * chunk_inc_count;
2214       *pupper = *plower + chunk_inc_count - incr;
2215       // Check/correct bounds if needed
2216       if (incr > 0) {
2217         if (*pupper < *plower)
2218           *pupper = traits_t<T>::max_value;
2219         if (plastiter != NULL)
2220           *plastiter = *plower <= upper && *pupper > upper - incr;
2221         if (*pupper > upper)
2222           *pupper = upper; // tracker C73258
2223       } else {
2224         if (*pupper > *plower)
2225           *pupper = traits_t<T>::min_value;
2226         if (plastiter != NULL)
2227           *plastiter = *plower >= upper && *pupper < upper - incr;
2228         if (*pupper < upper)
2229           *pupper = upper; // tracker C73258
2230       }
2231     }
2232   }
2233 }
2234 
2235 //-----------------------------------------------------------------------------
2236 // Dispatch routines
2237 //    Transfer call to template< type T >
2238 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2239 //                         T lb, T ub, ST st, ST chunk )
2240 extern "C" {
2241 
2242 /*!
2243 @ingroup WORK_SHARING
2244 @{
2245 @param loc Source location
2246 @param gtid Global thread id
2247 @param schedule Schedule type
2248 @param lb  Lower bound
2249 @param ub  Upper bound
2250 @param st  Step (or increment if you prefer)
2251 @param chunk The chunk size to block with
2252 
2253 This function prepares the runtime to start a dynamically scheduled for loop,
2254 saving the loop arguments.
2255 These functions are all identical apart from the types of the arguments.
2256 */
2257 
2258 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2259                             enum sched_type schedule, kmp_int32 lb,
2260                             kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2261   KMP_DEBUG_ASSERT(__kmp_init_serial);
2262 #if OMPT_SUPPORT && OMPT_OPTIONAL
2263   OMPT_STORE_RETURN_ADDRESS(gtid);
2264 #endif
2265   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2266 }
2267 /*!
2268 See @ref __kmpc_dispatch_init_4
2269 */
2270 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2271                              enum sched_type schedule, kmp_uint32 lb,
2272                              kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2273   KMP_DEBUG_ASSERT(__kmp_init_serial);
2274 #if OMPT_SUPPORT && OMPT_OPTIONAL
2275   OMPT_STORE_RETURN_ADDRESS(gtid);
2276 #endif
2277   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2278 }
2279 
2280 /*!
2281 See @ref __kmpc_dispatch_init_4
2282 */
2283 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2284                             enum sched_type schedule, kmp_int64 lb,
2285                             kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2286   KMP_DEBUG_ASSERT(__kmp_init_serial);
2287 #if OMPT_SUPPORT && OMPT_OPTIONAL
2288   OMPT_STORE_RETURN_ADDRESS(gtid);
2289 #endif
2290   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2291 }
2292 
2293 /*!
2294 See @ref __kmpc_dispatch_init_4
2295 */
2296 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2297                              enum sched_type schedule, kmp_uint64 lb,
2298                              kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2299   KMP_DEBUG_ASSERT(__kmp_init_serial);
2300 #if OMPT_SUPPORT && OMPT_OPTIONAL
2301   OMPT_STORE_RETURN_ADDRESS(gtid);
2302 #endif
2303   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2304 }
2305 
2306 /*!
2307 See @ref __kmpc_dispatch_init_4
2308 
These functions differ from the __kmpc_dispatch_init set in that they are
called for the composite distribute parallel for construct. Thus, before
dispatching the regular iterations, the per-team iteration space must be
calculated.
2312 
2313 These functions are all identical apart from the types of the arguments.
2314 */
2315 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2316                                  enum sched_type schedule, kmp_int32 *p_last,
2317                                  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2318                                  kmp_int32 chunk) {
2319   KMP_DEBUG_ASSERT(__kmp_init_serial);
2320 #if OMPT_SUPPORT && OMPT_OPTIONAL
2321   OMPT_STORE_RETURN_ADDRESS(gtid);
2322 #endif
2323   __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2324   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2325 }
2326 
2327 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2328                                   enum sched_type schedule, kmp_int32 *p_last,
2329                                   kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2330                                   kmp_int32 chunk) {
2331   KMP_DEBUG_ASSERT(__kmp_init_serial);
2332 #if OMPT_SUPPORT && OMPT_OPTIONAL
2333   OMPT_STORE_RETURN_ADDRESS(gtid);
2334 #endif
2335   __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2336   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2337 }
2338 
2339 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2340                                  enum sched_type schedule, kmp_int32 *p_last,
2341                                  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2342                                  kmp_int64 chunk) {
2343   KMP_DEBUG_ASSERT(__kmp_init_serial);
2344 #if OMPT_SUPPORT && OMPT_OPTIONAL
2345   OMPT_STORE_RETURN_ADDRESS(gtid);
2346 #endif
2347   __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2348   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2349 }
2350 
2351 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2352                                   enum sched_type schedule, kmp_int32 *p_last,
2353                                   kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2354                                   kmp_int64 chunk) {
2355   KMP_DEBUG_ASSERT(__kmp_init_serial);
2356 #if OMPT_SUPPORT && OMPT_OPTIONAL
2357   OMPT_STORE_RETURN_ADDRESS(gtid);
2358 #endif
2359   __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2360   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2361 }
2362 
2363 /*!
2364 @param loc Source code location
2365 @param gtid Global thread id
2366 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2367 otherwise
2368 @param p_lb   Pointer to the lower bound for the next chunk of work
2369 @param p_ub   Pointer to the upper bound for the next chunk of work
2370 @param p_st   Pointer to the stride for the next chunk of work
2371 @return one if there is work to be done, zero otherwise
2372 
2373 Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then lb, ub and stride need not be modified.
2375 */
2376 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2377                            kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2378 #if OMPT_SUPPORT && OMPT_OPTIONAL
2379   OMPT_STORE_RETURN_ADDRESS(gtid);
2380 #endif
2381   return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2382 #if OMPT_SUPPORT && OMPT_OPTIONAL
2383                                         ,
2384                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2385 #endif
2386                                             );
2387 }
2388 
2389 /*!
2390 See @ref __kmpc_dispatch_next_4
2391 */
2392 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2393                             kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2394                             kmp_int32 *p_st) {
2395 #if OMPT_SUPPORT && OMPT_OPTIONAL
2396   OMPT_STORE_RETURN_ADDRESS(gtid);
2397 #endif
2398   return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2399 #if OMPT_SUPPORT && OMPT_OPTIONAL
2400                                          ,
2401                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2402 #endif
2403                                              );
2404 }
2405 
2406 /*!
2407 See @ref __kmpc_dispatch_next_4
2408 */
2409 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2410                            kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2411 #if OMPT_SUPPORT && OMPT_OPTIONAL
2412   OMPT_STORE_RETURN_ADDRESS(gtid);
2413 #endif
2414   return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2415 #if OMPT_SUPPORT && OMPT_OPTIONAL
2416                                         ,
2417                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2418 #endif
2419                                             );
2420 }
2421 
2422 /*!
2423 See @ref __kmpc_dispatch_next_4
2424 */
2425 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2426                             kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2427                             kmp_int64 *p_st) {
2428 #if OMPT_SUPPORT && OMPT_OPTIONAL
2429   OMPT_STORE_RETURN_ADDRESS(gtid);
2430 #endif
2431   return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2432 #if OMPT_SUPPORT && OMPT_OPTIONAL
2433                                          ,
2434                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2435 #endif
2436                                              );
2437 }
2438 
2439 /*!
2440 @param loc Source code location
2441 @param gtid Global thread id
2442 
2443 Mark the end of a dynamic loop.
2444 */
2445 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2446   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2447 }
2448 
2449 /*!
2450 See @ref __kmpc_dispatch_fini_4
2451 */
2452 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2453   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2454 }
2455 
2456 /*!
2457 See @ref __kmpc_dispatch_fini_4
2458 */
2459 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2460   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2461 }
2462 
2463 /*!
2464 See @ref __kmpc_dispatch_fini_4
2465 */
2466 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2467   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2468 }
2469 /*! @} */
2470 
2471 //-----------------------------------------------------------------------------
2472 // Non-template routines from kmp_dispatch.cpp used in other sources
2473 
2474 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2475   return value == checker;
2476 }
2477 
2478 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2479   return value != checker;
2480 }
2481 
2482 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2483   return value < checker;
2484 }
2485 
2486 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2487   return value >= checker;
2488 }
2489 
2490 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2491   return value <= checker;
2492 }
2493 
2494 kmp_uint32
2495 __kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2496                    kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2497                    void *obj // Higher-level synchronization object, or NULL.
2498                    ) {
2499   // note: we may not belong to a team at this point
2500   volatile kmp_uint32 *spin = spinner;
2501   kmp_uint32 check = checker;
2502   kmp_uint32 spins;
2503   kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2504   kmp_uint32 r;
2505 
2506   KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2507   KMP_INIT_YIELD(spins);
2508   // main wait spin loop
2509   while (!f(r = TCR_4(*spin), check)) {
2510     KMP_FSYNC_SPIN_PREPARE(obj);
2511     /* GEH - remove this since it was accidentally introduced when kmp_wait was
2512        split. It causes problems with infinite recursion because of exit lock */
2513     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2514         __kmp_abort_thread(); */
2515 
2516     /* if we have waited a bit, or are oversubscribed, yield */
2517     /* pause is in the following code */
2518     KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2519     KMP_YIELD_SPIN(spins);
2520   }
2521   KMP_FSYNC_SPIN_ACQUIRED(obj);
2522   return r;
2523 }
2524 
2525 void __kmp_wait_yield_4_ptr(
2526     void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32),
2527     void *obj // Higher-level synchronization object, or NULL.
2528     ) {
2529   // note: we may not belong to a team at this point
2530   void *spin = spinner;
2531   kmp_uint32 check = checker;
2532   kmp_uint32 spins;
2533   kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2534 
2535   KMP_FSYNC_SPIN_INIT(obj, spin);
2536   KMP_INIT_YIELD(spins);
2537   // main wait spin loop
2538   while (!f(spin, check)) {
2539     KMP_FSYNC_SPIN_PREPARE(obj);
2540     /* if we have waited a bit, or are oversubscribed, yield */
2541     /* pause is in the following code */
2542     KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2543     KMP_YIELD_SPIN(spins);
2544   }
2545   KMP_FSYNC_SPIN_ACQUIRED(obj);
2546 }
2547 
2548 } // extern "C"
2549 
2550 #ifdef KMP_GOMP_COMPAT
2551 
2552 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2553                                enum sched_type schedule, kmp_int32 lb,
2554                                kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2555                                int push_ws) {
2556   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2557                                  push_ws);
2558 }
2559 
2560 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2561                                 enum sched_type schedule, kmp_uint32 lb,
2562                                 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2563                                 int push_ws) {
2564   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2565                                   push_ws);
2566 }
2567 
2568 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2569                                enum sched_type schedule, kmp_int64 lb,
2570                                kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2571                                int push_ws) {
2572   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2573                                  push_ws);
2574 }
2575 
2576 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2577                                 enum sched_type schedule, kmp_uint64 lb,
2578                                 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2579                                 int push_ws) {
2580   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2581                                   push_ws);
2582 }
2583 
2584 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2585   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2586 }
2587 
2588 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2589   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2590 }
2591 
2592 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2593   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2594 }
2595 
2596 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2597   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2598 }
2599 
2600 #endif /* KMP_GOMP_COMPAT */
2601 
2602 /* ------------------------------------------------------------------------ */
2603