1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 /* Dynamic scheduling initialization and dispatch.
14  *
 * NOTE: __kmp_nth is constant inside any dispatch loop; however, it may
 *       change between parallel regions.  __kmp_max_nth is the largest
 *       value __kmp_nth may take, and 1 is the smallest.
18  */
19 
20 #include "kmp.h"
21 #include "kmp_error.h"
22 #include "kmp_i18n.h"
23 #include "kmp_itt.h"
24 #include "kmp_stats.h"
25 #include "kmp_str.h"
26 #if KMP_USE_X87CONTROL
27 #include <float.h>
28 #endif
29 #include "kmp_lock.h"
30 #include "kmp_dispatch.h"
31 #if KMP_USE_HIER_SCHED
32 #include "kmp_dispatch_hier.h"
33 #endif
34 
35 #if OMPT_SUPPORT
36 #include "ompt-specific.h"
37 #endif
38 
39 /* ------------------------------------------------------------------------ */
40 /* ------------------------------------------------------------------------ */
41 
42 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43   kmp_info_t *th;
44 
45   KMP_DEBUG_ASSERT(gtid_ref);
46 
47   if (__kmp_env_consistency_check) {
48     th = __kmp_threads[*gtid_ref];
49     if (th->th.th_root->r.r_active &&
50         (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51 #if KMP_USE_DYNAMIC_LOCK
52       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53 #else
54       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55 #endif
56     }
57   }
58 }
59 
60 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61   kmp_info_t *th;
62 
63   if (__kmp_env_consistency_check) {
64     th = __kmp_threads[*gtid_ref];
65     if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66       __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67     }
68   }
69 }
70 
71 // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
72 static inline int __kmp_get_monotonicity(enum sched_type schedule,
73                                          bool use_hier = false) {
74   // Pick up the nonmonotonic/monotonic bits from the scheduling type
75   int monotonicity;
76   // default to monotonic
77   monotonicity = SCHEDULE_MONOTONIC;
78   if (SCHEDULE_HAS_NONMONOTONIC(schedule))
79     monotonicity = SCHEDULE_NONMONOTONIC;
80   else if (SCHEDULE_HAS_MONOTONIC(schedule))
81     monotonicity = SCHEDULE_MONOTONIC;
82   return monotonicity;
83 }
84 
// Initialize a dispatch_private_info_template<T> buffer for a particular
// type of schedule and chunk.  The loop description is found in lb (lower
// bound), ub (upper bound), and st (stride).  nproc is the number of threads
// relevant to the scheduling (often the number of threads in a team, but not
// always if hierarchical scheduling is used).  tid is the id of the thread
// calling the function within the group of nproc threads; it will have a
// value between 0 and nproc - 1.  This is often just the thread id within a
// team, but that is not necessarily the case when using hierarchical
// scheduling.  loc is the source file location of the corresponding loop.
// gtid is the global thread id.
95 template <typename T>
96 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
97                                    dispatch_private_info_template<T> *pr,
98                                    enum sched_type schedule, T lb, T ub,
99                                    typename traits_t<T>::signed_t st,
100 #if USE_ITT_BUILD
101                                    kmp_uint64 *cur_chunk,
102 #endif
103                                    typename traits_t<T>::signed_t chunk,
104                                    T nproc, T tid) {
105   typedef typename traits_t<T>::unsigned_t UT;
106   typedef typename traits_t<T>::floating_t DBL;
107 
108   int active;
109   T tc;
110   kmp_info_t *th;
111   kmp_team_t *team;
112   int monotonicity;
113   bool use_hier;
114 
115 #ifdef KMP_DEBUG
116   typedef typename traits_t<T>::signed_t ST;
117   {
118     char *buff;
119     // create format specifiers before the debug output
120     buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
121                             "pr:%%p lb:%%%s ub:%%%s st:%%%s "
122                             "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
123                             traits_t<T>::spec, traits_t<T>::spec,
124                             traits_t<ST>::spec, traits_t<ST>::spec,
125                             traits_t<T>::spec, traits_t<T>::spec);
126     KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
127     __kmp_str_free(&buff);
128   }
129 #endif
130   /* setup data */
131   th = __kmp_threads[gtid];
132   team = th->th.th_team;
133   active = !team->t.t_serialized;
134 
135 #if USE_ITT_BUILD
136   int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
137                                     __kmp_forkjoin_frames_mode == 3 &&
138                                     KMP_MASTER_GTID(gtid) &&
139 #if OMP_40_ENABLED
140                                     th->th.th_teams_microtask == NULL &&
141 #endif
142                                     team->t.t_active_level == 1;
143 #endif
144 
145 #if KMP_USE_HIER_SCHED
146   use_hier = pr->flags.use_hier;
147 #else
148   use_hier = false;
149 #endif
150 
151   /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
152   monotonicity = __kmp_get_monotonicity(schedule, use_hier);
153   schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
154 
155   /* Pick up the nomerge/ordered bits from the scheduling type */
156   if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
157     pr->flags.nomerge = TRUE;
158     schedule =
159         (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
160   } else {
161     pr->flags.nomerge = FALSE;
162   }
163   pr->type_size = traits_t<T>::type_size; // remember the size of variables
164   if (kmp_ord_lower & schedule) {
165     pr->flags.ordered = TRUE;
166     schedule =
167         (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
168   } else {
169     pr->flags.ordered = FALSE;
170   }
171   // Ordered overrides nonmonotonic
172   if (pr->flags.ordered) {
173     monotonicity = SCHEDULE_MONOTONIC;
174   }
175 
176   if (schedule == kmp_sch_static) {
177     schedule = __kmp_static;
178   } else {
179     if (schedule == kmp_sch_runtime) {
180       // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
181       // not specified)
182       schedule = team->t.t_sched.r_sched_type;
183       monotonicity = __kmp_get_monotonicity(schedule, use_hier);
184       schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
185       // Detail the schedule if needed (global controls are differentiated
186       // appropriately)
187       if (schedule == kmp_sch_guided_chunked) {
188         schedule = __kmp_guided;
189       } else if (schedule == kmp_sch_static) {
190         schedule = __kmp_static;
191       }
192       // Use the chunk size specified by OMP_SCHEDULE (or default if not
193       // specified)
194       chunk = team->t.t_sched.chunk;
195 #if USE_ITT_BUILD
196       if (cur_chunk)
197         *cur_chunk = chunk;
198 #endif
199 #ifdef KMP_DEBUG
200       {
201         char *buff;
202         // create format specifiers before the debug output
203         buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
204                                 "schedule:%%d chunk:%%%s\n",
205                                 traits_t<ST>::spec);
206         KD_TRACE(10, (buff, gtid, schedule, chunk));
207         __kmp_str_free(&buff);
208       }
209 #endif
210     } else {
211       if (schedule == kmp_sch_guided_chunked) {
212         schedule = __kmp_guided;
213       }
214       if (chunk <= 0) {
215         chunk = KMP_DEFAULT_CHUNK;
216       }
217     }
218 
219     if (schedule == kmp_sch_auto) {
      // mapping and differentiation are done in __kmp_do_serial_initialize()
221       schedule = __kmp_auto;
222 #ifdef KMP_DEBUG
223       {
224         char *buff;
225         // create format specifiers before the debug output
226         buff = __kmp_str_format(
227             "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
228             "schedule:%%d chunk:%%%s\n",
229             traits_t<ST>::spec);
230         KD_TRACE(10, (buff, gtid, schedule, chunk));
231         __kmp_str_free(&buff);
232       }
233 #endif
234     }
235 #if KMP_STATIC_STEAL_ENABLED
236     // map nonmonotonic:dynamic to static steal
237     if (schedule == kmp_sch_dynamic_chunked) {
238       if (monotonicity == SCHEDULE_NONMONOTONIC)
239         schedule = kmp_sch_static_steal;
240     }
241 #endif
242     /* guided analytical not safe for too many threads */
243     if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
244       schedule = kmp_sch_guided_iterative_chunked;
245       KMP_WARNING(DispatchManyThreads);
246     }
247 #if OMP_45_ENABLED
248     if (schedule == kmp_sch_runtime_simd) {
249       // compiler provides simd_width in the chunk parameter
250       schedule = team->t.t_sched.r_sched_type;
251       monotonicity = __kmp_get_monotonicity(schedule, use_hier);
252       schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
253       // Detail the schedule if needed (global controls are differentiated
254       // appropriately)
255       if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
256           schedule == __kmp_static) {
257         schedule = kmp_sch_static_balanced_chunked;
258       } else {
259         if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
260           schedule = kmp_sch_guided_simd;
261         }
262         chunk = team->t.t_sched.chunk * chunk;
263       }
264 #if USE_ITT_BUILD
265       if (cur_chunk)
266         *cur_chunk = chunk;
267 #endif
268 #ifdef KMP_DEBUG
269       {
270         char *buff;
271         // create format specifiers before the debug output
272         buff = __kmp_str_format(
273             "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
274             " chunk:%%%s\n",
275             traits_t<ST>::spec);
276         KD_TRACE(10, (buff, gtid, schedule, chunk));
277         __kmp_str_free(&buff);
278       }
279 #endif
280     }
281 #endif // OMP_45_ENABLED
282     pr->u.p.parm1 = chunk;
283   }
284   KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
285               "unknown scheduling type");
286 
287   pr->u.p.count = 0;
288 
289   if (__kmp_env_consistency_check) {
290     if (st == 0) {
291       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
292                             (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
293     }
294   }
295   // compute trip count
296   if (st == 1) { // most common case
297     if (ub >= lb) {
298       tc = ub - lb + 1;
299     } else { // ub < lb
300       tc = 0; // zero-trip
301     }
302   } else if (st < 0) {
303     if (lb >= ub) {
304       // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
305       // where the division needs to be unsigned regardless of the result type
306       tc = (UT)(lb - ub) / (-st) + 1;
307     } else { // lb < ub
308       tc = 0; // zero-trip
309     }
310   } else { // st > 0
311     if (ub >= lb) {
312       // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
313       // where the division needs to be unsigned regardless of the result type
314       tc = (UT)(ub - lb) / st + 1;
315     } else { // ub < lb
316       tc = 0; // zero-trip
317     }
318   }
319 
320 #if KMP_STATS_ENABLED
321   if (KMP_MASTER_GTID(gtid)) {
322     KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
323   }
324 #endif
325 
326   pr->u.p.lb = lb;
327   pr->u.p.ub = ub;
328   pr->u.p.st = st;
329   pr->u.p.tc = tc;
330 
331 #if KMP_OS_WINDOWS
332   pr->u.p.last_upper = ub + st;
333 #endif /* KMP_OS_WINDOWS */
334 
  /* NOTE: only the active parallel region(s) have active ordered sections */
336 
337   if (active) {
338     if (pr->flags.ordered) {
339       pr->ordered_bumped = 0;
340       pr->u.p.ordered_lower = 1;
341       pr->u.p.ordered_upper = 0;
342     }
343   }
344 
345   switch (schedule) {
346 #if (KMP_STATIC_STEAL_ENABLED)
347   case kmp_sch_static_steal: {
348     T ntc, init;
349 
350     KD_TRACE(100,
351              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
352               gtid));
353 
354     ntc = (tc % chunk ? 1 : 0) + tc / chunk;
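    // ntc is the total number of chunks in the loop, i.e. ceil(tc / chunk).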
355     if (nproc > 1 && ntc >= nproc) {
356       KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
357       T id = tid;
358       T small_chunk, extras;
359 
360       small_chunk = ntc / nproc;
361       extras = ntc % nproc;
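      // Chunks are distributed as evenly as possible: every thread gets
      // small_chunk chunks, and the first 'extras' threads get one extra.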
362 
363       init = id * small_chunk + (id < extras ? id : extras);
364       pr->u.p.count = init;
365       pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
366 
367       pr->u.p.parm2 = lb;
      // parm3 is the number of times to attempt stealing, which is
      // proportional to the number of chunks per thread, up to a maximum
      // of nproc.
371       pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc);
372       pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
373       pr->u.p.st = st;
374       if (traits_t<T>::type_size > 4) {
375         // AC: TODO: check if 16-byte CAS available and use it to
376         // improve performance (probably wait for explicit request
377         // before spending time on this).
378         // For now use dynamically allocated per-thread lock,
379         // free memory in __kmp_dispatch_next when status==0.
380         KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
381         th->th.th_dispatch->th_steal_lock =
382             (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
383         __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
384       }
385       break;
386     } else {
387       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
388                      "kmp_sch_static_balanced\n",
389                      gtid));
390       schedule = kmp_sch_static_balanced;
391       /* too few iterations: fall-through to kmp_sch_static_balanced */
392     } // if
393     /* FALL-THROUGH to static balanced */
394     KMP_FALLTHROUGH();
395   } // case
396 #endif
397   case kmp_sch_static_balanced: {
398     T init, limit;
399 
400     KD_TRACE(
401         100,
402         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
403          gtid));
404 
405     if (nproc > 1) {
406       T id = tid;
407 
408       if (tc < nproc) {
409         if (id < tc) {
410           init = id;
411           limit = id;
412           pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
413         } else {
414           pr->u.p.count = 1; /* means no more chunks to execute */
415           pr->u.p.parm1 = FALSE;
416           break;
417         }
418       } else {
419         T small_chunk = tc / nproc;
420         T extras = tc % nproc;
421         init = id * small_chunk + (id < extras ? id : extras);
422         limit = init + small_chunk - (id < extras ? 0 : 1);
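        // init..limit is this thread's inclusive range of iteration indices;
        // the first 'extras' threads take one extra iteration each.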
423         pr->u.p.parm1 = (id == nproc - 1);
424       }
425     } else {
426       if (tc > 0) {
427         init = 0;
428         limit = tc - 1;
429         pr->u.p.parm1 = TRUE;
430       } else {
431         // zero trip count
432         pr->u.p.count = 1; /* means no more chunks to execute */
433         pr->u.p.parm1 = FALSE;
434         break;
435       }
436     }
437 #if USE_ITT_BUILD
438     // Calculate chunk for metadata report
439     if (itt_need_metadata_reporting)
440       if (cur_chunk)
441         *cur_chunk = limit - init + 1;
442 #endif
443     if (st == 1) {
444       pr->u.p.lb = lb + init;
445       pr->u.p.ub = lb + limit;
446     } else {
      // ub_tmp is the calculated upper bound, "ub" is the user-defined one
448       T ub_tmp = lb + limit * st;
449       pr->u.p.lb = lb + init * st;
450       // adjust upper bound to "ub" if needed, so that MS lastprivate will match
451       // it exactly
452       if (st > 0) {
453         pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
454       } else {
455         pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
456       }
457     }
458     if (pr->flags.ordered) {
459       pr->u.p.ordered_lower = init;
460       pr->u.p.ordered_upper = limit;
461     }
462     break;
463   } // case
464 #if OMP_45_ENABLED
465   case kmp_sch_static_balanced_chunked: {
466     // similar to balanced, but chunk adjusted to multiple of simd width
467     T nth = nproc;
468     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
469                    " -> falling-through to static_greedy\n",
470                    gtid));
471     schedule = kmp_sch_static_greedy;
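    // For nth > 1, parm1 is ceil(tc / nth) rounded up to a multiple of chunk
    // (typically the simd width); the mask trick assumes chunk is a power of
    // two.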
472     if (nth > 1)
473       pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
474     else
475       pr->u.p.parm1 = tc;
476     break;
477   } // case
478   case kmp_sch_guided_simd:
479 #endif // OMP_45_ENABLED
480   case kmp_sch_guided_iterative_chunked: {
481     KD_TRACE(
482         100,
483         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
484          " case\n",
485          gtid));
486 
487     if (nproc > 1) {
488       if ((2L * chunk + 1) * nproc >= tc) {
489         /* chunk size too large, switch to dynamic */
490         schedule = kmp_sch_dynamic_chunked;
491       } else {
492         // when remaining iters become less than parm2 - switch to dynamic
493         pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
494         *(double *)&pr->u.p.parm3 =
495             guided_flt_param / nproc; // may occupy parm3 and parm4
496       }
497     } else {
498       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
499                      "kmp_sch_static_greedy\n",
500                      gtid));
501       schedule = kmp_sch_static_greedy;
502       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
503       KD_TRACE(
504           100,
505           ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
506            gtid));
507       pr->u.p.parm1 = tc;
508     } // if
509   } // case
510   break;
511   case kmp_sch_guided_analytical_chunked: {
512     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
513                    "kmp_sch_guided_analytical_chunked case\n",
514                    gtid));
515 
516     if (nproc > 1) {
517       if ((2L * chunk + 1) * nproc >= tc) {
518         /* chunk size too large, switch to dynamic */
519         schedule = kmp_sch_dynamic_chunked;
520       } else {
521         /* commonly used term: (2 nproc - 1)/(2 nproc) */
522         DBL x;
523 
524 #if KMP_USE_X87CONTROL
525         /* Linux* OS already has 64-bit computation by default for long double,
526            and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
527            Windows* OS on IA-32 architecture, we need to set precision to 64-bit
528            instead of the default 53-bit. Even though long double doesn't work
529            on Windows* OS on Intel(R) 64, the resulting lack of precision is not
530            expected to impact the correctness of the algorithm, but this has not
531            been mathematically proven. */
532         // save original FPCW and set precision to 64-bit, as
533         // Windows* OS on IA-32 architecture defaults to 53-bit
534         unsigned int oldFpcw = _control87(0, 0);
535         _control87(_PC_64, _MCW_PC); // 0,0x30000
536 #endif
537         /* value used for comparison in solver for cross-over point */
538         long double target = ((long double)chunk * 2 + 1) * nproc / tc;
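        // Under guided-analytical scheduling the chunk at index i is roughly
        // tc * x^i / (2 * nproc), so the crossover is the first i for which
        // x^i <= target, i.e. where that chunk shrinks to about 'chunk'.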
539 
540         /* crossover point--chunk indexes equal to or greater than
541            this point switch to dynamic-style scheduling */
542         UT cross;
543 
544         /* commonly used term: (2 nproc - 1)/(2 nproc) */
545         x = (long double)1.0 - (long double)0.5 / nproc;
546 
547 #ifdef KMP_DEBUG
548         { // test natural alignment
549           struct _test_a {
550             char a;
551             union {
552               char b;
553               DBL d;
554             };
555           } t;
556           ptrdiff_t natural_alignment =
557               (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
558           //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
559           // long)natural_alignment );
560           KMP_DEBUG_ASSERT(
561               (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
562         }
563 #endif // KMP_DEBUG
564 
565         /* save the term in thread private dispatch structure */
566         *(DBL *)&pr->u.p.parm3 = x;
567 
568         /* solve for the crossover point to the nearest integer i for which C_i
569            <= chunk */
570         {
571           UT left, right, mid;
572           long double p;
573 
574           /* estimate initial upper and lower bound */
575 
576           /* doesn't matter what value right is as long as it is positive, but
577              it affects performance of the solver */
578           right = 229;
579           p = __kmp_pow<UT>(x, right);
580           if (p > target) {
581             do {
582               p *= p;
583               right <<= 1;
584             } while (p > target && right < (1 << 27));
585             /* lower bound is previous (failed) estimate of upper bound */
586             left = right >> 1;
587           } else {
588             left = 0;
589           }
590 
591           /* bisection root-finding method */
592           while (left + 1 < right) {
593             mid = (left + right) / 2;
594             if (__kmp_pow<UT>(x, mid) > target) {
595               left = mid;
596             } else {
597               right = mid;
598             }
599           } // while
600           cross = right;
601         }
602         /* assert sanity of computed crossover point */
603         KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
604                    __kmp_pow<UT>(x, cross) <= target);
605 
606         /* save the crossover point in thread private dispatch structure */
607         pr->u.p.parm2 = cross;
608 
609 // C75803
610 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
611 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
612 #else
613 #define GUIDED_ANALYTICAL_WORKAROUND (x)
614 #endif
615         /* dynamic-style scheduling offset */
616         pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
617                                  tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
618                         cross * chunk;
619 #if KMP_USE_X87CONTROL
620         // restore FPCW
621         _control87(oldFpcw, _MCW_PC);
622 #endif
623       } // if
624     } else {
625       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
626                      "kmp_sch_static_greedy\n",
627                      gtid));
628       schedule = kmp_sch_static_greedy;
629       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
630       pr->u.p.parm1 = tc;
631     } // if
632   } // case
633   break;
634   case kmp_sch_static_greedy:
635     KD_TRACE(
636         100,
637         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
638          gtid));
639     pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
640     break;
641   case kmp_sch_static_chunked:
642   case kmp_sch_dynamic_chunked:
643     if (pr->u.p.parm1 <= 0) {
644       pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
645     }
646     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
647                    "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
648                    gtid));
649     break;
650   case kmp_sch_trapezoidal: {
651     /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
652 
653     T parm1, parm2, parm3, parm4;
654     KD_TRACE(100,
655              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
656               gtid));
657 
658     parm1 = chunk;
659 
660     /* F : size of the first cycle */
661     parm2 = (tc / (2 * nproc));
662 
663     if (parm2 < 1) {
664       parm2 = 1;
665     }
666 
667     /* L : size of the last cycle.  Make sure the last cycle is not larger
668        than the first cycle. */
669     if (parm1 < 1) {
670       parm1 = 1;
671     } else if (parm1 > parm2) {
672       parm1 = parm2;
673     }
674 
675     /* N : number of cycles */
676     parm3 = (parm2 + parm1);
677     parm3 = (2 * tc + parm3 - 1) / parm3;
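    // i.e. parm3 = ceil(2 * tc / (parm1 + parm2)): with chunk sizes decreasing
    // linearly from parm2 down to parm1, the series sum
    // parm3 * (parm1 + parm2) / 2 covers all tc iterations.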
678 
679     if (parm3 < 2) {
680       parm3 = 2;
681     }
682 
683     /* sigma : decreasing incr of the trapezoid */
684     parm4 = (parm3 - 1);
685     parm4 = (parm2 - parm1) / parm4;
686 
687     // pointless check, because parm4 >= 0 always
688     // if ( parm4 < 0 ) {
689     //    parm4 = 0;
690     //}
691 
692     pr->u.p.parm1 = parm1;
693     pr->u.p.parm2 = parm2;
694     pr->u.p.parm3 = parm3;
695     pr->u.p.parm4 = parm4;
696   } // case
697   break;
698 
699   default: {
700     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
701                 KMP_HNT(GetNewerLibrary), // Hint
702                 __kmp_msg_null // Variadic argument list terminator
703                 );
704   } break;
705   } // switch
706   pr->schedule = schedule;
707 }
708 
709 #if KMP_USE_HIER_SCHED
710 template <typename T>
711 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
712                                              typename traits_t<T>::signed_t st);
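// The specializations below pass small_chunks for the 32-bit types and
// large_chunks for the 64-bit types.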
713 template <>
714 inline void
715 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
716                                             kmp_int32 ub, kmp_int32 st) {
717   __kmp_dispatch_init_hierarchy<kmp_int32>(
718       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
719       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
720 }
721 template <>
722 inline void
723 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
724                                              kmp_uint32 ub, kmp_int32 st) {
725   __kmp_dispatch_init_hierarchy<kmp_uint32>(
726       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
727       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
728 }
729 template <>
730 inline void
731 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
732                                             kmp_int64 ub, kmp_int64 st) {
733   __kmp_dispatch_init_hierarchy<kmp_int64>(
734       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
735       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
736 }
737 template <>
738 inline void
739 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
740                                              kmp_uint64 ub, kmp_int64 st) {
741   __kmp_dispatch_init_hierarchy<kmp_uint64>(
742       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
743       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
744 }
745 
746 // free all the hierarchy scheduling memory associated with the team
747 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
748   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
749   for (int i = 0; i < num_disp_buff; ++i) {
750     // type does not matter here so use kmp_int32
751     auto sh =
752         reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
753             &team->t.t_disp_buffer[i]);
754     if (sh->hier) {
755       sh->hier->deallocate();
756       __kmp_free(sh->hier);
757     }
758   }
759 }
760 #endif
761 
762 // UT - unsigned flavor of T, ST - signed flavor of T,
763 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
764 template <typename T>
765 static void
766 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
767                     T ub, typename traits_t<T>::signed_t st,
768                     typename traits_t<T>::signed_t chunk, int push_ws) {
769   typedef typename traits_t<T>::unsigned_t UT;
770 
771   int active;
772   kmp_info_t *th;
773   kmp_team_t *team;
774   kmp_uint32 my_buffer_index;
775   dispatch_private_info_template<T> *pr;
776   dispatch_shared_info_template<T> volatile *sh;
777 
778   KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
779                    sizeof(dispatch_private_info));
780   KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
781                    sizeof(dispatch_shared_info));
782 
783   if (!TCR_4(__kmp_init_parallel))
784     __kmp_parallel_initialize();
785 
786 #if OMP_50_ENABLED
787   __kmp_resume_if_soft_paused();
788 #endif
789 
790 #if INCLUDE_SSC_MARKS
791   SSC_MARK_DISPATCH_INIT();
792 #endif
793 #ifdef KMP_DEBUG
794   typedef typename traits_t<T>::signed_t ST;
795   {
796     char *buff;
797     // create format specifiers before the debug output
798     buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
799                             "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
800                             traits_t<ST>::spec, traits_t<T>::spec,
801                             traits_t<T>::spec, traits_t<ST>::spec);
802     KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
803     __kmp_str_free(&buff);
804   }
805 #endif
806   /* setup data */
807   th = __kmp_threads[gtid];
808   team = th->th.th_team;
809   active = !team->t.t_serialized;
810   th->th.th_ident = loc;
811 
  // Any half-decent optimizer will remove this test when the blocks are empty
  // since the macros expand to nothing when statistics are disabled.
815   if (schedule == __kmp_static) {
816     KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
817   } else {
818     KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
819   }
820 
821 #if KMP_USE_HIER_SCHED
  // Initialize the scheduling hierarchy if requested in the OMP_SCHEDULE
  // environment variable. Hierarchical scheduling does not work with ordered,
  // so if ordered is detected, revert to threaded scheduling.
825   bool ordered;
826   enum sched_type my_sched = schedule;
827   my_buffer_index = th->th.th_dispatch->th_disp_index;
828   pr = reinterpret_cast<dispatch_private_info_template<T> *>(
829       &th->th.th_dispatch
830            ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
831   my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
832   if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
833     my_sched =
834         (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
835   ordered = (kmp_ord_lower & my_sched);
836   if (pr->flags.use_hier) {
837     if (ordered) {
838       KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected.  "
839                      "Disabling hierarchical scheduling.\n",
840                      gtid));
841       pr->flags.use_hier = FALSE;
842     }
843   }
844   if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
845     // Don't use hierarchical for ordered parallel loops and don't
846     // use the runtime hierarchy if one was specified in the program
847     if (!ordered && !pr->flags.use_hier)
848       __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
849   }
850 #endif // KMP_USE_HIER_SCHED
851 
852 #if USE_ITT_BUILD
853   kmp_uint64 cur_chunk = chunk;
854   int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
855                                     __kmp_forkjoin_frames_mode == 3 &&
856                                     KMP_MASTER_GTID(gtid) &&
857 #if OMP_40_ENABLED
858                                     th->th.th_teams_microtask == NULL &&
859 #endif
860                                     team->t.t_active_level == 1;
861 #endif
862   if (!active) {
863     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
864         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
865   } else {
866     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
867                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
868 
869     my_buffer_index = th->th.th_dispatch->th_disp_index++;
870 
871     /* What happens when number of threads changes, need to resize buffer? */
872     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
873         &th->th.th_dispatch
874              ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
875     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
876         &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
877     KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
878                   my_buffer_index));
879   }
880 
881   __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
882 #if USE_ITT_BUILD
883                                 &cur_chunk,
884 #endif
885                                 chunk, (T)th->th.th_team_nproc,
886                                 (T)th->th.th_info.ds.ds_tid);
887   if (active) {
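    // Install the ordered entry/exit handlers: loops without the ordered
    // clause get the *_error stubs (used only for consistency checking),
    // while ordered loops get the real synchronization routines.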
888     if (pr->flags.ordered == 0) {
889       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
890       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
891     } else {
892       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
893       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
894     }
895   }
896 
897   if (active) {
    /* Wait until this buffer is free to use, i.e. until sh->buffer_index
     * equals my_buffer_index */
900 
901     KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
902                    "sh->buffer_index:%d\n",
903                    gtid, my_buffer_index, sh->buffer_index));
904     __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
905                            __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
906     // Note: KMP_WAIT() cannot be used there: buffer index and
907     // my_buffer_index are *always* 32-bit integers.
908     KMP_MB(); /* is this necessary? */
909     KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
910                    "sh->buffer_index:%d\n",
911                    gtid, my_buffer_index, sh->buffer_index));
912 
913     th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
914     th->th.th_dispatch->th_dispatch_sh_current =
915         CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
916 #if USE_ITT_BUILD
917     if (pr->flags.ordered) {
918       __kmp_itt_ordered_init(gtid);
919     }
920     // Report loop metadata
921     if (itt_need_metadata_reporting) {
922       // Only report metadata by master of active team at level 1
923       kmp_uint64 schedtype = 0;
924       switch (schedule) {
925       case kmp_sch_static_chunked:
926       case kmp_sch_static_balanced: // Chunk is calculated in the switch above
927         break;
928       case kmp_sch_static_greedy:
929         cur_chunk = pr->u.p.parm1;
930         break;
931       case kmp_sch_dynamic_chunked:
932         schedtype = 1;
933         break;
934       case kmp_sch_guided_iterative_chunked:
935       case kmp_sch_guided_analytical_chunked:
936 #if OMP_45_ENABLED
937       case kmp_sch_guided_simd:
938 #endif
939         schedtype = 2;
940         break;
941       default:
942         // Should we put this case under "static"?
943         // case kmp_sch_static_steal:
944         schedtype = 3;
945         break;
946       }
947       __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
948     }
949 #if KMP_USE_HIER_SCHED
950     if (pr->flags.use_hier) {
951       pr->u.p.count = 0;
952       pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
953     }
#endif // KMP_USE_HIER_SCHED
955 #endif /* USE_ITT_BUILD */
956   }
957 
958 #ifdef KMP_DEBUG
959   {
960     char *buff;
961     // create format specifiers before the debug output
962     buff = __kmp_str_format(
963         "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
964         "lb:%%%s ub:%%%s"
965         " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
966         " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
967         traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
968         traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
969         traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
970         traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
971     KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
972                   pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
973                   pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
974                   pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
975     __kmp_str_free(&buff);
976   }
977 #endif
978 #if (KMP_STATIC_STEAL_ENABLED)
  // It cannot be guaranteed that after execution of a loop with some other
  // schedule kind all the parm3 variables will contain the same value. Even if
  // all parm3 were the same, there would still be a bad case, such as the
  // value alternating between 0 and 1 instead of increasing over the program's
  // lifetime. So a dedicated variable is required; 'static_steal_counter' is
  // used for this.
984   if (schedule == kmp_sch_static_steal) {
    // Other threads will inspect this variable when searching for a victim.
    // This flag shows that, from now on, other threads may steal from this
    // thread.
988     volatile T *p = &pr->u.p.static_steal_counter;
989     *p = *p + 1;
990   }
991 #endif // ( KMP_STATIC_STEAL_ENABLED )
992 
993 #if OMPT_SUPPORT && OMPT_OPTIONAL
994   if (ompt_enabled.ompt_callback_work) {
995     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
996     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
997     ompt_callbacks.ompt_callback(ompt_callback_work)(
998         ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
999         &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
1000   }
1001 #endif
1002   KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
1003 }
1004 
1005 /* For ordered loops, either __kmp_dispatch_finish() should be called after
1006  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1007  * every chunk of iterations.  If the ordered section(s) were not executed
1008  * for this iteration (or every iteration in this chunk), we need to set the
1009  * ordered iteration counters so that the next thread can proceed. */
1010 template <typename UT>
1011 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
1012   typedef typename traits_t<UT>::signed_t ST;
1013   kmp_info_t *th = __kmp_threads[gtid];
1014 
1015   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1016   if (!th->th.th_team->t.t_serialized) {
1017 
1018     dispatch_private_info_template<UT> *pr =
1019         reinterpret_cast<dispatch_private_info_template<UT> *>(
1020             th->th.th_dispatch->th_dispatch_pr_current);
1021     dispatch_shared_info_template<UT> volatile *sh =
1022         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1023             th->th.th_dispatch->th_dispatch_sh_current);
1024     KMP_DEBUG_ASSERT(pr);
1025     KMP_DEBUG_ASSERT(sh);
1026     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1027                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1028 
1029     if (pr->ordered_bumped) {
1030       KD_TRACE(
1031           1000,
1032           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1033            gtid));
1034       pr->ordered_bumped = 0;
1035     } else {
1036       UT lower = pr->u.p.ordered_lower;
1037 
1038 #ifdef KMP_DEBUG
1039       {
1040         char *buff;
1041         // create format specifiers before the debug output
1042         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1043                                 "ordered_iteration:%%%s lower:%%%s\n",
1044                                 traits_t<UT>::spec, traits_t<UT>::spec);
1045         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1046         __kmp_str_free(&buff);
1047       }
1048 #endif
1049 
1050       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1051                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1052       KMP_MB(); /* is this necessary? */
1053 #ifdef KMP_DEBUG
1054       {
1055         char *buff;
1056         // create format specifiers before the debug output
1057         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1058                                 "ordered_iteration:%%%s lower:%%%s\n",
1059                                 traits_t<UT>::spec, traits_t<UT>::spec);
1060         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1061         __kmp_str_free(&buff);
1062       }
1063 #endif
1064 
1065       test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1066     } // if
1067   } // if
1068   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1069 }
1070 
1071 #ifdef KMP_GOMP_COMPAT
1072 
1073 template <typename UT>
1074 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1075   typedef typename traits_t<UT>::signed_t ST;
1076   kmp_info_t *th = __kmp_threads[gtid];
1077 
1078   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1079   if (!th->th.th_team->t.t_serialized) {
1080     //        int cid;
1081     dispatch_private_info_template<UT> *pr =
1082         reinterpret_cast<dispatch_private_info_template<UT> *>(
1083             th->th.th_dispatch->th_dispatch_pr_current);
1084     dispatch_shared_info_template<UT> volatile *sh =
1085         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1086             th->th.th_dispatch->th_dispatch_sh_current);
1087     KMP_DEBUG_ASSERT(pr);
1088     KMP_DEBUG_ASSERT(sh);
1089     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1090                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1091 
1092     //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1093     UT lower = pr->u.p.ordered_lower;
1094     UT upper = pr->u.p.ordered_upper;
1095     UT inc = upper - lower + 1;
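    // inc is the number of iterations in the current chunk.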
1096 
1097     if (pr->ordered_bumped == inc) {
1098       KD_TRACE(
1099           1000,
1100           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1101            gtid));
1102       pr->ordered_bumped = 0;
1103     } else {
1104       inc -= pr->ordered_bumped;
1105 
1106 #ifdef KMP_DEBUG
1107       {
1108         char *buff;
1109         // create format specifiers before the debug output
1110         buff = __kmp_str_format(
1111             "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1112             "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1113             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1114         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1115         __kmp_str_free(&buff);
1116       }
1117 #endif
1118 
1119       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1120                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1121 
1122       KMP_MB(); /* is this necessary? */
1123       KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1124                       "ordered_bumped to zero\n",
1125                       gtid));
1126       pr->ordered_bumped = 0;
      // TODO: check whether inc should be unsigned or signed
1128 #ifdef KMP_DEBUG
1129       {
1130         char *buff;
1131         // create format specifiers before the debug output
1132         buff = __kmp_str_format(
1133             "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1134             "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1135             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1136             traits_t<UT>::spec);
1137         KD_TRACE(1000,
1138                  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1139         __kmp_str_free(&buff);
1140       }
1141 #endif
1142 
1143       test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1144     }
1145     //        }
1146   }
1147   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1148 }
1149 
1150 #endif /* KMP_GOMP_COMPAT */
1151 
1152 template <typename T>
1153 int __kmp_dispatch_next_algorithm(int gtid,
1154                                   dispatch_private_info_template<T> *pr,
1155                                   dispatch_shared_info_template<T> volatile *sh,
1156                                   kmp_int32 *p_last, T *p_lb, T *p_ub,
1157                                   typename traits_t<T>::signed_t *p_st, T nproc,
1158                                   T tid) {
1159   typedef typename traits_t<T>::unsigned_t UT;
1160   typedef typename traits_t<T>::signed_t ST;
1161   typedef typename traits_t<T>::floating_t DBL;
1162   int status = 0;
1163   kmp_int32 last = 0;
1164   T start;
1165   ST incr;
1166   UT limit, trip, init;
1167   kmp_info_t *th = __kmp_threads[gtid];
1168   kmp_team_t *team = th->th.th_team;
1169 
1170   KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1171                    &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1172   KMP_DEBUG_ASSERT(pr);
1173   KMP_DEBUG_ASSERT(sh);
1174   KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1175 #ifdef KMP_DEBUG
1176   {
1177     char *buff;
1178     // create format specifiers before the debug output
1179     buff =
1180         __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1181                          "sh:%%p nproc:%%%s tid:%%%s\n",
1182                          traits_t<T>::spec, traits_t<T>::spec);
1183     KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1184     __kmp_str_free(&buff);
1185   }
1186 #endif
1187 
1188   // zero trip count
1189   if (pr->u.p.tc == 0) {
1190     KD_TRACE(10,
1191              ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1192               "zero status:%d\n",
1193               gtid, status));
1194     return 0;
1195   }
1196 
1197   switch (pr->schedule) {
1198 #if (KMP_STATIC_STEAL_ENABLED)
1199   case kmp_sch_static_steal: {
1200     T chunk = pr->u.p.parm1;
1201 
1202     KD_TRACE(100,
1203              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1204               gtid));
1205 
1206     trip = pr->u.p.tc - 1;
1207 
1208     if (traits_t<T>::type_size > 4) {
      // use a lock for an 8-byte induction variable and CAS for a 4-byte one.
      // TODO (optional): check and use 16-byte CAS
1211       kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
1212       KMP_DEBUG_ASSERT(lck != NULL);
1213       if (pr->u.p.count < (UT)pr->u.p.ub) {
1214         __kmp_acquire_lock(lck, gtid);
1215         // try to get own chunk of iterations
1216         init = (pr->u.p.count)++;
1217         status = (init < (UT)pr->u.p.ub);
1218         __kmp_release_lock(lck, gtid);
1219       } else {
1220         status = 0; // no own chunks
1221       }
1222       if (!status) { // try to steal
1223         kmp_info_t **other_threads = team->t.t_threads;
1224         int while_limit = pr->u.p.parm3;
1225         int while_index = 0;
        // TODO: the victim-search algorithm should be cleaned up and measured
1228         while ((!status) && (while_limit != ++while_index)) {
1229           T remaining;
1230           T victimIdx = pr->u.p.parm4;
1231           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1232           dispatch_private_info_template<T> *victim =
1233               reinterpret_cast<dispatch_private_info_template<T> *>(
1234                   other_threads[victimIdx]
1235                       ->th.th_dispatch->th_dispatch_pr_current);
1236           while ((victim == NULL || victim == pr ||
1237                   (*(volatile T *)&victim->u.p.static_steal_counter !=
1238                    *(volatile T *)&pr->u.p.static_steal_counter)) &&
1239                  oldVictimIdx != victimIdx) {
1240             victimIdx = (victimIdx + 1) % nproc;
1241             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1242                 other_threads[victimIdx]
1243                     ->th.th_dispatch->th_dispatch_pr_current);
1244           }
1245           if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1246                           *(volatile T *)&pr->u.p.static_steal_counter)) {
1247             continue; // try once more (nproc attempts in total)
1248             // no victim is ready yet to participate in stealing
1249             // because all victims are still in kmp_init_dispatch
1250           }
1251           if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1252             pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1253             continue; // not enough chunks to steal, goto next victim
1254           }
1255 
1256           lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1257           KMP_ASSERT(lck != NULL);
1258           __kmp_acquire_lock(lck, gtid);
1259           limit = victim->u.p.ub; // keep initial ub
1260           if (victim->u.p.count >= limit ||
1261               (remaining = limit - victim->u.p.count) < 2) {
1262             __kmp_release_lock(lck, gtid);
1263             pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1264             continue; // not enough chunks to steal
1265           }
          // stealing succeeded: reduce the victim's ub by 1/4 of the
          // remaining chunks, or by 1
1268           if (remaining > 3) {
1269             // steal 1/4 of remaining
1270             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1271             init = (victim->u.p.ub -= (remaining >> 2));
1272           } else {
1273             // steal 1 chunk of 2 or 3 remaining
1274             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1275             init = (victim->u.p.ub -= 1);
1276           }
1277           __kmp_release_lock(lck, gtid);
1278 
1279           KMP_DEBUG_ASSERT(init + 1 <= limit);
1280           pr->u.p.parm4 = victimIdx; // remember victim to steal from
1281           status = 1;
1282           while_index = 0;
          // now update own count and ub with the stolen range, excluding the
          // init chunk which is handed out right away
1284           __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1285           pr->u.p.count = init + 1;
1286           pr->u.p.ub = limit;
1287           __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1288         } // while (search for victim)
1289       } // if (try to find victim and steal)
1290     } else {
1291       // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1292       typedef union {
1293         struct {
1294           UT count;
1295           T ub;
1296         } p;
1297         kmp_int64 b;
1298       } union_i4;
      // All operations on 'count' and 'ub' must be performed atomically as a
      // pair.
1301       {
1302         union_i4 vold, vnew;
1303         vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1304         vnew = vold;
1305         vnew.p.count++;
1306         while (!KMP_COMPARE_AND_STORE_ACQ64(
1307             (volatile kmp_int64 *)&pr->u.p.count,
1308             *VOLATILE_CAST(kmp_int64 *) & vold.b,
1309             *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1310           KMP_CPU_PAUSE();
1311           vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1312           vnew = vold;
1313           vnew.p.count++;
1314         }
1315         vnew = vold;
1316         init = vnew.p.count;
1317         status = (init < (UT)vnew.p.ub);
1318       }
1319 
1320       if (!status) {
1321         kmp_info_t **other_threads = team->t.t_threads;
1322         int while_limit = pr->u.p.parm3;
1323         int while_index = 0;
1324 
        // TODO: the victim-search algorithm should be cleaned up and measured
1327         while ((!status) && (while_limit != ++while_index)) {
1328           union_i4 vold, vnew;
1329           kmp_int32 remaining;
1330           T victimIdx = pr->u.p.parm4;
1331           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1332           dispatch_private_info_template<T> *victim =
1333               reinterpret_cast<dispatch_private_info_template<T> *>(
1334                   other_threads[victimIdx]
1335                       ->th.th_dispatch->th_dispatch_pr_current);
1336           while ((victim == NULL || victim == pr ||
1337                   (*(volatile T *)&victim->u.p.static_steal_counter !=
1338                    *(volatile T *)&pr->u.p.static_steal_counter)) &&
1339                  oldVictimIdx != victimIdx) {
1340             victimIdx = (victimIdx + 1) % nproc;
1341             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1342                 other_threads[victimIdx]
1343                     ->th.th_dispatch->th_dispatch_pr_current);
1344           }
1345           if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1346                           *(volatile T *)&pr->u.p.static_steal_counter)) {
1347             continue; // try once more (nproc attempts in total)
1348             // no victim is ready yet to participate in stealing
1349             // because all victims are still in kmp_init_dispatch
1350           }
1351           pr->u.p.parm4 = victimIdx; // new victim found
1352           while (1) { // CAS loop if victim has enough chunks to steal
1353             vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1354             vnew = vold;
1355 
1356             KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1357             if (vnew.p.count >= (UT)vnew.p.ub ||
1358                 (remaining = vnew.p.ub - vnew.p.count) < 2) {
1359               pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1360               break; // not enough chunks to steal, goto next victim
1361             }
1362             if (remaining > 3) {
1363               vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
1364             } else {
1365               vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1366             }
1367             KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1368             // TODO: Should this be acquire or release?
1369             if (KMP_COMPARE_AND_STORE_ACQ64(
1370                     (volatile kmp_int64 *)&victim->u.p.count,
1371                     *VOLATILE_CAST(kmp_int64 *) & vold.b,
1372                     *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
              // stealing succeeded
1374               KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1375                                         vold.p.ub - vnew.p.ub);
1376               status = 1;
1377               while_index = 0;
1378               // now update own count and ub
1379               init = vnew.p.ub;
1380               vold.p.count = init + 1;
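              // Publish the new (count, ub) pair with one 64-bit store;
              // IA-32 needs XCHG because a plain 8-byte store is not
              // guaranteed to be atomic there.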
1381 #if KMP_ARCH_X86
1382               KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1383 #else
1384               *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1385 #endif
1386               break;
1387             } // if (check CAS result)
            KMP_CPU_PAUSE(); // CAS failed, repeat attempt
1389           } // while (try to steal from particular victim)
1390         } // while (search for victim)
1391       } // if (try to find victim and steal)
1392     } // if (4-byte induction variable)
1393     if (!status) {
1394       *p_lb = 0;
1395       *p_ub = 0;
1396       if (p_st != NULL)
1397         *p_st = 0;
1398     } else {
1399       start = pr->u.p.parm2;
1400       init *= chunk;
1401       limit = chunk + init - 1;
1402       incr = pr->u.p.st;
1403       KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1404 
1405       KMP_DEBUG_ASSERT(init <= trip);
1406       if ((last = (limit >= trip)) != 0)
1407         limit = trip;
1408       if (p_st != NULL)
1409         *p_st = incr;
1410 
1411       if (incr == 1) {
1412         *p_lb = start + init;
1413         *p_ub = start + limit;
1414       } else {
1415         *p_lb = start + init * incr;
1416         *p_ub = start + limit * incr;
1417       }
1418 
1419       if (pr->flags.ordered) {
1420         pr->u.p.ordered_lower = init;
1421         pr->u.p.ordered_upper = limit;
1422       } // if
1423     } // if
1424     break;
1425   } // case
1426 #endif // ( KMP_STATIC_STEAL_ENABLED )
1427   case kmp_sch_static_balanced: {
1428     KD_TRACE(
1429         10,
1430         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1431          gtid));
1432     /* check if thread has any iteration to do */
1433     if ((status = !pr->u.p.count) != 0) {
1434       pr->u.p.count = 1;
1435       *p_lb = pr->u.p.lb;
1436       *p_ub = pr->u.p.ub;
1437       last = pr->u.p.parm1;
1438       if (p_st != NULL)
1439         *p_st = pr->u.p.st;
1440     } else { /* no iterations to do */
1441       pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1442     }
1443   } // case
1444   break;
1445   case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1446                                  merged here */
1447   case kmp_sch_static_chunked: {
1448     T parm1;
1449 
1450     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1451                    "kmp_sch_static_[affinity|chunked] case\n",
1452                    gtid));
1453     parm1 = pr->u.p.parm1;
1454 
1455     trip = pr->u.p.tc - 1;
1456     init = parm1 * (pr->u.p.count + tid);
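    // Illustrative example (not from the original source): with chunk size
    // parm1 = 10, nproc = 4 and tid = 1, the first call computes
    // init = 10 * (0 + 1) = 10; after count += nproc below, the next call
    // computes init = 10 * (4 + 1) = 50, i.e. each thread takes every
    // nproc-th chunk in round-robin order.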
1457 
1458     if ((status = (init <= trip)) != 0) {
1459       start = pr->u.p.lb;
1460       incr = pr->u.p.st;
1461       limit = parm1 + init - 1;
1462 
1463       if ((last = (limit >= trip)) != 0)
1464         limit = trip;
1465 
1466       if (p_st != NULL)
1467         *p_st = incr;
1468 
1469       pr->u.p.count += nproc;
1470 
1471       if (incr == 1) {
1472         *p_lb = start + init;
1473         *p_ub = start + limit;
1474       } else {
1475         *p_lb = start + init * incr;
1476         *p_ub = start + limit * incr;
1477       }
1478 
1479       if (pr->flags.ordered) {
1480         pr->u.p.ordered_lower = init;
1481         pr->u.p.ordered_upper = limit;
1482       } // if
1483     } // if
1484   } // case
1485   break;
1486 
1487   case kmp_sch_dynamic_chunked: {
1488     T chunk = pr->u.p.parm1;
1489 
1490     KD_TRACE(
1491         100,
1492         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1493          gtid));
1494 
1495     init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1496     trip = pr->u.p.tc - 1;
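    // Illustrative example (not from the original source): sh->u.s.iteration
    // acts as a shared chunk counter. If this call atomically fetches the
    // value 2 with chunk = 5, this thread gets logical iterations 10..14
    // (init = 10, limit = 14), regardless of which threads took the earlier
    // chunks.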
1497 
1498     if ((status = (init <= trip)) == 0) {
1499       *p_lb = 0;
1500       *p_ub = 0;
1501       if (p_st != NULL)
1502         *p_st = 0;
1503     } else {
1504       start = pr->u.p.lb;
1505       limit = chunk + init - 1;
1506       incr = pr->u.p.st;
1507 
1508       if ((last = (limit >= trip)) != 0)
1509         limit = trip;
1510 
1511       if (p_st != NULL)
1512         *p_st = incr;
1513 
1514       if (incr == 1) {
1515         *p_lb = start + init;
1516         *p_ub = start + limit;
1517       } else {
1518         *p_lb = start + init * incr;
1519         *p_ub = start + limit * incr;
1520       }
1521 
1522       if (pr->flags.ordered) {
1523         pr->u.p.ordered_lower = init;
1524         pr->u.p.ordered_upper = limit;
1525       } // if
1526     } // if
1527   } // case
1528   break;
1529 
1530   case kmp_sch_guided_iterative_chunked: {
1531     T chunkspec = pr->u.p.parm1;
1532     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1533                    "iterative case\n",
1534                    gtid));
1535     trip = pr->u.p.tc;
1536     // Start atomic part of calculations
1537     while (1) {
1538       ST remaining; // signed, because can be < 0
1539       init = sh->u.s.iteration; // shared value
1540       remaining = trip - init;
1541       if (remaining <= 0) { // AC: need to compare with 0 first
1542         // nothing to do, don't try atomic op
1543         status = 0;
1544         break;
1545       }
1546       if ((T)remaining <
1547           pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
        // use dynamic-style schedule
        // atomically increment iterations, get old value
1550         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1551                                  (ST)chunkspec);
1552         remaining = trip - init;
1553         if (remaining <= 0) {
1554           status = 0; // all iterations got by other threads
1555         } else {
1556           // got some iterations to work on
1557           status = 1;
1558           if ((T)remaining > chunkspec) {
1559             limit = init + chunkspec - 1;
1560           } else {
1561             last = 1; // the last chunk
1562             limit = init + remaining - 1;
1563           } // if
1564         } // if
1565         break;
1566       } // if
1567       limit = init +
1568               (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
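      // Sketch of the intent (assuming parm3 caches the factor 1/(K*nproc),
      // K = 2 by default, as the surrounding comments indicate): each thread
      // tries to claim about remaining/(K*nproc) iterations with one CAS, so
      // chunks shrink as the remaining work shrinks. For example, with
      // trip = 1000 and nproc = 4 the first successful claim is roughly
      // 1000 / 8 = 125 iterations.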
1569       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1570                                (ST)init, (ST)limit)) {
1571         // CAS was successful, chunk obtained
1572         status = 1;
1573         --limit;
1574         break;
1575       } // if
1576     } // while
1577     if (status != 0) {
1578       start = pr->u.p.lb;
1579       incr = pr->u.p.st;
1580       if (p_st != NULL)
1581         *p_st = incr;
1582       *p_lb = start + init * incr;
1583       *p_ub = start + limit * incr;
1584       if (pr->flags.ordered) {
1585         pr->u.p.ordered_lower = init;
1586         pr->u.p.ordered_upper = limit;
1587       } // if
1588     } else {
1589       *p_lb = 0;
1590       *p_ub = 0;
1591       if (p_st != NULL)
1592         *p_st = 0;
1593     } // if
1594   } // case
1595   break;
1596 
1597 #if OMP_45_ENABLED
1598   case kmp_sch_guided_simd: {
    // same as the iterative case, but the current chunk is adjusted to be a
    // multiple of the given chunk
1601     T chunk = pr->u.p.parm1;
1602     KD_TRACE(100,
1603              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1604               gtid));
1605     trip = pr->u.p.tc;
1606     // Start atomic part of calculations
1607     while (1) {
1608       ST remaining; // signed, because can be < 0
1609       init = sh->u.s.iteration; // shared value
1610       remaining = trip - init;
1611       if (remaining <= 0) { // AC: need to compare with 0 first
1612         status = 0; // nothing to do, don't try atomic op
1613         break;
1614       }
1615       KMP_DEBUG_ASSERT(init % chunk == 0);
1616       // compare with K*nproc*(chunk+1), K=2 by default
1617       if ((T)remaining < pr->u.p.parm2) {
        // use dynamic-style schedule
        // atomically increment iterations, get old value
1620         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1621                                  (ST)chunk);
1622         remaining = trip - init;
1623         if (remaining <= 0) {
1624           status = 0; // all iterations got by other threads
1625         } else {
1626           // got some iterations to work on
1627           status = 1;
1628           if ((T)remaining > chunk) {
1629             limit = init + chunk - 1;
1630           } else {
1631             last = 1; // the last chunk
1632             limit = init + remaining - 1;
1633           } // if
1634         } // if
1635         break;
1636       } // if
1637       // divide by K*nproc
1638       UT span = remaining * (*(double *)&pr->u.p.parm3);
1639       UT rem = span % chunk;
1640       if (rem) // adjust so that span%chunk == 0
1641         span += chunk - rem;
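      // Illustrative example (not from the original source): with chunk = 8
      // and a computed span of 29, rem = 29 % 8 = 5 and span is rounded up to
      // 32, keeping every claimed sub-range a multiple of chunk as this SIMD
      // variant requires (see the KMP_DEBUG_ASSERT above).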
1642       limit = init + span;
1643       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1644                                (ST)init, (ST)limit)) {
1645         // CAS was successful, chunk obtained
1646         status = 1;
1647         --limit;
1648         break;
1649       } // if
1650     } // while
1651     if (status != 0) {
1652       start = pr->u.p.lb;
1653       incr = pr->u.p.st;
1654       if (p_st != NULL)
1655         *p_st = incr;
1656       *p_lb = start + init * incr;
1657       *p_ub = start + limit * incr;
1658       if (pr->flags.ordered) {
1659         pr->u.p.ordered_lower = init;
1660         pr->u.p.ordered_upper = limit;
1661       } // if
1662     } else {
1663       *p_lb = 0;
1664       *p_ub = 0;
1665       if (p_st != NULL)
1666         *p_st = 0;
1667     } // if
1668   } // case
1669   break;
1670 #endif // OMP_45_ENABLED
1671 
1672   case kmp_sch_guided_analytical_chunked: {
1673     T chunkspec = pr->u.p.parm1;
1674     UT chunkIdx;
1675 #if KMP_USE_X87CONTROL
1676     /* for storing original FPCW value for Windows* OS on
1677        IA-32 architecture 8-byte version */
1678     unsigned int oldFpcw;
1679     unsigned int fpcwSet = 0;
1680 #endif
1681     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1682                    "kmp_sch_guided_analytical_chunked case\n",
1683                    gtid));
1684 
1685     trip = pr->u.p.tc;
1686 
1687     KMP_DEBUG_ASSERT(nproc > 1);
1688     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1689 
1690     while (1) { /* this while loop is a safeguard against unexpected zero
1691                    chunk sizes */
1692       chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1693       if (chunkIdx >= (UT)pr->u.p.parm2) {
1694         --trip;
1695         /* use dynamic-style scheduling */
1696         init = chunkIdx * chunkspec + pr->u.p.count;
1697         /* need to verify init > 0 in case of overflow in the above
1698          * calculation */
1699         if ((status = (init > 0 && init <= trip)) != 0) {
1700           limit = init + chunkspec - 1;
1701 
1702           if ((last = (limit >= trip)) != 0)
1703             limit = trip;
1704         }
1705         break;
1706       } else {
1707 /* use exponential-style scheduling */
/* The following check works around the lack of long double precision on
   Windows* OS, which could otherwise have the effect that init != 0 for
   chunkIdx == 0. */
1712 #if KMP_USE_X87CONTROL
1713         /* If we haven't already done so, save original
1714            FPCW and set precision to 64-bit, as Windows* OS
1715            on IA-32 architecture defaults to 53-bit */
1716         if (!fpcwSet) {
1717           oldFpcw = _control87(0, 0);
1718           _control87(_PC_64, _MCW_PC);
1719           fpcwSet = 0x30000;
1720         }
1721 #endif
1722         if (chunkIdx) {
1723           init = __kmp_dispatch_guided_remaining<T>(
1724               trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1725           KMP_DEBUG_ASSERT(init);
1726           init = trip - init;
1727         } else
1728           init = 0;
1729         limit = trip - __kmp_dispatch_guided_remaining<T>(
1730                            trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
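        // Interpretation (not from the source comments):
        // __kmp_dispatch_guided_remaining(trip, base, k) estimates how many
        // iterations are still left when chunk k is handed out, so chunk
        // chunkIdx spans the half-open interval
        // [trip - remaining(chunkIdx), trip - remaining(chunkIdx + 1)), and
        // init/limit come from two evaluations of that function.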
1731         KMP_ASSERT(init <= limit);
1732         if (init < limit) {
1733           KMP_DEBUG_ASSERT(limit <= trip);
1734           --limit;
1735           status = 1;
1736           break;
1737         } // if
1738       } // if
1739     } // while (1)
1740 #if KMP_USE_X87CONTROL
1741     /* restore FPCW if necessary
1742        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1743     */
1744     if (fpcwSet && (oldFpcw & fpcwSet))
1745       _control87(oldFpcw, _MCW_PC);
1746 #endif
1747     if (status != 0) {
1748       start = pr->u.p.lb;
1749       incr = pr->u.p.st;
1750       if (p_st != NULL)
1751         *p_st = incr;
1752       *p_lb = start + init * incr;
1753       *p_ub = start + limit * incr;
1754       if (pr->flags.ordered) {
1755         pr->u.p.ordered_lower = init;
1756         pr->u.p.ordered_upper = limit;
1757       }
1758     } else {
1759       *p_lb = 0;
1760       *p_ub = 0;
1761       if (p_st != NULL)
1762         *p_st = 0;
1763     }
1764   } // case
1765   break;
1766 
1767   case kmp_sch_trapezoidal: {
1768     UT index;
1769     T parm2 = pr->u.p.parm2;
1770     T parm3 = pr->u.p.parm3;
1771     T parm4 = pr->u.p.parm4;
1772     KD_TRACE(100,
1773              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1774               gtid));
1775 
1776     index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1777 
1778     init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
1779     trip = pr->u.p.tc - 1;
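    // Illustrative example (assumption: parm2 is the first chunk size and
    // parm4 the per-chunk decrement, as arranged at init time): with
    // parm2 = 10 and parm4 = 2 the chunk sizes are 10, 8, 6, ..., so for
    // index = 3 the closed form gives init = 3 * (20 - 2 * 2) / 2 = 24,
    // exactly the 10 + 8 + 6 iterations already handed out.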
1780 
1781     if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1782       *p_lb = 0;
1783       *p_ub = 0;
1784       if (p_st != NULL)
1785         *p_st = 0;
1786     } else {
1787       start = pr->u.p.lb;
1788       limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1789       incr = pr->u.p.st;
1790 
1791       if ((last = (limit >= trip)) != 0)
1792         limit = trip;
1793 
1794       if (p_st != NULL)
1795         *p_st = incr;
1796 
1797       if (incr == 1) {
1798         *p_lb = start + init;
1799         *p_ub = start + limit;
1800       } else {
1801         *p_lb = start + init * incr;
1802         *p_ub = start + limit * incr;
1803       }
1804 
1805       if (pr->flags.ordered) {
1806         pr->u.p.ordered_lower = init;
1807         pr->u.p.ordered_upper = limit;
1808       } // if
1809     } // if
1810   } // case
1811   break;
1812   default: {
1813     status = 0; // to avoid complaints on uninitialized variable use
1814     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1815                 KMP_HNT(GetNewerLibrary), // Hint
1816                 __kmp_msg_null // Variadic argument list terminator
1817                 );
1818   } break;
1819   } // switch
1820   if (p_last)
1821     *p_last = last;
1822 #ifdef KMP_DEBUG
1823   if (pr->flags.ordered) {
1824     char *buff;
1825     // create format specifiers before the debug output
1826     buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1827                             "ordered_lower:%%%s ordered_upper:%%%s\n",
1828                             traits_t<UT>::spec, traits_t<UT>::spec);
1829     KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1830     __kmp_str_free(&buff);
1831   }
1832   {
1833     char *buff;
1834     // create format specifiers before the debug output
1835     buff = __kmp_str_format(
1836         "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1837         "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1838         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1839     KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1840     __kmp_str_free(&buff);
1841   }
1842 #endif
1843   return status;
1844 }
1845 
1846 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1847    work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1848    is not called. */
1849 #if OMPT_SUPPORT && OMPT_OPTIONAL
1850 #define OMPT_LOOP_END                                                          \
1851   if (status == 0) {                                                           \
1852     if (ompt_enabled.ompt_callback_work) {                                     \
1853       ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
1854       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
1855       ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
1856           ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
1857           &(task_info->task_data), 0, codeptr);                                \
1858     }                                                                          \
1859   }
1860 // TODO: implement count
1861 #else
1862 #define OMPT_LOOP_END // no-op
1863 #endif
1864 
1865 #if KMP_STATS_ENABLED
1866 #define KMP_STATS_LOOP_END                                                     \
1867   {                                                                            \
1868     kmp_int64 u, l, t, i;                                                      \
1869     l = (kmp_int64)(*p_lb);                                                    \
1870     u = (kmp_int64)(*p_ub);                                                    \
1871     i = (kmp_int64)(pr->u.p.st);                                               \
1872     if (status == 0) {                                                         \
1873       t = 0;                                                                   \
1874       KMP_POP_PARTITIONED_TIMER();                                             \
1875     } else if (i == 1) {                                                       \
1876       if (u >= l)                                                              \
1877         t = u - l + 1;                                                         \
1878       else                                                                     \
1879         t = 0;                                                                 \
1880     } else if (i < 0) {                                                        \
1881       if (l >= u)                                                              \
1882         t = (l - u) / (-i) + 1;                                                \
1883       else                                                                     \
1884         t = 0;                                                                 \
1885     } else {                                                                   \
1886       if (u >= l)                                                              \
1887         t = (u - l) / i + 1;                                                   \
1888       else                                                                     \
1889         t = 0;                                                                 \
1890     }                                                                          \
1891     KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
1892   }
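// Illustrative example (not from the original source): for a chunk with
// *p_lb = 0, *p_ub = 9 and stride 2, the formula above counts
// t = (9 - 0) / 2 + 1 = 5 iterations; a negative stride uses the mirrored
// (l - u) / (-i) + 1 form.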
1893 #else
1894 #define KMP_STATS_LOOP_END /* Nothing */
1895 #endif
1896 
1897 template <typename T>
1898 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1899                                T *p_lb, T *p_ub,
1900                                typename traits_t<T>::signed_t *p_st
1901 #if OMPT_SUPPORT && OMPT_OPTIONAL
1902                                ,
1903                                void *codeptr
1904 #endif
1905                                ) {
1906 
1907   typedef typename traits_t<T>::unsigned_t UT;
1908   typedef typename traits_t<T>::signed_t ST;
  // This is potentially slightly misleading: schedule(runtime) will appear
  // here even if the actual runtime schedule is static. (Which points out a
  // disadvantage of schedule(runtime): even when static scheduling is used it
  // costs more than a compile-time choice to use static scheduling would.)
1913   KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
1914 
1915   int status;
1916   dispatch_private_info_template<T> *pr;
1917   kmp_info_t *th = __kmp_threads[gtid];
1918   kmp_team_t *team = th->th.th_team;
1919 
1920   KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1921   KD_TRACE(
1922       1000,
1923       ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
1924        gtid, p_lb, p_ub, p_st, p_last));
1925 
1926   if (team->t.t_serialized) {
    /* NOTE: serialize this dispatch because we are not at the active level */
1928     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1929         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1930     KMP_DEBUG_ASSERT(pr);
1931 
1932     if ((status = (pr->u.p.tc != 0)) == 0) {
1933       *p_lb = 0;
1934       *p_ub = 0;
1935       //            if ( p_last != NULL )
1936       //                *p_last = 0;
1937       if (p_st != NULL)
1938         *p_st = 0;
1939       if (__kmp_env_consistency_check) {
1940         if (pr->pushed_ws != ct_none) {
1941           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1942         }
1943       }
1944     } else if (pr->flags.nomerge) {
1945       kmp_int32 last;
1946       T start;
1947       UT limit, trip, init;
1948       ST incr;
1949       T chunk = pr->u.p.parm1;
1950 
1951       KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1952                      gtid));
1953 
1954       init = chunk * pr->u.p.count++;
1955       trip = pr->u.p.tc - 1;
1956 
1957       if ((status = (init <= trip)) == 0) {
1958         *p_lb = 0;
1959         *p_ub = 0;
1960         //                if ( p_last != NULL )
1961         //                    *p_last = 0;
1962         if (p_st != NULL)
1963           *p_st = 0;
1964         if (__kmp_env_consistency_check) {
1965           if (pr->pushed_ws != ct_none) {
1966             pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1967           }
1968         }
1969       } else {
1970         start = pr->u.p.lb;
1971         limit = chunk + init - 1;
1972         incr = pr->u.p.st;
1973 
1974         if ((last = (limit >= trip)) != 0) {
1975           limit = trip;
1976 #if KMP_OS_WINDOWS
1977           pr->u.p.last_upper = pr->u.p.ub;
1978 #endif /* KMP_OS_WINDOWS */
1979         }
1980         if (p_last != NULL)
1981           *p_last = last;
1982         if (p_st != NULL)
1983           *p_st = incr;
1984         if (incr == 1) {
1985           *p_lb = start + init;
1986           *p_ub = start + limit;
1987         } else {
1988           *p_lb = start + init * incr;
1989           *p_ub = start + limit * incr;
1990         }
1991 
1992         if (pr->flags.ordered) {
1993           pr->u.p.ordered_lower = init;
1994           pr->u.p.ordered_upper = limit;
1995 #ifdef KMP_DEBUG
1996           {
1997             char *buff;
1998             // create format specifiers before the debug output
1999             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2000                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
2001                                     traits_t<UT>::spec, traits_t<UT>::spec);
2002             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2003                             pr->u.p.ordered_upper));
2004             __kmp_str_free(&buff);
2005           }
2006 #endif
2007         } // if
2008       } // if
2009     } else {
2010       pr->u.p.tc = 0;
2011       *p_lb = pr->u.p.lb;
2012       *p_ub = pr->u.p.ub;
2013 #if KMP_OS_WINDOWS
2014       pr->u.p.last_upper = *p_ub;
2015 #endif /* KMP_OS_WINDOWS */
2016       if (p_last != NULL)
2017         *p_last = TRUE;
2018       if (p_st != NULL)
2019         *p_st = pr->u.p.st;
2020     } // if
2021 #ifdef KMP_DEBUG
2022     {
2023       char *buff;
2024       // create format specifiers before the debug output
2025       buff = __kmp_str_format(
2026           "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2027           "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
2028           traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2029       KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
2030       __kmp_str_free(&buff);
2031     }
2032 #endif
2033 #if INCLUDE_SSC_MARKS
2034     SSC_MARK_DISPATCH_NEXT();
2035 #endif
2036     OMPT_LOOP_END;
2037     KMP_STATS_LOOP_END;
2038     return status;
2039   } else {
2040     kmp_int32 last = 0;
2041     dispatch_shared_info_template<T> volatile *sh;
2042 
2043     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2044                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2045 
2046     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2047         th->th.th_dispatch->th_dispatch_pr_current);
2048     KMP_DEBUG_ASSERT(pr);
2049     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2050         th->th.th_dispatch->th_dispatch_sh_current);
2051     KMP_DEBUG_ASSERT(sh);
2052 
2053 #if KMP_USE_HIER_SCHED
2054     if (pr->flags.use_hier)
2055       status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2056     else
2057 #endif // KMP_USE_HIER_SCHED
2058       status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2059                                                 p_st, th->th.th_team_nproc,
2060                                                 th->th.th_info.ds.ds_tid);
2061     // status == 0: no more iterations to execute
2062     if (status == 0) {
2063       UT num_done;
2064 
2065       num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2066 #ifdef KMP_DEBUG
2067       {
2068         char *buff;
2069         // create format specifiers before the debug output
2070         buff = __kmp_str_format(
2071             "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2072             traits_t<UT>::spec);
2073         KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2074         __kmp_str_free(&buff);
2075       }
2076 #endif
2077 
2078 #if KMP_USE_HIER_SCHED
2079       pr->flags.use_hier = FALSE;
2080 #endif
2081       if ((ST)num_done == th->th.th_team_nproc - 1) {
2082 #if (KMP_STATIC_STEAL_ENABLED)
2083         if (pr->schedule == kmp_sch_static_steal &&
2084             traits_t<T>::type_size > 4) {
2085           int i;
2086           kmp_info_t **other_threads = team->t.t_threads;
2087           // loop complete, safe to destroy locks used for stealing
2088           for (i = 0; i < th->th.th_team_nproc; ++i) {
2089             kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2090             KMP_ASSERT(lck != NULL);
2091             __kmp_destroy_lock(lck);
2092             __kmp_free(lck);
2093             other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2094           }
2095         }
2096 #endif
2097         /* NOTE: release this buffer to be reused */
2098 
2099         KMP_MB(); /* Flush all pending memory write invalidates.  */
2100 
2101         sh->u.s.num_done = 0;
2102         sh->u.s.iteration = 0;
2103 
2104         /* TODO replace with general release procedure? */
2105         if (pr->flags.ordered) {
2106           sh->u.s.ordered_iteration = 0;
2107         }
2108 
2109         KMP_MB(); /* Flush all pending memory write invalidates.  */
2110 
2111         sh->buffer_index += __kmp_dispatch_num_buffers;
2112         KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2113                        gtid, sh->buffer_index));
2114 
2115         KMP_MB(); /* Flush all pending memory write invalidates.  */
2116 
2117       } // if
2118       if (__kmp_env_consistency_check) {
2119         if (pr->pushed_ws != ct_none) {
2120           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2121         }
2122       }
2123 
2124       th->th.th_dispatch->th_deo_fcn = NULL;
2125       th->th.th_dispatch->th_dxo_fcn = NULL;
2126       th->th.th_dispatch->th_dispatch_sh_current = NULL;
2127       th->th.th_dispatch->th_dispatch_pr_current = NULL;
2128     } // if (status == 0)
2129 #if KMP_OS_WINDOWS
2130     else if (last) {
2131       pr->u.p.last_upper = pr->u.p.ub;
2132     }
2133 #endif /* KMP_OS_WINDOWS */
2134     if (p_last != NULL && status != 0)
2135       *p_last = last;
2136   } // if
2137 
2138 #ifdef KMP_DEBUG
2139   {
2140     char *buff;
2141     // create format specifiers before the debug output
2142     buff = __kmp_str_format(
2143         "__kmp_dispatch_next: T#%%d normal case: "
2144         "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2145         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2146     KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2147                   (p_last ? *p_last : 0), status));
2148     __kmp_str_free(&buff);
2149   }
2150 #endif
2151 #if INCLUDE_SSC_MARKS
2152   SSC_MARK_DISPATCH_NEXT();
2153 #endif
2154   OMPT_LOOP_END;
2155   KMP_STATS_LOOP_END;
2156   return status;
2157 }
2158 
2159 template <typename T>
2160 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2161                                   kmp_int32 *plastiter, T *plower, T *pupper,
2162                                   typename traits_t<T>::signed_t incr) {
2163   typedef typename traits_t<T>::unsigned_t UT;
2164   kmp_uint32 team_id;
2165   kmp_uint32 nteams;
2166   UT trip_count;
2167   kmp_team_t *team;
2168   kmp_info_t *th;
2169 
2170   KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2171   KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2172 #ifdef KMP_DEBUG
2173   typedef typename traits_t<T>::signed_t ST;
2174   {
2175     char *buff;
2176     // create format specifiers before the debug output
2177     buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2178                             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2179                             traits_t<T>::spec, traits_t<T>::spec,
2180                             traits_t<ST>::spec, traits_t<T>::spec);
2181     KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2182     __kmp_str_free(&buff);
2183   }
2184 #endif
2185 
2186   if (__kmp_env_consistency_check) {
2187     if (incr == 0) {
2188       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2189                             loc);
2190     }
2191     if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2192       // The loop is illegal.
      // Some zero-trip loops are maintained by the compiler, e.g.:
2194       //   for(i=10;i<0;++i) // lower >= upper - run-time check
2195       //   for(i=0;i>10;--i) // lower <= upper - run-time check
2196       //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2197       //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2198       // Compiler does not check the following illegal loops:
2199       //   for(i=0;i<10;i+=incr) // where incr<0
2200       //   for(i=10;i>0;i-=incr) // where incr<0
2201       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2202     }
2203   }
2204   th = __kmp_threads[gtid];
2205   team = th->th.th_team;
2206 #if OMP_40_ENABLED
2207   KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2208   nteams = th->th.th_teams_size.nteams;
2209 #endif
2210   team_id = team->t.t_master_tid;
2211   KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2212 
2213   // compute global trip count
2214   if (incr == 1) {
2215     trip_count = *pupper - *plower + 1;
2216   } else if (incr == -1) {
2217     trip_count = *plower - *pupper + 1;
2218   } else if (incr > 0) {
2219     // upper-lower can exceed the limit of signed type
2220     trip_count = (UT)(*pupper - *plower) / incr + 1;
2221   } else {
2222     trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2223   }
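  // Illustrative example (not from the original source): for loop bounds
  // lower = 0, upper = 9 with incr = 2, the general branch gives
  // trip_count = (9 - 0) / 2 + 1 = 5; the incr == 1 and incr == -1 special
  // cases simply avoid the division.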
2224 
2225   if (trip_count <= nteams) {
2226     KMP_DEBUG_ASSERT(
2227         __kmp_static == kmp_sch_static_greedy ||
2228         __kmp_static ==
2229             kmp_sch_static_balanced); // Unknown static scheduling type.
    // only some teams get a single iteration, the others get nothing
2231     if (team_id < trip_count) {
2232       *pupper = *plower = *plower + team_id * incr;
2233     } else {
2234       *plower = *pupper + incr; // zero-trip loop
2235     }
2236     if (plastiter != NULL)
2237       *plastiter = (team_id == trip_count - 1);
2238   } else {
2239     if (__kmp_static == kmp_sch_static_balanced) {
2240       UT chunk = trip_count / nteams;
2241       UT extras = trip_count % nteams;
2242       *plower +=
2243           incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2244       *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
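      // Illustrative example (not from the original source): trip_count = 10
      // over nteams = 3 gives chunk = 3 and extras = 1, so team 0 receives 4
      // iterations while teams 1 and 2 receive 3 each; the extra iterations
      // always go to the lowest-numbered teams.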
2245       if (plastiter != NULL)
2246         *plastiter = (team_id == nteams - 1);
2247     } else {
2248       T chunk_inc_count =
2249           (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2250       T upper = *pupper;
2251       KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2252       // Unknown static scheduling type.
2253       *plower += team_id * chunk_inc_count;
2254       *pupper = *plower + chunk_inc_count - incr;
2255       // Check/correct bounds if needed
2256       if (incr > 0) {
2257         if (*pupper < *plower)
2258           *pupper = traits_t<T>::max_value;
2259         if (plastiter != NULL)
2260           *plastiter = *plower <= upper && *pupper > upper - incr;
2261         if (*pupper > upper)
2262           *pupper = upper; // tracker C73258
2263       } else {
2264         if (*pupper > *plower)
2265           *pupper = traits_t<T>::min_value;
2266         if (plastiter != NULL)
2267           *plastiter = *plower >= upper && *pupper < upper - incr;
2268         if (*pupper < upper)
2269           *pupper = upper; // tracker C73258
2270       }
2271     }
2272   }
2273 }
2274 
2275 //-----------------------------------------------------------------------------
2276 // Dispatch routines
2277 //    Transfer call to template< type T >
2278 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2279 //                         T lb, T ub, ST st, ST chunk )
2280 extern "C" {
2281 
2282 /*!
2283 @ingroup WORK_SHARING
2284 @{
2285 @param loc Source location
2286 @param gtid Global thread id
2287 @param schedule Schedule type
2288 @param lb  Lower bound
2289 @param ub  Upper bound
2290 @param st  Step (or increment if you prefer)
2291 @param chunk The chunk size to block with
2292 
2293 This function prepares the runtime to start a dynamically scheduled for loop,
2294 saving the loop arguments.
2295 These functions are all identical apart from the types of the arguments.
2296 */
2297 
2298 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2299                             enum sched_type schedule, kmp_int32 lb,
2300                             kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2301   KMP_DEBUG_ASSERT(__kmp_init_serial);
2302 #if OMPT_SUPPORT && OMPT_OPTIONAL
2303   OMPT_STORE_RETURN_ADDRESS(gtid);
2304 #endif
2305   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2306 }
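
/* A minimal lowering sketch (an assumption for illustration, not actual
   compiler output) of how "#pragma omp for schedule(dynamic, 4)" over
   i in [0, N) might map onto these entry points:

     __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, 0, N - 1, 1, 4);
     kmp_int32 last, lb, ub, st;
     while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
       for (kmp_int32 i = lb; i <= ub; i += st)
         body(i);
     }

   Each __kmpc_dispatch_next_4 call hands back one inclusive [lb, ub] chunk
   until the iteration space is exhausted. */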
2307 /*!
2308 See @ref __kmpc_dispatch_init_4
2309 */
2310 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2311                              enum sched_type schedule, kmp_uint32 lb,
2312                              kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2313   KMP_DEBUG_ASSERT(__kmp_init_serial);
2314 #if OMPT_SUPPORT && OMPT_OPTIONAL
2315   OMPT_STORE_RETURN_ADDRESS(gtid);
2316 #endif
2317   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2318 }
2319 
2320 /*!
2321 See @ref __kmpc_dispatch_init_4
2322 */
2323 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2324                             enum sched_type schedule, kmp_int64 lb,
2325                             kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2326   KMP_DEBUG_ASSERT(__kmp_init_serial);
2327 #if OMPT_SUPPORT && OMPT_OPTIONAL
2328   OMPT_STORE_RETURN_ADDRESS(gtid);
2329 #endif
2330   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2331 }
2332 
2333 /*!
2334 See @ref __kmpc_dispatch_init_4
2335 */
2336 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2337                              enum sched_type schedule, kmp_uint64 lb,
2338                              kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2339   KMP_DEBUG_ASSERT(__kmp_init_serial);
2340 #if OMPT_SUPPORT && OMPT_OPTIONAL
2341   OMPT_STORE_RETURN_ADDRESS(gtid);
2342 #endif
2343   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2344 }
2345 
2346 /*!
2347 See @ref __kmpc_dispatch_init_4
2348 
These functions differ from the __kmpc_dispatch_init set in that they are
called for the composite distribute parallel for construct, so the per-team
iteration space must be computed before the regular iterations are dispatched.
2352 
2353 These functions are all identical apart from the types of the arguments.
2354 */
2355 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2356                                  enum sched_type schedule, kmp_int32 *p_last,
2357                                  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2358                                  kmp_int32 chunk) {
2359   KMP_DEBUG_ASSERT(__kmp_init_serial);
2360 #if OMPT_SUPPORT && OMPT_OPTIONAL
2361   OMPT_STORE_RETURN_ADDRESS(gtid);
2362 #endif
2363   __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2364   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2365 }
2366 
2367 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2368                                   enum sched_type schedule, kmp_int32 *p_last,
2369                                   kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2370                                   kmp_int32 chunk) {
2371   KMP_DEBUG_ASSERT(__kmp_init_serial);
2372 #if OMPT_SUPPORT && OMPT_OPTIONAL
2373   OMPT_STORE_RETURN_ADDRESS(gtid);
2374 #endif
2375   __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2376   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2377 }
2378 
2379 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2380                                  enum sched_type schedule, kmp_int32 *p_last,
2381                                  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2382                                  kmp_int64 chunk) {
2383   KMP_DEBUG_ASSERT(__kmp_init_serial);
2384 #if OMPT_SUPPORT && OMPT_OPTIONAL
2385   OMPT_STORE_RETURN_ADDRESS(gtid);
2386 #endif
2387   __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2388   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2389 }
2390 
2391 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2392                                   enum sched_type schedule, kmp_int32 *p_last,
2393                                   kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2394                                   kmp_int64 chunk) {
2395   KMP_DEBUG_ASSERT(__kmp_init_serial);
2396 #if OMPT_SUPPORT && OMPT_OPTIONAL
2397   OMPT_STORE_RETURN_ADDRESS(gtid);
2398 #endif
2399   __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2400   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2401 }
2402 
2403 /*!
2404 @param loc Source code location
2405 @param gtid Global thread id
2406 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2407 otherwise
2408 @param p_lb   Pointer to the lower bound for the next chunk of work
2409 @param p_ub   Pointer to the upper bound for the next chunk of work
2410 @param p_st   Pointer to the stride for the next chunk of work
2411 @return one if there is work to be done, zero otherwise
2412 
2413 Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then lb, ub and stride need not be modified.
2415 */
2416 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2417                            kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2418 #if OMPT_SUPPORT && OMPT_OPTIONAL
2419   OMPT_STORE_RETURN_ADDRESS(gtid);
2420 #endif
2421   return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2422 #if OMPT_SUPPORT && OMPT_OPTIONAL
2423                                         ,
2424                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2425 #endif
2426                                             );
2427 }
2428 
2429 /*!
2430 See @ref __kmpc_dispatch_next_4
2431 */
2432 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2433                             kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2434                             kmp_int32 *p_st) {
2435 #if OMPT_SUPPORT && OMPT_OPTIONAL
2436   OMPT_STORE_RETURN_ADDRESS(gtid);
2437 #endif
2438   return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2439 #if OMPT_SUPPORT && OMPT_OPTIONAL
2440                                          ,
2441                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2442 #endif
2443                                              );
2444 }
2445 
2446 /*!
2447 See @ref __kmpc_dispatch_next_4
2448 */
2449 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2450                            kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2451 #if OMPT_SUPPORT && OMPT_OPTIONAL
2452   OMPT_STORE_RETURN_ADDRESS(gtid);
2453 #endif
2454   return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2455 #if OMPT_SUPPORT && OMPT_OPTIONAL
2456                                         ,
2457                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2458 #endif
2459                                             );
2460 }
2461 
2462 /*!
2463 See @ref __kmpc_dispatch_next_4
2464 */
2465 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2466                             kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2467                             kmp_int64 *p_st) {
2468 #if OMPT_SUPPORT && OMPT_OPTIONAL
2469   OMPT_STORE_RETURN_ADDRESS(gtid);
2470 #endif
2471   return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2472 #if OMPT_SUPPORT && OMPT_OPTIONAL
2473                                          ,
2474                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2475 #endif
2476                                              );
2477 }
2478 
2479 /*!
2480 @param loc Source code location
2481 @param gtid Global thread id
2482 
2483 Mark the end of a dynamic loop.
2484 */
2485 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2486   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2487 }
2488 
2489 /*!
2490 See @ref __kmpc_dispatch_fini_4
2491 */
2492 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2493   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2494 }
2495 
2496 /*!
2497 See @ref __kmpc_dispatch_fini_4
2498 */
2499 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2500   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2501 }
2502 
2503 /*!
2504 See @ref __kmpc_dispatch_fini_4
2505 */
2506 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2507   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2508 }
2509 /*! @} */
2510 
2511 //-----------------------------------------------------------------------------
2512 // Non-template routines from kmp_dispatch.cpp used in other sources
2513 
2514 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2515   return value == checker;
2516 }
2517 
2518 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2519   return value != checker;
2520 }
2521 
2522 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2523   return value < checker;
2524 }
2525 
2526 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2527   return value >= checker;
2528 }
2529 
2530 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2531   return value <= checker;
2532 }
2533 
2534 kmp_uint32
2535 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2536              kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2537              void *obj // Higher-level synchronization object, or NULL.
2538              ) {
2539   // note: we may not belong to a team at this point
2540   volatile kmp_uint32 *spin = spinner;
2541   kmp_uint32 check = checker;
2542   kmp_uint32 spins;
2543   kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2544   kmp_uint32 r;
2545 
2546   KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2547   KMP_INIT_YIELD(spins);
2548   // main wait spin loop
2549   while (!f(r = TCR_4(*spin), check)) {
2550     KMP_FSYNC_SPIN_PREPARE(obj);
2551     /* GEH - remove this since it was accidentally introduced when kmp_wait was
2552        split. It causes problems with infinite recursion because of exit lock */
2553     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2554         __kmp_abort_thread(); */
2555     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2556   }
2557   KMP_FSYNC_SPIN_ACQUIRED(obj);
2558   return r;
2559 }
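
/* A minimal usage sketch (an assumption for illustration, not from the
   original source): spin until a shared flag becomes 1, using one of the
   predicates defined above:

     volatile kmp_uint32 flag = 0; // set to 1 by another thread
     kmp_uint32 seen = __kmp_wait_4(&flag, 1, __kmp_eq_4, NULL);

   The value returned is the observed flag value that satisfied the
   predicate. */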
2560 
2561 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
2562                       kmp_uint32 (*pred)(void *, kmp_uint32),
2563                       void *obj // Higher-level synchronization object, or NULL.
2564                       ) {
2565   // note: we may not belong to a team at this point
2566   void *spin = spinner;
2567   kmp_uint32 check = checker;
2568   kmp_uint32 spins;
2569   kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2570 
2571   KMP_FSYNC_SPIN_INIT(obj, spin);
2572   KMP_INIT_YIELD(spins);
2573   // main wait spin loop
2574   while (!f(spin, check)) {
2575     KMP_FSYNC_SPIN_PREPARE(obj);
    /* if we have waited a bit, or are oversubscribed, yield */
2577     /* pause is in the following code */
2578     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2579   }
2580   KMP_FSYNC_SPIN_ACQUIRED(obj);
2581 }
2582 
2583 } // extern "C"
2584 
2585 #ifdef KMP_GOMP_COMPAT
2586 
2587 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2588                                enum sched_type schedule, kmp_int32 lb,
2589                                kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2590                                int push_ws) {
2591   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2592                                  push_ws);
2593 }
2594 
2595 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2596                                 enum sched_type schedule, kmp_uint32 lb,
2597                                 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2598                                 int push_ws) {
2599   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2600                                   push_ws);
2601 }
2602 
2603 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2604                                enum sched_type schedule, kmp_int64 lb,
2605                                kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2606                                int push_ws) {
2607   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2608                                  push_ws);
2609 }
2610 
2611 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2612                                 enum sched_type schedule, kmp_uint64 lb,
2613                                 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2614                                 int push_ws) {
2615   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2616                                   push_ws);
2617 }
2618 
2619 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2620   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2621 }
2622 
2623 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2624   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2625 }
2626 
2627 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2628   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2629 }
2630 
2631 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2632   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2633 }
2634 
2635 #endif /* KMP_GOMP_COMPAT */
2636 
2637 /* ------------------------------------------------------------------------ */
2638