1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 /* Dynamic scheduling initialization and dispatch.
14  *
 * NOTE: __kmp_nth is constant inside any dispatch loop; however, its
 *       value may change between parallel regions.  __kmp_max_nth is
 *       the largest value __kmp_nth may take, and 1 is the smallest.
18  */
19 
20 #include "kmp.h"
21 #include "kmp_error.h"
22 #include "kmp_i18n.h"
23 #include "kmp_itt.h"
24 #include "kmp_stats.h"
25 #include "kmp_str.h"
26 #if KMP_USE_X87CONTROL
27 #include <float.h>
28 #endif
29 #include "kmp_lock.h"
30 #include "kmp_dispatch.h"
31 #if KMP_USE_HIER_SCHED
32 #include "kmp_dispatch_hier.h"
33 #endif
34 
35 #if OMPT_SUPPORT
36 #include "ompt-specific.h"
37 #endif
38 
39 /* ------------------------------------------------------------------------ */
40 /* ------------------------------------------------------------------------ */
41 
42 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43   kmp_info_t *th;
44 
45   KMP_DEBUG_ASSERT(gtid_ref);
46 
47   if (__kmp_env_consistency_check) {
48     th = __kmp_threads[*gtid_ref];
49     if (th->th.th_root->r.r_active &&
50         (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51 #if KMP_USE_DYNAMIC_LOCK
52       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53 #else
54       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55 #endif
56     }
57   }
58 }
59 
60 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61   kmp_info_t *th;
62 
63   if (__kmp_env_consistency_check) {
64     th = __kmp_threads[*gtid_ref];
65     if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66       __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67     }
68   }
69 }
70 
71 // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
72 static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
73                                          bool use_hier = false) {
74   // Pick up the nonmonotonic/monotonic bits from the scheduling type
75   // TODO: make nonmonotonic when static_steal is fixed
76   int monotonicity = SCHEDULE_MONOTONIC;
77 
  // Let the default be monotonic for executables
  // compiled with compilers supporting OpenMP* 4.5 or earlier
80   if (loc->get_openmp_version() < 50)
81     monotonicity = SCHEDULE_MONOTONIC;
82 
83   if (use_hier || __kmp_force_monotonic)
84     monotonicity = SCHEDULE_MONOTONIC;
85   else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
86     monotonicity = SCHEDULE_NONMONOTONIC;
87   else if (SCHEDULE_HAS_MONOTONIC(schedule))
88     monotonicity = SCHEDULE_MONOTONIC;
89 
90   return monotonicity;
91 }
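
// Illustrative reading of the logic above (example values assumed, not taken
// from any particular compiler): a loop compiled as
// schedule(nonmonotonic:dynamic) carries the nonmonotonic modifier bit, so
// SCHEDULE_NONMONOTONIC is returned unless hierarchical scheduling is in use
// or __kmp_force_monotonic is set; a schedule value with neither modifier bit
// keeps the monotonic default.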
92 
93 // Initialize a dispatch_private_info_template<T> buffer for a particular
94 // type of schedule,chunk.  The loop description is found in lb (lower bound),
95 // ub (upper bound), and st (stride).  nproc is the number of threads relevant
96 // to the scheduling (often the number of threads in a team, but not always if
97 // hierarchical scheduling is used).  tid is the id of the thread calling
98 // the function within the group of nproc threads.  It will have a value
99 // between 0 and nproc - 1.  This is often just the thread id within a team, but
100 // is not necessarily the case when using hierarchical scheduling.
101 // loc is the source file location of the corresponding loop
102 // gtid is the global thread id
103 template <typename T>
104 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
105                                    dispatch_private_info_template<T> *pr,
106                                    enum sched_type schedule, T lb, T ub,
107                                    typename traits_t<T>::signed_t st,
108 #if USE_ITT_BUILD
109                                    kmp_uint64 *cur_chunk,
110 #endif
111                                    typename traits_t<T>::signed_t chunk,
112                                    T nproc, T tid) {
113   typedef typename traits_t<T>::unsigned_t UT;
114   typedef typename traits_t<T>::floating_t DBL;
115 
116   int active;
117   T tc;
118   kmp_info_t *th;
119   kmp_team_t *team;
120   int monotonicity;
121   bool use_hier;
122 
123 #ifdef KMP_DEBUG
124   typedef typename traits_t<T>::signed_t ST;
125   {
126     char *buff;
127     // create format specifiers before the debug output
128     buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
129                             "pr:%%p lb:%%%s ub:%%%s st:%%%s "
130                             "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
131                             traits_t<T>::spec, traits_t<T>::spec,
132                             traits_t<ST>::spec, traits_t<ST>::spec,
133                             traits_t<T>::spec, traits_t<T>::spec);
134     KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
135     __kmp_str_free(&buff);
136   }
137 #endif
138   /* setup data */
139   th = __kmp_threads[gtid];
140   team = th->th.th_team;
141   active = !team->t.t_serialized;
142 
143 #if USE_ITT_BUILD
144   int itt_need_metadata_reporting =
145       __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
146       KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
147       team->t.t_active_level == 1;
148 #endif
149 
150 #if KMP_USE_HIER_SCHED
151   use_hier = pr->flags.use_hier;
152 #else
153   use_hier = false;
154 #endif
155 
156   /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
157   monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
158   schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
159 
160   /* Pick up the nomerge/ordered bits from the scheduling type */
161   if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
162     pr->flags.nomerge = TRUE;
163     schedule =
164         (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
165   } else {
166     pr->flags.nomerge = FALSE;
167   }
168   pr->type_size = traits_t<T>::type_size; // remember the size of variables
169   if (kmp_ord_lower & schedule) {
170     pr->flags.ordered = TRUE;
171     schedule =
172         (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
173   } else {
174     pr->flags.ordered = FALSE;
175   }
176   // Ordered overrides nonmonotonic
177   if (pr->flags.ordered) {
178     monotonicity = SCHEDULE_MONOTONIC;
179   }
180 
181   if (schedule == kmp_sch_static) {
182     schedule = __kmp_static;
183   } else {
184     if (schedule == kmp_sch_runtime) {
185       // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
186       // not specified)
187       schedule = team->t.t_sched.r_sched_type;
188       monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
189       schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
190       // Detail the schedule if needed (global controls are differentiated
191       // appropriately)
192       if (schedule == kmp_sch_guided_chunked) {
193         schedule = __kmp_guided;
194       } else if (schedule == kmp_sch_static) {
195         schedule = __kmp_static;
196       }
197       // Use the chunk size specified by OMP_SCHEDULE (or default if not
198       // specified)
199       chunk = team->t.t_sched.chunk;
200 #if USE_ITT_BUILD
201       if (cur_chunk)
202         *cur_chunk = chunk;
203 #endif
204 #ifdef KMP_DEBUG
205       {
206         char *buff;
207         // create format specifiers before the debug output
208         buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
209                                 "schedule:%%d chunk:%%%s\n",
210                                 traits_t<ST>::spec);
211         KD_TRACE(10, (buff, gtid, schedule, chunk));
212         __kmp_str_free(&buff);
213       }
214 #endif
215     } else {
216       if (schedule == kmp_sch_guided_chunked) {
217         schedule = __kmp_guided;
218       }
219       if (chunk <= 0) {
220         chunk = KMP_DEFAULT_CHUNK;
221       }
222     }
223 
224     if (schedule == kmp_sch_auto) {
      // mapping and differentiation are done in __kmp_do_serial_initialize()
226       schedule = __kmp_auto;
227 #ifdef KMP_DEBUG
228       {
229         char *buff;
230         // create format specifiers before the debug output
231         buff = __kmp_str_format(
232             "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
233             "schedule:%%d chunk:%%%s\n",
234             traits_t<ST>::spec);
235         KD_TRACE(10, (buff, gtid, schedule, chunk));
236         __kmp_str_free(&buff);
237       }
238 #endif
239     }
240 #if KMP_STATIC_STEAL_ENABLED
241     // map nonmonotonic:dynamic to static steal
242     if (schedule == kmp_sch_dynamic_chunked) {
243       if (monotonicity == SCHEDULE_NONMONOTONIC)
244         schedule = kmp_sch_static_steal;
245     }
246 #endif
247     /* guided analytical not safe for too many threads */
248     if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
249       schedule = kmp_sch_guided_iterative_chunked;
250       KMP_WARNING(DispatchManyThreads);
251     }
252     if (schedule == kmp_sch_runtime_simd) {
253       // compiler provides simd_width in the chunk parameter
254       schedule = team->t.t_sched.r_sched_type;
255       monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
256       schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
257       // Detail the schedule if needed (global controls are differentiated
258       // appropriately)
259       if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
260           schedule == __kmp_static) {
261         schedule = kmp_sch_static_balanced_chunked;
262       } else {
263         if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
264           schedule = kmp_sch_guided_simd;
265         }
266         chunk = team->t.t_sched.chunk * chunk;
267       }
268 #if USE_ITT_BUILD
269       if (cur_chunk)
270         *cur_chunk = chunk;
271 #endif
272 #ifdef KMP_DEBUG
273       {
274         char *buff;
275         // create format specifiers before the debug output
276         buff = __kmp_str_format(
277             "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
278             " chunk:%%%s\n",
279             traits_t<ST>::spec);
280         KD_TRACE(10, (buff, gtid, schedule, chunk));
281         __kmp_str_free(&buff);
282       }
283 #endif
284     }
285     pr->u.p.parm1 = chunk;
286   }
287   KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
288               "unknown scheduling type");
289 
290   pr->u.p.count = 0;
291 
292   if (__kmp_env_consistency_check) {
293     if (st == 0) {
294       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
295                             (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
296     }
297   }
298   // compute trip count
299   if (st == 1) { // most common case
300     if (ub >= lb) {
301       tc = ub - lb + 1;
302     } else { // ub < lb
303       tc = 0; // zero-trip
304     }
305   } else if (st < 0) {
306     if (lb >= ub) {
307       // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
308       // where the division needs to be unsigned regardless of the result type
309       tc = (UT)(lb - ub) / (-st) + 1;
310     } else { // lb < ub
311       tc = 0; // zero-trip
312     }
313   } else { // st > 0
314     if (ub >= lb) {
315       // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
316       // where the division needs to be unsigned regardless of the result type
317       tc = (UT)(ub - lb) / st + 1;
318     } else { // ub < lb
319       tc = 0; // zero-trip
320     }
321   }
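
  // Worked example of the trip count above (illustrative values): lb=0, ub=9,
  // st=2 gives tc = (9 - 0) / 2 + 1 = 5 (iterations 0,2,4,6,8); lb=10, ub=1,
  // st=-3 gives tc = (10 - 1) / 3 + 1 = 4 (iterations 10,7,4,1).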
322 
323 #if KMP_STATS_ENABLED
324   if (KMP_MASTER_GTID(gtid)) {
325     KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
326   }
327 #endif
328 
329   pr->u.p.lb = lb;
330   pr->u.p.ub = ub;
331   pr->u.p.st = st;
332   pr->u.p.tc = tc;
333 
334 #if KMP_OS_WINDOWS
335   pr->u.p.last_upper = ub + st;
336 #endif /* KMP_OS_WINDOWS */
337 
  /* NOTE: only the active parallel region(s) have active ordered sections */
339 
340   if (active) {
341     if (pr->flags.ordered) {
342       pr->ordered_bumped = 0;
343       pr->u.p.ordered_lower = 1;
344       pr->u.p.ordered_upper = 0;
345     }
346   }
347 
348   switch (schedule) {
349 #if (KMP_STATIC_STEAL_ENABLED)
350   case kmp_sch_static_steal: {
351     T ntc, init;
352 
353     KD_TRACE(100,
354              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
355               gtid));
356 
357     ntc = (tc % chunk ? 1 : 0) + tc / chunk;
358     if (nproc > 1 && ntc >= nproc) {
359       KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
360       T id = tid;
361       T small_chunk, extras;
362 
363       small_chunk = ntc / nproc;
364       extras = ntc % nproc;
365 
366       init = id * small_chunk + (id < extras ? id : extras);
367       pr->u.p.count = init;
368       pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
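
      // Example of the split above (illustrative values): tc=10, chunk=1
      // gives ntc=10 chunks; with nproc=4, small_chunk=2 and extras=2, so
      // threads 0 and 1 initially own chunk indices [0,3) and [3,6) (3 chunks
      // each), and threads 2 and 3 own [6,8) and [8,10) (2 chunks each).
      // Here count is the next chunk index to execute and ub is the exclusive
      // end of the owned range; both are in chunk units, not iterations.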
369 
370       pr->u.p.parm2 = lb;
      // parm3 is the number of times to attempt stealing, which is
      // proportional to the number of chunks per thread, capped at nproc.
374       pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc);
375       pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
376       pr->u.p.st = st;
377       if (traits_t<T>::type_size > 4) {
378         // AC: TODO: check if 16-byte CAS available and use it to
379         // improve performance (probably wait for explicit request
380         // before spending time on this).
381         // For now use dynamically allocated per-thread lock,
382         // free memory in __kmp_dispatch_next when status==0.
383         KMP_DEBUG_ASSERT(pr->u.p.th_steal_lock == NULL);
384         pr->u.p.th_steal_lock =
385             (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
386         __kmp_init_lock(pr->u.p.th_steal_lock);
387       }
388       break;
389     } else {
390       /* too few chunks: switching to kmp_sch_dynamic_chunked */
391       schedule = kmp_sch_dynamic_chunked;
392       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
393                      "kmp_sch_dynamic_chunked\n",
394                      gtid));
395       if (pr->u.p.parm1 <= 0)
396         pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
397       break;
398     } // if
399   } // case
400 #endif
401   case kmp_sch_static_balanced: {
402     T init, limit;
403 
404     KD_TRACE(
405         100,
406         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
407          gtid));
408 
409     if (nproc > 1) {
410       T id = tid;
411 
412       if (tc < nproc) {
413         if (id < tc) {
414           init = id;
415           limit = id;
416           pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
417         } else {
418           pr->u.p.count = 1; /* means no more chunks to execute */
419           pr->u.p.parm1 = FALSE;
420           break;
421         }
422       } else {
423         T small_chunk = tc / nproc;
424         T extras = tc % nproc;
425         init = id * small_chunk + (id < extras ? id : extras);
426         limit = init + small_chunk - (id < extras ? 0 : 1);
427         pr->u.p.parm1 = (id == nproc - 1);
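
        // Example of the split above (illustrative values): tc=10, nproc=4
        // gives small_chunk=2, extras=2, so the iteration offsets are [0,2]
        // for thread 0, [3,5] for thread 1, [6,7] for thread 2 and [8,9] for
        // thread 3; only thread nproc-1 reports the last iteration (parm1).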
428       }
429     } else {
430       if (tc > 0) {
431         init = 0;
432         limit = tc - 1;
433         pr->u.p.parm1 = TRUE;
434       } else {
435         // zero trip count
436         pr->u.p.count = 1; /* means no more chunks to execute */
437         pr->u.p.parm1 = FALSE;
438         break;
439       }
440     }
441 #if USE_ITT_BUILD
442     // Calculate chunk for metadata report
443     if (itt_need_metadata_reporting)
444       if (cur_chunk)
445         *cur_chunk = limit - init + 1;
446 #endif
447     if (st == 1) {
448       pr->u.p.lb = lb + init;
449       pr->u.p.ub = lb + limit;
450     } else {
      // calculated upper bound; "ub" is the user-defined upper bound
452       T ub_tmp = lb + limit * st;
453       pr->u.p.lb = lb + init * st;
454       // adjust upper bound to "ub" if needed, so that MS lastprivate will match
455       // it exactly
456       if (st > 0) {
457         pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
458       } else {
459         pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
460       }
461     }
462     if (pr->flags.ordered) {
463       pr->u.p.ordered_lower = init;
464       pr->u.p.ordered_upper = limit;
465     }
466     break;
467   } // case
468   case kmp_sch_static_balanced_chunked: {
469     // similar to balanced, but chunk adjusted to multiple of simd width
470     T nth = nproc;
471     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
472                    " -> falling-through to static_greedy\n",
473                    gtid));
474     schedule = kmp_sch_static_greedy;
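    // parm1 below is the per-thread chunk: ceil(tc/nth) rounded up to a
    // multiple of chunk (the simd width supplied by the compiler). Note the
    // bit-mask rounding assumes chunk is a power of two.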
475     if (nth > 1)
476       pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
477     else
478       pr->u.p.parm1 = tc;
479     break;
480   } // case
481   case kmp_sch_guided_simd:
482   case kmp_sch_guided_iterative_chunked: {
483     KD_TRACE(
484         100,
485         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
486          " case\n",
487          gtid));
488 
489     if (nproc > 1) {
490       if ((2L * chunk + 1) * nproc >= tc) {
491         /* chunk size too large, switch to dynamic */
492         schedule = kmp_sch_dynamic_chunked;
493       } else {
494         // when remaining iters become less than parm2 - switch to dynamic
495         pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
496         *(double *)&pr->u.p.parm3 =
497             guided_flt_param / (double)nproc; // may occupy parm3 and parm4
498       }
499     } else {
500       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
501                      "kmp_sch_static_greedy\n",
502                      gtid));
503       schedule = kmp_sch_static_greedy;
504       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
505       KD_TRACE(
506           100,
507           ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
508            gtid));
509       pr->u.p.parm1 = tc;
510     } // if
511   } // case
512   break;
513   case kmp_sch_guided_analytical_chunked: {
514     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
515                    "kmp_sch_guided_analytical_chunked case\n",
516                    gtid));
517 
518     if (nproc > 1) {
519       if ((2L * chunk + 1) * nproc >= tc) {
520         /* chunk size too large, switch to dynamic */
521         schedule = kmp_sch_dynamic_chunked;
522       } else {
523         /* commonly used term: (2 nproc - 1)/(2 nproc) */
524         DBL x;
525 
526 #if KMP_USE_X87CONTROL
527         /* Linux* OS already has 64-bit computation by default for long double,
528            and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
529            Windows* OS on IA-32 architecture, we need to set precision to 64-bit
530            instead of the default 53-bit. Even though long double doesn't work
531            on Windows* OS on Intel(R) 64, the resulting lack of precision is not
532            expected to impact the correctness of the algorithm, but this has not
533            been mathematically proven. */
534         // save original FPCW and set precision to 64-bit, as
535         // Windows* OS on IA-32 architecture defaults to 53-bit
536         unsigned int oldFpcw = _control87(0, 0);
537         _control87(_PC_64, _MCW_PC); // 0,0x30000
538 #endif
539         /* value used for comparison in solver for cross-over point */
540         long double target = ((long double)chunk * 2 + 1) * nproc / tc;
541 
542         /* crossover point--chunk indexes equal to or greater than
543            this point switch to dynamic-style scheduling */
544         UT cross;
545 
546         /* commonly used term: (2 nproc - 1)/(2 nproc) */
547         x = 1.0 - 0.5 / (double)nproc;
548 
549 #ifdef KMP_DEBUG
550         { // test natural alignment
551           struct _test_a {
552             char a;
553             union {
554               char b;
555               DBL d;
556             };
557           } t;
558           ptrdiff_t natural_alignment =
559               (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
560           //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
561           // long)natural_alignment );
562           KMP_DEBUG_ASSERT(
563               (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
564         }
565 #endif // KMP_DEBUG
566 
567         /* save the term in thread private dispatch structure */
568         *(DBL *)&pr->u.p.parm3 = x;
569 
570         /* solve for the crossover point to the nearest integer i for which C_i
571            <= chunk */
572         {
573           UT left, right, mid;
574           long double p;
575 
576           /* estimate initial upper and lower bound */
577 
578           /* doesn't matter what value right is as long as it is positive, but
579              it affects performance of the solver */
580           right = 229;
581           p = __kmp_pow<UT>(x, right);
582           if (p > target) {
583             do {
584               p *= p;
585               right <<= 1;
586             } while (p > target && right < (1 << 27));
587             /* lower bound is previous (failed) estimate of upper bound */
588             left = right >> 1;
589           } else {
590             left = 0;
591           }
592 
593           /* bisection root-finding method */
594           while (left + 1 < right) {
595             mid = (left + right) / 2;
596             if (__kmp_pow<UT>(x, mid) > target) {
597               left = mid;
598             } else {
599               right = mid;
600             }
601           } // while
602           cross = right;
603         }
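        // At this point cross is the smallest chunk index i with
        // __kmp_pow<UT>(x, i) <= target: the doubling loop brackets the root
        // and the bisection narrows it to a single index, which the assertion
        // below re-checks.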
604         /* assert sanity of computed crossover point */
605         KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
606                    __kmp_pow<UT>(x, cross) <= target);
607 
608         /* save the crossover point in thread private dispatch structure */
609         pr->u.p.parm2 = cross;
610 
611 // C75803
612 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
613 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
614 #else
615 #define GUIDED_ANALYTICAL_WORKAROUND (x)
616 #endif
617         /* dynamic-style scheduling offset */
618         pr->u.p.count = tc -
619                         __kmp_dispatch_guided_remaining(
620                             tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
621                         cross * chunk;
622 #if KMP_USE_X87CONTROL
623         // restore FPCW
624         _control87(oldFpcw, _MCW_PC);
625 #endif
626       } // if
627     } else {
628       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
629                      "kmp_sch_static_greedy\n",
630                      gtid));
631       schedule = kmp_sch_static_greedy;
632       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
633       pr->u.p.parm1 = tc;
634     } // if
635   } // case
636   break;
637   case kmp_sch_static_greedy:
638     KD_TRACE(
639         100,
640         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
641          gtid));
642     pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
643     break;
644   case kmp_sch_static_chunked:
645   case kmp_sch_dynamic_chunked:
646     if (pr->u.p.parm1 <= 0) {
647       pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
648     }
649     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
650                    "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
651                    gtid));
652     break;
653   case kmp_sch_trapezoidal: {
654     /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
655 
656     T parm1, parm2, parm3, parm4;
657     KD_TRACE(100,
658              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
659               gtid));
660 
661     parm1 = chunk;
662 
663     /* F : size of the first cycle */
664     parm2 = (tc / (2 * nproc));
665 
666     if (parm2 < 1) {
667       parm2 = 1;
668     }
669 
670     /* L : size of the last cycle.  Make sure the last cycle is not larger
671        than the first cycle. */
672     if (parm1 < 1) {
673       parm1 = 1;
674     } else if (parm1 > parm2) {
675       parm1 = parm2;
676     }
677 
678     /* N : number of cycles */
679     parm3 = (parm2 + parm1);
680     parm3 = (2 * tc + parm3 - 1) / parm3;
681 
682     if (parm3 < 2) {
683       parm3 = 2;
684     }
685 
686     /* sigma : decreasing incr of the trapezoid */
687     parm4 = (parm3 - 1);
688     parm4 = (parm2 - parm1) / parm4;
689 
690     // pointless check, because parm4 >= 0 always
691     // if ( parm4 < 0 ) {
692     //    parm4 = 0;
693     //}
694 
695     pr->u.p.parm1 = parm1;
696     pr->u.p.parm2 = parm2;
697     pr->u.p.parm3 = parm3;
698     pr->u.p.parm4 = parm4;
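
    // Worked example (illustrative values): tc=1000, nproc=4, chunk=1 gives
    // F=parm2=125, L=parm1=1, N=parm3=16 and sigma=parm4=8, i.e. roughly 16
    // chunks whose sizes shrink by 8 per chunk, starting near 125 and
    // tapering toward the minimum chunk size of 1.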
699   } // case
700   break;
701 
702   default: {
703     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
704                 KMP_HNT(GetNewerLibrary), // Hint
705                 __kmp_msg_null // Variadic argument list terminator
706     );
707   } break;
708   } // switch
709   pr->schedule = schedule;
710 }
711 
712 #if KMP_USE_HIER_SCHED
713 template <typename T>
714 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
715                                              typename traits_t<T>::signed_t st);
716 template <>
717 inline void
718 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
719                                             kmp_int32 ub, kmp_int32 st) {
720   __kmp_dispatch_init_hierarchy<kmp_int32>(
721       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
722       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
723 }
724 template <>
725 inline void
726 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
727                                              kmp_uint32 ub, kmp_int32 st) {
728   __kmp_dispatch_init_hierarchy<kmp_uint32>(
729       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
730       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
731 }
732 template <>
733 inline void
734 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
735                                             kmp_int64 ub, kmp_int64 st) {
736   __kmp_dispatch_init_hierarchy<kmp_int64>(
737       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
738       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
739 }
740 template <>
741 inline void
742 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
743                                              kmp_uint64 ub, kmp_int64 st) {
744   __kmp_dispatch_init_hierarchy<kmp_uint64>(
745       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
746       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
747 }
748 
749 // free all the hierarchy scheduling memory associated with the team
750 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
751   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
752   for (int i = 0; i < num_disp_buff; ++i) {
753     // type does not matter here so use kmp_int32
754     auto sh =
755         reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
756             &team->t.t_disp_buffer[i]);
757     if (sh->hier) {
758       sh->hier->deallocate();
759       __kmp_free(sh->hier);
760     }
761   }
762 }
763 #endif
764 
765 // UT - unsigned flavor of T, ST - signed flavor of T,
766 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
767 template <typename T>
768 static void
769 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
770                     T ub, typename traits_t<T>::signed_t st,
771                     typename traits_t<T>::signed_t chunk, int push_ws) {
772   typedef typename traits_t<T>::unsigned_t UT;
773 
774   int active;
775   kmp_info_t *th;
776   kmp_team_t *team;
777   kmp_uint32 my_buffer_index;
778   dispatch_private_info_template<T> *pr;
779   dispatch_shared_info_template<T> volatile *sh;
780 
781   KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
782                    sizeof(dispatch_private_info));
783   KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
784                    sizeof(dispatch_shared_info));
785   __kmp_assert_valid_gtid(gtid);
786 
787   if (!TCR_4(__kmp_init_parallel))
788     __kmp_parallel_initialize();
789 
790   __kmp_resume_if_soft_paused();
791 
792 #if INCLUDE_SSC_MARKS
793   SSC_MARK_DISPATCH_INIT();
794 #endif
795 #ifdef KMP_DEBUG
796   typedef typename traits_t<T>::signed_t ST;
797   {
798     char *buff;
799     // create format specifiers before the debug output
800     buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
801                             "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
802                             traits_t<ST>::spec, traits_t<T>::spec,
803                             traits_t<T>::spec, traits_t<ST>::spec);
804     KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
805     __kmp_str_free(&buff);
806   }
807 #endif
808   /* setup data */
809   th = __kmp_threads[gtid];
810   team = th->th.th_team;
811   active = !team->t.t_serialized;
812   th->th.th_ident = loc;
813 
  // Any half-decent optimizer will remove this test when the blocks are empty,
  // since the macros expand to nothing when statistics are disabled.
817   if (schedule == __kmp_static) {
818     KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
819   } else {
820     KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
821   }
822 
823 #if KMP_USE_HIER_SCHED
  // Initialize the scheduling hierarchy if requested in the OMP_SCHEDULE
  // environment variable. Hierarchical scheduling does not work with ordered,
  // so if ordered is detected, revert to threaded scheduling.
827   bool ordered;
828   enum sched_type my_sched = schedule;
829   my_buffer_index = th->th.th_dispatch->th_disp_index;
830   pr = reinterpret_cast<dispatch_private_info_template<T> *>(
831       &th->th.th_dispatch
832            ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
833   my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
834   if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
835     my_sched =
836         (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
837   ordered = (kmp_ord_lower & my_sched);
838   if (pr->flags.use_hier) {
839     if (ordered) {
840       KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected.  "
841                      "Disabling hierarchical scheduling.\n",
842                      gtid));
843       pr->flags.use_hier = FALSE;
844     }
845   }
846   if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
847     // Don't use hierarchical for ordered parallel loops and don't
848     // use the runtime hierarchy if one was specified in the program
849     if (!ordered && !pr->flags.use_hier)
850       __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
851   }
852 #endif // KMP_USE_HIER_SCHED
853 
854 #if USE_ITT_BUILD
855   kmp_uint64 cur_chunk = chunk;
856   int itt_need_metadata_reporting =
857       __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
858       KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
859       team->t.t_active_level == 1;
860 #endif
861   if (!active) {
862     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
863         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
864   } else {
865     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
866                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
867 
868     my_buffer_index = th->th.th_dispatch->th_disp_index++;
869 
    /* What happens when the number of threads changes? Do we need to resize
       the buffer? */
871     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
872         &th->th.th_dispatch
873              ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
874     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
875         &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
876     KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
877                   my_buffer_index));
878   }
879 
880   __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
881 #if USE_ITT_BUILD
882                                 &cur_chunk,
883 #endif
884                                 chunk, (T)th->th.th_team_nproc,
885                                 (T)th->th.th_info.ds.ds_tid);
886   if (active) {
887     if (pr->flags.ordered == 0) {
888       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
889       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
890     } else {
891       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
892       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
893     }
894   }
895 
896   if (active) {
    /* sh->buffer_index should equal my_buffer_index once this buffer is free
     * for us to use */
899 
900     KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
901                    "sh->buffer_index:%d\n",
902                    gtid, my_buffer_index, sh->buffer_index));
903     __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
904                            __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    // Note: KMP_WAIT() cannot be used here: buffer index and
    // my_buffer_index are *always* 32-bit integers.
907     KMP_MB(); /* is this necessary? */
908     KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
909                    "sh->buffer_index:%d\n",
910                    gtid, my_buffer_index, sh->buffer_index));
911 
912     th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
913     th->th.th_dispatch->th_dispatch_sh_current =
914         CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
915 #if USE_ITT_BUILD
916     if (pr->flags.ordered) {
917       __kmp_itt_ordered_init(gtid);
918     }
919     // Report loop metadata
920     if (itt_need_metadata_reporting) {
921       // Only report metadata by master of active team at level 1
922       kmp_uint64 schedtype = 0;
923       switch (schedule) {
924       case kmp_sch_static_chunked:
925       case kmp_sch_static_balanced: // Chunk is calculated in the switch above
926         break;
927       case kmp_sch_static_greedy:
928         cur_chunk = pr->u.p.parm1;
929         break;
930       case kmp_sch_dynamic_chunked:
931         schedtype = 1;
932         break;
933       case kmp_sch_guided_iterative_chunked:
934       case kmp_sch_guided_analytical_chunked:
935       case kmp_sch_guided_simd:
936         schedtype = 2;
937         break;
938       default:
939         // Should we put this case under "static"?
940         // case kmp_sch_static_steal:
941         schedtype = 3;
942         break;
943       }
944       __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
945     }
946 #if KMP_USE_HIER_SCHED
947     if (pr->flags.use_hier) {
948       pr->u.p.count = 0;
949       pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
950     }
#endif // KMP_USE_HIER_SCHED
952 #endif /* USE_ITT_BUILD */
953   }
954 
955 #ifdef KMP_DEBUG
956   {
957     char *buff;
958     // create format specifiers before the debug output
959     buff = __kmp_str_format(
960         "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
961         "lb:%%%s ub:%%%s"
962         " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
963         " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
964         traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
965         traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
966         traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
967         traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
968     KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
969                   pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
970                   pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
971                   pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
972     __kmp_str_free(&buff);
973   }
974 #endif
975 #if (KMP_STATIC_STEAL_ENABLED)
  // It cannot be guaranteed that after execution of a loop with some other
  // schedule kind all the parm3 variables will contain the same value. Even if
  // all parm3 values were the same, a bad case could still occur, e.g. reusing
  // 0 and 1 rather than a program-lifetime increment. So a dedicated variable
  // is required: 'static_steal_counter' is used.
981   if (pr->schedule == kmp_sch_static_steal) {
    // Other threads will inspect this variable when searching for a victim.
    // It serves as a flag indicating that, from this point on, other threads
    // may steal from this thread.
985     volatile T *p = &pr->u.p.static_steal_counter;
986     *p = *p + 1;
987   }
988 #endif // ( KMP_STATIC_STEAL_ENABLED )
989 
990 #if OMPT_SUPPORT && OMPT_OPTIONAL
991   if (ompt_enabled.ompt_callback_work) {
992     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
993     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
994     ompt_callbacks.ompt_callback(ompt_callback_work)(
995         ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
996         &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
997   }
998 #endif
999   KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
1000 }
1001 
1002 /* For ordered loops, either __kmp_dispatch_finish() should be called after
1003  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1004  * every chunk of iterations.  If the ordered section(s) were not executed
1005  * for this iteration (or every iteration in this chunk), we need to set the
1006  * ordered iteration counters so that the next thread can proceed. */
1007 template <typename UT>
1008 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
1009   typedef typename traits_t<UT>::signed_t ST;
1010   __kmp_assert_valid_gtid(gtid);
1011   kmp_info_t *th = __kmp_threads[gtid];
1012 
1013   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1014   if (!th->th.th_team->t.t_serialized) {
1015 
1016     dispatch_private_info_template<UT> *pr =
1017         reinterpret_cast<dispatch_private_info_template<UT> *>(
1018             th->th.th_dispatch->th_dispatch_pr_current);
1019     dispatch_shared_info_template<UT> volatile *sh =
1020         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1021             th->th.th_dispatch->th_dispatch_sh_current);
1022     KMP_DEBUG_ASSERT(pr);
1023     KMP_DEBUG_ASSERT(sh);
1024     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1025                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1026 
1027     if (pr->ordered_bumped) {
1028       KD_TRACE(
1029           1000,
1030           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1031            gtid));
1032       pr->ordered_bumped = 0;
1033     } else {
1034       UT lower = pr->u.p.ordered_lower;
1035 
1036 #ifdef KMP_DEBUG
1037       {
1038         char *buff;
1039         // create format specifiers before the debug output
1040         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1041                                 "ordered_iteration:%%%s lower:%%%s\n",
1042                                 traits_t<UT>::spec, traits_t<UT>::spec);
1043         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1044         __kmp_str_free(&buff);
1045       }
1046 #endif
1047 
1048       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1049                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1050       KMP_MB(); /* is this necessary? */
1051 #ifdef KMP_DEBUG
1052       {
1053         char *buff;
1054         // create format specifiers before the debug output
1055         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1056                                 "ordered_iteration:%%%s lower:%%%s\n",
1057                                 traits_t<UT>::spec, traits_t<UT>::spec);
1058         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1059         __kmp_str_free(&buff);
1060       }
1061 #endif
1062 
1063       test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1064     } // if
1065   } // if
1066   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1067 }
1068 
1069 #ifdef KMP_GOMP_COMPAT
1070 
1071 template <typename UT>
1072 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1073   typedef typename traits_t<UT>::signed_t ST;
1074   __kmp_assert_valid_gtid(gtid);
1075   kmp_info_t *th = __kmp_threads[gtid];
1076 
1077   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1078   if (!th->th.th_team->t.t_serialized) {
1079     //        int cid;
1080     dispatch_private_info_template<UT> *pr =
1081         reinterpret_cast<dispatch_private_info_template<UT> *>(
1082             th->th.th_dispatch->th_dispatch_pr_current);
1083     dispatch_shared_info_template<UT> volatile *sh =
1084         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1085             th->th.th_dispatch->th_dispatch_sh_current);
1086     KMP_DEBUG_ASSERT(pr);
1087     KMP_DEBUG_ASSERT(sh);
1088     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1089                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1090 
1091     //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1092     UT lower = pr->u.p.ordered_lower;
1093     UT upper = pr->u.p.ordered_upper;
1094     UT inc = upper - lower + 1;
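
    // inc is the number of iterations in this chunk; ordered_bumped counts
    // how many of them have already passed through their ordered section.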
1095 
1096     if (pr->ordered_bumped == inc) {
1097       KD_TRACE(
1098           1000,
1099           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1100            gtid));
1101       pr->ordered_bumped = 0;
1102     } else {
1103       inc -= pr->ordered_bumped;
1104 
1105 #ifdef KMP_DEBUG
1106       {
1107         char *buff;
1108         // create format specifiers before the debug output
1109         buff = __kmp_str_format(
1110             "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1111             "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1112             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1113         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1114         __kmp_str_free(&buff);
1115       }
1116 #endif
1117 
1118       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1119                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1120 
1121       KMP_MB(); /* is this necessary? */
1122       KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1123                       "ordered_bumped to zero\n",
1124                       gtid));
1125       pr->ordered_bumped = 0;
1126 //!!!!! TODO check if the inc should be unsigned, or signed???
1127 #ifdef KMP_DEBUG
1128       {
1129         char *buff;
1130         // create format specifiers before the debug output
1131         buff = __kmp_str_format(
1132             "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1133             "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1134             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1135             traits_t<UT>::spec);
1136         KD_TRACE(1000,
1137                  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1138         __kmp_str_free(&buff);
1139       }
1140 #endif
1141 
1142       test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1143     }
1144     //        }
1145   }
1146   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1147 }
1148 
1149 #endif /* KMP_GOMP_COMPAT */
1150 
1151 template <typename T>
1152 int __kmp_dispatch_next_algorithm(int gtid,
1153                                   dispatch_private_info_template<T> *pr,
1154                                   dispatch_shared_info_template<T> volatile *sh,
1155                                   kmp_int32 *p_last, T *p_lb, T *p_ub,
1156                                   typename traits_t<T>::signed_t *p_st, T nproc,
1157                                   T tid) {
1158   typedef typename traits_t<T>::unsigned_t UT;
1159   typedef typename traits_t<T>::signed_t ST;
1160   typedef typename traits_t<T>::floating_t DBL;
1161   int status = 0;
1162   bool last = false;
1163   T start;
1164   ST incr;
1165   UT limit, trip, init;
1166   kmp_info_t *th = __kmp_threads[gtid];
1167   kmp_team_t *team = th->th.th_team;
1168 
1169   KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1170                    &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1171   KMP_DEBUG_ASSERT(pr);
1172   KMP_DEBUG_ASSERT(sh);
1173   KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1174 #ifdef KMP_DEBUG
1175   {
1176     char *buff;
1177     // create format specifiers before the debug output
1178     buff =
1179         __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1180                          "sh:%%p nproc:%%%s tid:%%%s\n",
1181                          traits_t<T>::spec, traits_t<T>::spec);
1182     KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1183     __kmp_str_free(&buff);
1184   }
1185 #endif
1186 
1187   // zero trip count
1188   if (pr->u.p.tc == 0) {
1189     KD_TRACE(10,
1190              ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1191               "zero status:%d\n",
1192               gtid, status));
1193     return 0;
1194   }
1195 
1196   switch (pr->schedule) {
1197 #if (KMP_STATIC_STEAL_ENABLED)
1198   case kmp_sch_static_steal: {
1199     T chunk = pr->u.p.parm1;
1200 
1201     KD_TRACE(100,
1202              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1203               gtid));
1204 
1205     trip = pr->u.p.tc - 1;
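
    // Note: for static_steal, pr->u.p.count and pr->u.p.ub hold chunk
    // indices, not iterations; a claimed chunk index is converted back to
    // iterations (init * chunk) at the bottom of this case.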
1206 
1207     if (traits_t<T>::type_size > 4) {
1208       // use lock for 8-byte and CAS for 4-byte induction
1209       // variable. TODO (optional): check and use 16-byte CAS
1210       kmp_lock_t *lck = pr->u.p.th_steal_lock;
1211       KMP_DEBUG_ASSERT(lck != NULL);
1212       if (pr->u.p.count < (UT)pr->u.p.ub) {
1213         __kmp_acquire_lock(lck, gtid);
1214         // try to get own chunk of iterations
1215         init = (pr->u.p.count)++;
1216         status = (init < (UT)pr->u.p.ub);
1217         __kmp_release_lock(lck, gtid);
1218       } else {
1219         status = 0; // no own chunks
1220       }
1221       if (!status) { // try to steal
1222         kmp_info_t **other_threads = team->t.t_threads;
1223         T while_limit = pr->u.p.parm3;
1224         T while_index = 0;
1225         T id = pr->u.p.static_steal_counter; // loop id
1226         int idx = (th->th.th_dispatch->th_disp_index - 1) %
1227                   __kmp_dispatch_num_buffers; // current loop index
1228         // note: victim thread can potentially execute another loop
1229         // TODO: algorithm of searching for a victim
1230         // should be cleaned up and measured
1231         while ((!status) && (while_limit != ++while_index)) {
1232           dispatch_private_info_template<T> *victim;
1233           T remaining;
1234           T victimIdx = pr->u.p.parm4;
1235           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1236           victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1237               &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1238           KMP_DEBUG_ASSERT(victim);
1239           while ((victim == pr || id != victim->u.p.static_steal_counter) &&
1240                  oldVictimIdx != victimIdx) {
1241             victimIdx = (victimIdx + 1) % nproc;
1242             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1243                 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1244             KMP_DEBUG_ASSERT(victim);
1245           }
1246           if (victim == pr || id != victim->u.p.static_steal_counter) {
1247             continue; // try once more (nproc attempts in total)
1248             // no victim is ready yet to participate in stealing
1249             // because no victim passed kmp_init_dispatch yet
1250           }
1251           if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1252             pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1253             continue; // not enough chunks to steal, goto next victim
1254           }
1255 
1256           lck = victim->u.p.th_steal_lock;
1257           KMP_ASSERT(lck != NULL);
1258           __kmp_acquire_lock(lck, gtid);
1259           limit = victim->u.p.ub; // keep initial ub
1260           if (victim->u.p.count >= limit ||
1261               (remaining = limit - victim->u.p.count) < 2) {
1262             __kmp_release_lock(lck, gtid);
1263             pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1264             continue; // not enough chunks to steal
1265           }
1266           // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or
1267           // by 1
1268           if (remaining > 3) {
1269             // steal 1/4 of remaining
1270             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1271             init = (victim->u.p.ub -= (remaining >> 2));
1272           } else {
1273             // steal 1 chunk of 2 or 3 remaining
1274             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1275             init = (victim->u.p.ub -= 1);
1276           }
1277           __kmp_release_lock(lck, gtid);
1278 
1279           KMP_DEBUG_ASSERT(init + 1 <= limit);
1280           pr->u.p.parm4 = victimIdx; // remember victim to steal from
1281           status = 1;
1282           while_index = 0;
          // now update own count and ub with the stolen range, excluding the
          // init chunk (which we are about to execute)
1284           __kmp_acquire_lock(pr->u.p.th_steal_lock, gtid);
1285           pr->u.p.count = init + 1;
1286           pr->u.p.ub = limit;
1287           __kmp_release_lock(pr->u.p.th_steal_lock, gtid);
1288         } // while (search for victim)
1289       } // if (try to find victim and steal)
1290     } else {
1291       // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1292       typedef union {
1293         struct {
1294           UT count;
1295           T ub;
1296         } p;
1297         kmp_int64 b;
1298       } union_i4;
1299       // All operations on 'count' or 'ub' must be combined atomically
1300       // together.
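      // Packing (count, ub) into one 64-bit word lets the owner's count
      // increment and a thief's ub decrement each be done with a single
      // 64-bit CAS, so no lock is needed on this path (unlike the >4-byte
      // case above, which uses a per-thread lock).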
1301       {
1302         union_i4 vold, vnew;
1303         vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1304         vnew = vold;
1305         vnew.p.count++;
1306         while (!KMP_COMPARE_AND_STORE_ACQ64(
1307             (volatile kmp_int64 *)&pr->u.p.count,
1308             *VOLATILE_CAST(kmp_int64 *) & vold.b,
1309             *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1310           KMP_CPU_PAUSE();
1311           vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1312           vnew = vold;
1313           vnew.p.count++;
1314         }
1315         vnew = vold;
1316         init = vnew.p.count;
1317         status = (init < (UT)vnew.p.ub);
1318       }
1319 
1320       if (!status) {
1321         kmp_info_t **other_threads = team->t.t_threads;
1322         T while_limit = pr->u.p.parm3;
1323         T while_index = 0;
1324         T id = pr->u.p.static_steal_counter; // loop id
1325         int idx = (th->th.th_dispatch->th_disp_index - 1) %
1326                   __kmp_dispatch_num_buffers; // current loop index
1327         // note: victim thread can potentially execute another loop
1328         // TODO: algorithm of searching for a victim
1329         // should be cleaned up and measured
1330         while ((!status) && (while_limit != ++while_index)) {
1331           dispatch_private_info_template<T> *victim;
1332           union_i4 vold, vnew;
1333           T remaining;
1334           T victimIdx = pr->u.p.parm4;
1335           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1336           victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1337               &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1338           KMP_DEBUG_ASSERT(victim);
1339           while ((victim == pr || id != victim->u.p.static_steal_counter) &&
1340                  oldVictimIdx != victimIdx) {
1341             victimIdx = (victimIdx + 1) % nproc;
1342             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1343                 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1344             KMP_DEBUG_ASSERT(victim);
1345           }
1346           if (victim == pr || id != victim->u.p.static_steal_counter) {
1347             continue; // try once more (nproc attempts in total)
1348             // no victim is ready yet to participate in stealing
1349             // because no victim passed kmp_init_dispatch yet
1350           }
1351           pr->u.p.parm4 = victimIdx; // new victim found
1352           while (1) { // CAS loop if victim has enough chunks to steal
1353             vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1354             vnew = vold;
1355 
1356             KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1357             if (vnew.p.count >= (UT)vnew.p.ub ||
1358                 (remaining = vnew.p.ub - vnew.p.count) < 2) {
1359               pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1360               break; // not enough chunks to steal, goto next victim
1361             }
1362             if (remaining > 3) {
1363               // try to steal 1/4 of remaining
1364               vnew.p.ub -= remaining >> 2;
1365             } else {
1366               vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1367             }
1368             KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1369             // TODO: Should this be acquire or release?
1370             if (KMP_COMPARE_AND_STORE_ACQ64(
1371                     (volatile kmp_int64 *)&victim->u.p.count,
1372                     *VOLATILE_CAST(kmp_int64 *) & vold.b,
1373                     *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1374               // stealing succeeded
1375               KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1376                                         vold.p.ub - vnew.p.ub);
1377               status = 1;
1378               while_index = 0;
1379               // now update own count and ub
1380               init = vnew.p.ub;
1381               vold.p.count = init + 1;
1382 #if KMP_ARCH_X86
1383               KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1384 #else
1385               *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1386 #endif
1387               break;
1388             } // if (check CAS result)
1389             KMP_CPU_PAUSE(); // CAS failed, repeatedly attempt
1390           } // while (try to steal from particular victim)
1391         } // while (search for victim)
1392       } // if (try to find victim and steal)
1393     } // if (4-byte induction variable)
1394     if (!status) {
1395       *p_lb = 0;
1396       *p_ub = 0;
1397       if (p_st != NULL)
1398         *p_st = 0;
1399     } else {
1400       start = pr->u.p.parm2;
1401       init *= chunk;
1402       limit = chunk + init - 1;
1403       incr = pr->u.p.st;
1404       KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1405 
1406       KMP_DEBUG_ASSERT(init <= trip);
1407       if ((last = (limit >= trip)) != 0)
1408         limit = trip;
1409       if (p_st != NULL)
1410         *p_st = incr;
1411 
1412       if (incr == 1) {
1413         *p_lb = start + init;
1414         *p_ub = start + limit;
1415       } else {
1416         *p_lb = start + init * incr;
1417         *p_ub = start + limit * incr;
1418       }
1419 
1420       if (pr->flags.ordered) {
1421         pr->u.p.ordered_lower = init;
1422         pr->u.p.ordered_upper = limit;
1423       } // if
1424     } // if
1425     break;
1426   } // case
1427 #endif // ( KMP_STATIC_STEAL_ENABLED )
1428   case kmp_sch_static_balanced: {
1429     KD_TRACE(
1430         10,
1431         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1432          gtid));
1433     /* check if thread has any iteration to do */
1434     if ((status = !pr->u.p.count) != 0) {
1435       pr->u.p.count = 1;
1436       *p_lb = pr->u.p.lb;
1437       *p_ub = pr->u.p.ub;
1438       last = (pr->u.p.parm1 != 0);
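      // parm1 was precomputed at initialization: nonzero iff this thread's
      // single chunk contains the last iteration of the loop.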
1439       if (p_st != NULL)
1440         *p_st = pr->u.p.st;
1441     } else { /* no iterations to do */
1442       pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1443     }
1444   } // case
1445   break;
1446   case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1447                                  merged here */
1448   case kmp_sch_static_chunked: {
1449     T parm1;
1450 
1451     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1452                    "kmp_sch_static_[affinity|chunked] case\n",
1453                    gtid));
1454     parm1 = pr->u.p.parm1;
1455 
1456     trip = pr->u.p.tc - 1;
1457     init = parm1 * (pr->u.p.count + tid);
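    // Chunks are assigned cyclically: thread tid takes chunks tid, tid + nproc,
    // tid + 2 * nproc, ... (count is advanced by nproc below). E.g. with
    // nproc=4, parm1=10, tid=2 the first call yields init=20, the next 60.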
1458 
1459     if ((status = (init <= trip)) != 0) {
1460       start = pr->u.p.lb;
1461       incr = pr->u.p.st;
1462       limit = parm1 + init - 1;
1463 
1464       if ((last = (limit >= trip)) != 0)
1465         limit = trip;
1466 
1467       if (p_st != NULL)
1468         *p_st = incr;
1469 
1470       pr->u.p.count += nproc;
1471 
1472       if (incr == 1) {
1473         *p_lb = start + init;
1474         *p_ub = start + limit;
1475       } else {
1476         *p_lb = start + init * incr;
1477         *p_ub = start + limit * incr;
1478       }
1479 
1480       if (pr->flags.ordered) {
1481         pr->u.p.ordered_lower = init;
1482         pr->u.p.ordered_upper = limit;
1483       } // if
1484     } // if
1485   } // case
1486   break;
1487 
1488   case kmp_sch_dynamic_chunked: {
1489     T chunk = pr->u.p.parm1;
1490 
1491     KD_TRACE(
1492         100,
1493         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1494          gtid));
1495 
1496     init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1497     trip = pr->u.p.tc - 1;
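    // Each call atomically claims the next chunk index from the shared
    // iteration counter; e.g. with chunk == 4 the thread that fetches counter
    // value 3 gets iterations [12, 15] of the canonical iteration space.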
1498 
1499     if ((status = (init <= trip)) == 0) {
1500       *p_lb = 0;
1501       *p_ub = 0;
1502       if (p_st != NULL)
1503         *p_st = 0;
1504     } else {
1505       start = pr->u.p.lb;
1506       limit = chunk + init - 1;
1507       incr = pr->u.p.st;
1508 
1509       if ((last = (limit >= trip)) != 0)
1510         limit = trip;
1511 
1512       if (p_st != NULL)
1513         *p_st = incr;
1514 
1515       if (incr == 1) {
1516         *p_lb = start + init;
1517         *p_ub = start + limit;
1518       } else {
1519         *p_lb = start + init * incr;
1520         *p_ub = start + limit * incr;
1521       }
1522 
1523       if (pr->flags.ordered) {
1524         pr->u.p.ordered_lower = init;
1525         pr->u.p.ordered_upper = limit;
1526       } // if
1527     } // if
1528   } // case
1529   break;
1530 
1531   case kmp_sch_guided_iterative_chunked: {
1532     T chunkspec = pr->u.p.parm1;
1533     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1534                    "iterative case\n",
1535                    gtid));
1536     trip = pr->u.p.tc;
1537     // Start atomic part of calculations
1538     while (1) {
1539       ST remaining; // signed, because can be < 0
1540       init = sh->u.s.iteration; // shared value
1541       remaining = trip - init;
1542       if (remaining <= 0) { // AC: need to compare with 0 first
1543         // nothing to do, don't try atomic op
1544         status = 0;
1545         break;
1546       }
1547       if ((T)remaining <
1548           pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1549         // use dynamic-style schedule
1550         // atomically increment iterations, get old value
1551         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1552                                  (ST)chunkspec);
1553         remaining = trip - init;
1554         if (remaining <= 0) {
          status = 0; // all iterations were claimed by other threads
1556         } else {
1557           // got some iterations to work on
1558           status = 1;
1559           if ((T)remaining > chunkspec) {
1560             limit = init + chunkspec - 1;
1561           } else {
1562             last = true; // the last chunk
1563             limit = init + remaining - 1;
1564           } // if
1565         } // if
1566         break;
1567       } // if
1568       limit = init + (UT)((double)remaining *
1569                           *(double *)&pr->u.p.parm3); // divide by K*nproc
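      // Guided sketch: parm3 holds 1/(K*nproc) as a double (K=2 by default),
      // so the CAS below tries to claim roughly remaining/(K*nproc)
      // iterations, e.g. ~125 of 1000 remaining when K*nproc == 8.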
1570       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1571                                (ST)init, (ST)limit)) {
1572         // CAS was successful, chunk obtained
1573         status = 1;
1574         --limit;
1575         break;
1576       } // if
1577     } // while
1578     if (status != 0) {
1579       start = pr->u.p.lb;
1580       incr = pr->u.p.st;
1581       if (p_st != NULL)
1582         *p_st = incr;
1583       *p_lb = start + init * incr;
1584       *p_ub = start + limit * incr;
1585       if (pr->flags.ordered) {
1586         pr->u.p.ordered_lower = init;
1587         pr->u.p.ordered_upper = limit;
1588       } // if
1589     } else {
1590       *p_lb = 0;
1591       *p_ub = 0;
1592       if (p_st != NULL)
1593         *p_st = 0;
1594     } // if
1595   } // case
1596   break;
1597 
1598   case kmp_sch_guided_simd: {
    // Same as the iterative case, but the current chunk is adjusted to be a
    // multiple of the given chunk.
1601     T chunk = pr->u.p.parm1;
1602     KD_TRACE(100,
1603              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1604               gtid));
1605     trip = pr->u.p.tc;
1606     // Start atomic part of calculations
1607     while (1) {
1608       ST remaining; // signed, because can be < 0
1609       init = sh->u.s.iteration; // shared value
1610       remaining = trip - init;
1611       if (remaining <= 0) { // AC: need to compare with 0 first
1612         status = 0; // nothing to do, don't try atomic op
1613         break;
1614       }
1615       KMP_DEBUG_ASSERT(init % chunk == 0);
1616       // compare with K*nproc*(chunk+1), K=2 by default
1617       if ((T)remaining < pr->u.p.parm2) {
1618         // use dynamic-style schedule
1619         // atomically increment iterations, get old value
1620         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1621                                  (ST)chunk);
1622         remaining = trip - init;
1623         if (remaining <= 0) {
          status = 0; // all iterations were claimed by other threads
1625         } else {
1626           // got some iterations to work on
1627           status = 1;
1628           if ((T)remaining > chunk) {
1629             limit = init + chunk - 1;
1630           } else {
1631             last = true; // the last chunk
1632             limit = init + remaining - 1;
1633           } // if
1634         } // if
1635         break;
1636       } // if
1637       // divide by K*nproc
1638       UT span;
1639       __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3),
1640                          &span);
1641       UT rem = span % chunk;
1642       if (rem) // adjust so that span%chunk == 0
1643         span += chunk - rem;
1644       limit = init + span;
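      // For illustration: a computed span of 13 with chunk == 8 has rem == 5
      // and is bumped to 16, so every claimed range stays a multiple of the
      // SIMD chunk (which the assert above depends on).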
1645       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1646                                (ST)init, (ST)limit)) {
1647         // CAS was successful, chunk obtained
1648         status = 1;
1649         --limit;
1650         break;
1651       } // if
1652     } // while
1653     if (status != 0) {
1654       start = pr->u.p.lb;
1655       incr = pr->u.p.st;
1656       if (p_st != NULL)
1657         *p_st = incr;
1658       *p_lb = start + init * incr;
1659       *p_ub = start + limit * incr;
1660       if (pr->flags.ordered) {
1661         pr->u.p.ordered_lower = init;
1662         pr->u.p.ordered_upper = limit;
1663       } // if
1664     } else {
1665       *p_lb = 0;
1666       *p_ub = 0;
1667       if (p_st != NULL)
1668         *p_st = 0;
1669     } // if
1670   } // case
1671   break;
1672 
1673   case kmp_sch_guided_analytical_chunked: {
1674     T chunkspec = pr->u.p.parm1;
1675     UT chunkIdx;
1676 #if KMP_USE_X87CONTROL
1677     /* for storing original FPCW value for Windows* OS on
1678        IA-32 architecture 8-byte version */
1679     unsigned int oldFpcw;
1680     unsigned int fpcwSet = 0;
1681 #endif
1682     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1683                    "kmp_sch_guided_analytical_chunked case\n",
1684                    gtid));
1685 
1686     trip = pr->u.p.tc;
1687 
1688     KMP_DEBUG_ASSERT(nproc > 1);
1689     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1690 
1691     while (1) { /* this while loop is a safeguard against unexpected zero
1692                    chunk sizes */
1693       chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1694       if (chunkIdx >= (UT)pr->u.p.parm2) {
1695         --trip;
1696         /* use dynamic-style scheduling */
1697         init = chunkIdx * chunkspec + pr->u.p.count;
1698         /* need to verify init > 0 in case of overflow in the above
1699          * calculation */
1700         if ((status = (init > 0 && init <= trip)) != 0) {
1701           limit = init + chunkspec - 1;
1702 
1703           if ((last = (limit >= trip)) != 0)
1704             limit = trip;
1705         }
1706         break;
1707       } else {
1708 /* use exponential-style scheduling */
/* The following check works around the lack of long double precision on
   Windows* OS, which could otherwise result in init != 0 for chunkIdx == 0. */
1713 #if KMP_USE_X87CONTROL
1714         /* If we haven't already done so, save original
1715            FPCW and set precision to 64-bit, as Windows* OS
1716            on IA-32 architecture defaults to 53-bit */
1717         if (!fpcwSet) {
1718           oldFpcw = _control87(0, 0);
1719           _control87(_PC_64, _MCW_PC);
1720           fpcwSet = 0x30000;
1721         }
1722 #endif
1723         if (chunkIdx) {
1724           init = __kmp_dispatch_guided_remaining<T>(
1725               trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1726           KMP_DEBUG_ASSERT(init);
1727           init = trip - init;
1728         } else
1729           init = 0;
1730         limit = trip - __kmp_dispatch_guided_remaining<T>(
1731                            trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
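        // init and limit are the trip count minus the analytically computed
        // iterations remaining after chunkIdx and chunkIdx + 1 chunks,
        // respectively, so together they bracket chunk number chunkIdx.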
1732         KMP_ASSERT(init <= limit);
1733         if (init < limit) {
1734           KMP_DEBUG_ASSERT(limit <= trip);
1735           --limit;
1736           status = 1;
1737           break;
1738         } // if
1739       } // if
1740     } // while (1)
1741 #if KMP_USE_X87CONTROL
1742     /* restore FPCW if necessary
1743        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1744     */
1745     if (fpcwSet && (oldFpcw & fpcwSet))
1746       _control87(oldFpcw, _MCW_PC);
1747 #endif
1748     if (status != 0) {
1749       start = pr->u.p.lb;
1750       incr = pr->u.p.st;
1751       if (p_st != NULL)
1752         *p_st = incr;
1753       *p_lb = start + init * incr;
1754       *p_ub = start + limit * incr;
1755       if (pr->flags.ordered) {
1756         pr->u.p.ordered_lower = init;
1757         pr->u.p.ordered_upper = limit;
1758       }
1759     } else {
1760       *p_lb = 0;
1761       *p_ub = 0;
1762       if (p_st != NULL)
1763         *p_st = 0;
1764     }
1765   } // case
1766   break;
1767 
1768   case kmp_sch_trapezoidal: {
1769     UT index;
1770     T parm2 = pr->u.p.parm2;
1771     T parm3 = pr->u.p.parm3;
1772     T parm4 = pr->u.p.parm4;
1773     KD_TRACE(100,
1774              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1775               gtid));
1776 
1777     index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1778 
1779     init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
1780     trip = pr->u.p.tc - 1;
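    // Chunk sizes shrink linearly from parm2 by parm4 per chunk, so chunk
    // 'index' starts at the arithmetic-series sum
    //   init = index * (2*parm2 - (index - 1)*parm4) / 2.
    // E.g. parm2=10, parm4=2 gives chunk sizes 10, 8, 6, ... and chunk 2
    // starts at iteration 2 * (20 - 2) / 2 = 18.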
1781 
1782     if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1783       *p_lb = 0;
1784       *p_ub = 0;
1785       if (p_st != NULL)
1786         *p_st = 0;
1787     } else {
1788       start = pr->u.p.lb;
1789       limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1790       incr = pr->u.p.st;
1791 
1792       if ((last = (limit >= trip)) != 0)
1793         limit = trip;
1794 
1795       if (p_st != NULL)
1796         *p_st = incr;
1797 
1798       if (incr == 1) {
1799         *p_lb = start + init;
1800         *p_ub = start + limit;
1801       } else {
1802         *p_lb = start + init * incr;
1803         *p_ub = start + limit * incr;
1804       }
1805 
1806       if (pr->flags.ordered) {
1807         pr->u.p.ordered_lower = init;
1808         pr->u.p.ordered_upper = limit;
1809       } // if
1810     } // if
1811   } // case
1812   break;
1813   default: {
    status = 0; // avoid compiler complaints about uninitialized variable use
1815     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1816                 KMP_HNT(GetNewerLibrary), // Hint
1817                 __kmp_msg_null // Variadic argument list terminator
1818     );
1819   } break;
1820   } // switch
1821   if (p_last)
1822     *p_last = last;
1823 #ifdef KMP_DEBUG
1824   if (pr->flags.ordered) {
1825     char *buff;
1826     // create format specifiers before the debug output
1827     buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1828                             "ordered_lower:%%%s ordered_upper:%%%s\n",
1829                             traits_t<UT>::spec, traits_t<UT>::spec);
1830     KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1831     __kmp_str_free(&buff);
1832   }
1833   {
1834     char *buff;
1835     // create format specifiers before the debug output
1836     buff = __kmp_str_format(
1837         "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1838         "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1839         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1840     KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1841     __kmp_str_free(&buff);
1842   }
1843 #endif
1844   return status;
1845 }
1846 
1847 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1848    work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1849    is not called. */
1850 #if OMPT_SUPPORT && OMPT_OPTIONAL
1851 #define OMPT_LOOP_END                                                          \
1852   if (status == 0) {                                                           \
1853     if (ompt_enabled.ompt_callback_work) {                                     \
1854       ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
1855       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
1856       ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
1857           ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
1858           &(task_info->task_data), 0, codeptr);                                \
1859     }                                                                          \
1860   }
1861 // TODO: implement count
1862 #else
1863 #define OMPT_LOOP_END // no-op
1864 #endif
1865 
1866 #if KMP_STATS_ENABLED
1867 #define KMP_STATS_LOOP_END                                                     \
1868   {                                                                            \
1869     kmp_int64 u, l, t, i;                                                      \
1870     l = (kmp_int64)(*p_lb);                                                    \
1871     u = (kmp_int64)(*p_ub);                                                    \
1872     i = (kmp_int64)(pr->u.p.st);                                               \
1873     if (status == 0) {                                                         \
1874       t = 0;                                                                   \
1875       KMP_POP_PARTITIONED_TIMER();                                             \
1876     } else if (i == 1) {                                                       \
1877       if (u >= l)                                                              \
1878         t = u - l + 1;                                                         \
1879       else                                                                     \
1880         t = 0;                                                                 \
1881     } else if (i < 0) {                                                        \
1882       if (l >= u)                                                              \
1883         t = (l - u) / (-i) + 1;                                                \
1884       else                                                                     \
1885         t = 0;                                                                 \
1886     } else {                                                                   \
1887       if (u >= l)                                                              \
1888         t = (u - l) / i + 1;                                                   \
1889       else                                                                     \
1890         t = 0;                                                                 \
1891     }                                                                          \
1892     KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
1893   }
1894 #else
1895 #define KMP_STATS_LOOP_END /* Nothing */
1896 #endif
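
// The trip count reported above uses the usual closed form t = (u - l) / i + 1
// (and its negative-stride mirror): e.g. l=0, u=9, i=2 yields t = 9/2 + 1 = 5.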
1897 
1898 template <typename T>
1899 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1900                                T *p_lb, T *p_ub,
1901                                typename traits_t<T>::signed_t *p_st
1902 #if OMPT_SUPPORT && OMPT_OPTIONAL
1903                                ,
1904                                void *codeptr
1905 #endif
1906 ) {
1907 
1908   typedef typename traits_t<T>::unsigned_t UT;
1909   typedef typename traits_t<T>::signed_t ST;
  // This is potentially slightly misleading: schedule(runtime) will appear
  // here even if the actual runtime schedule is static. (Which points out a
  // disadvantage of schedule(runtime): even when static scheduling is used it
  // costs more than a compile-time choice of static scheduling would.)
1914   KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
1915 
1916   int status;
1917   dispatch_private_info_template<T> *pr;
1918   __kmp_assert_valid_gtid(gtid);
1919   kmp_info_t *th = __kmp_threads[gtid];
1920   kmp_team_t *team = th->th.th_team;
1921 
1922   KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1923   KD_TRACE(
1924       1000,
1925       ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
1926        gtid, p_lb, p_ub, p_st, p_last));
1927 
1928   if (team->t.t_serialized) {
1929     /* NOTE: serialize this dispatch because we are not at the active level */
1930     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1931         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1932     KMP_DEBUG_ASSERT(pr);
1933 
1934     if ((status = (pr->u.p.tc != 0)) == 0) {
1935       *p_lb = 0;
1936       *p_ub = 0;
1937       //            if ( p_last != NULL )
1938       //                *p_last = 0;
1939       if (p_st != NULL)
1940         *p_st = 0;
1941       if (__kmp_env_consistency_check) {
1942         if (pr->pushed_ws != ct_none) {
1943           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1944         }
1945       }
1946     } else if (pr->flags.nomerge) {
1947       kmp_int32 last;
1948       T start;
1949       UT limit, trip, init;
1950       ST incr;
1951       T chunk = pr->u.p.parm1;
1952 
1953       KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1954                      gtid));
1955 
1956       init = chunk * pr->u.p.count++;
1957       trip = pr->u.p.tc - 1;
1958 
1959       if ((status = (init <= trip)) == 0) {
1960         *p_lb = 0;
1961         *p_ub = 0;
1962         //                if ( p_last != NULL )
1963         //                    *p_last = 0;
1964         if (p_st != NULL)
1965           *p_st = 0;
1966         if (__kmp_env_consistency_check) {
1967           if (pr->pushed_ws != ct_none) {
1968             pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1969           }
1970         }
1971       } else {
1972         start = pr->u.p.lb;
1973         limit = chunk + init - 1;
1974         incr = pr->u.p.st;
1975 
1976         if ((last = (limit >= trip)) != 0) {
1977           limit = trip;
1978 #if KMP_OS_WINDOWS
1979           pr->u.p.last_upper = pr->u.p.ub;
1980 #endif /* KMP_OS_WINDOWS */
1981         }
1982         if (p_last != NULL)
1983           *p_last = last;
1984         if (p_st != NULL)
1985           *p_st = incr;
1986         if (incr == 1) {
1987           *p_lb = start + init;
1988           *p_ub = start + limit;
1989         } else {
1990           *p_lb = start + init * incr;
1991           *p_ub = start + limit * incr;
1992         }
1993 
1994         if (pr->flags.ordered) {
1995           pr->u.p.ordered_lower = init;
1996           pr->u.p.ordered_upper = limit;
1997 #ifdef KMP_DEBUG
1998           {
1999             char *buff;
2000             // create format specifiers before the debug output
2001             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2002                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
2003                                     traits_t<UT>::spec, traits_t<UT>::spec);
2004             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2005                             pr->u.p.ordered_upper));
2006             __kmp_str_free(&buff);
2007           }
2008 #endif
2009         } // if
2010       } // if
2011     } else {
2012       pr->u.p.tc = 0;
2013       *p_lb = pr->u.p.lb;
2014       *p_ub = pr->u.p.ub;
2015 #if KMP_OS_WINDOWS
2016       pr->u.p.last_upper = *p_ub;
2017 #endif /* KMP_OS_WINDOWS */
2018       if (p_last != NULL)
2019         *p_last = TRUE;
2020       if (p_st != NULL)
2021         *p_st = pr->u.p.st;
2022     } // if
2023 #ifdef KMP_DEBUG
2024     {
2025       char *buff;
2026       // create format specifiers before the debug output
2027       buff = __kmp_str_format(
2028           "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2029           "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
2030           traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2031       KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,
2032                     (p_last ? *p_last : 0), status));
2033       __kmp_str_free(&buff);
2034     }
2035 #endif
2036 #if INCLUDE_SSC_MARKS
2037     SSC_MARK_DISPATCH_NEXT();
2038 #endif
2039     OMPT_LOOP_END;
2040     KMP_STATS_LOOP_END;
2041     return status;
2042   } else {
2043     kmp_int32 last = 0;
2044     dispatch_shared_info_template<T> volatile *sh;
2045 
2046     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2047                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2048 
2049     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2050         th->th.th_dispatch->th_dispatch_pr_current);
2051     KMP_DEBUG_ASSERT(pr);
2052     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2053         th->th.th_dispatch->th_dispatch_sh_current);
2054     KMP_DEBUG_ASSERT(sh);
2055 
2056 #if KMP_USE_HIER_SCHED
2057     if (pr->flags.use_hier)
2058       status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2059     else
2060 #endif // KMP_USE_HIER_SCHED
2061       status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2062                                                 p_st, th->th.th_team_nproc,
2063                                                 th->th.th_info.ds.ds_tid);
2064     // status == 0: no more iterations to execute
2065     if (status == 0) {
2066       UT num_done;
2067 
2068       num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2069 #ifdef KMP_DEBUG
2070       {
2071         char *buff;
2072         // create format specifiers before the debug output
2073         buff = __kmp_str_format(
2074             "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2075             traits_t<UT>::spec);
2076         KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2077         __kmp_str_free(&buff);
2078       }
2079 #endif
2080 
2081 #if KMP_USE_HIER_SCHED
2082       pr->flags.use_hier = FALSE;
2083 #endif
2084       if ((ST)num_done == th->th.th_team_nproc - 1) {
2085 #if (KMP_STATIC_STEAL_ENABLED)
2086         if (pr->schedule == kmp_sch_static_steal &&
2087             traits_t<T>::type_size > 4) {
2088           int i;
2089           int idx = (th->th.th_dispatch->th_disp_index - 1) %
2090                     __kmp_dispatch_num_buffers; // current loop index
2091           kmp_info_t **other_threads = team->t.t_threads;
2092           // loop complete, safe to destroy locks used for stealing
2093           for (i = 0; i < th->th.th_team_nproc; ++i) {
2094             dispatch_private_info_template<T> *buf =
2095                 reinterpret_cast<dispatch_private_info_template<T> *>(
2096                     &other_threads[i]->th.th_dispatch->th_disp_buffer[idx]);
2097             kmp_lock_t *lck = buf->u.p.th_steal_lock;
2098             KMP_ASSERT(lck != NULL);
2099             __kmp_destroy_lock(lck);
2100             __kmp_free(lck);
2101             buf->u.p.th_steal_lock = NULL;
2102           }
2103         }
2104 #endif
2105         /* NOTE: release this buffer to be reused */
2106 
2107         KMP_MB(); /* Flush all pending memory write invalidates.  */
2108 
2109         sh->u.s.num_done = 0;
2110         sh->u.s.iteration = 0;
2111 
2112         /* TODO replace with general release procedure? */
2113         if (pr->flags.ordered) {
2114           sh->u.s.ordered_iteration = 0;
2115         }
2116 
2117         KMP_MB(); /* Flush all pending memory write invalidates.  */
2118 
2119         sh->buffer_index += __kmp_dispatch_num_buffers;
2120         KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2121                        gtid, sh->buffer_index));
2122 
2123         KMP_MB(); /* Flush all pending memory write invalidates.  */
2124 
2125       } // if
2126       if (__kmp_env_consistency_check) {
2127         if (pr->pushed_ws != ct_none) {
2128           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2129         }
2130       }
2131 
2132       th->th.th_dispatch->th_deo_fcn = NULL;
2133       th->th.th_dispatch->th_dxo_fcn = NULL;
2134       th->th.th_dispatch->th_dispatch_sh_current = NULL;
2135       th->th.th_dispatch->th_dispatch_pr_current = NULL;
2136     } // if (status == 0)
2137 #if KMP_OS_WINDOWS
2138     else if (last) {
2139       pr->u.p.last_upper = pr->u.p.ub;
2140     }
2141 #endif /* KMP_OS_WINDOWS */
2142     if (p_last != NULL && status != 0)
2143       *p_last = last;
2144   } // if
2145 
2146 #ifdef KMP_DEBUG
2147   {
2148     char *buff;
2149     // create format specifiers before the debug output
2150     buff = __kmp_str_format(
2151         "__kmp_dispatch_next: T#%%d normal case: "
2152         "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2153         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2154     KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2155                   (p_last ? *p_last : 0), status));
2156     __kmp_str_free(&buff);
2157   }
2158 #endif
2159 #if INCLUDE_SSC_MARKS
2160   SSC_MARK_DISPATCH_NEXT();
2161 #endif
2162   OMPT_LOOP_END;
2163   KMP_STATS_LOOP_END;
2164   return status;
2165 }
2166 
2167 template <typename T>
2168 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2169                                   kmp_int32 *plastiter, T *plower, T *pupper,
2170                                   typename traits_t<T>::signed_t incr) {
2171   typedef typename traits_t<T>::unsigned_t UT;
2172   kmp_uint32 team_id;
2173   kmp_uint32 nteams;
2174   UT trip_count;
2175   kmp_team_t *team;
2176   kmp_info_t *th;
2177 
2178   KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2179   KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2180 #ifdef KMP_DEBUG
2181   typedef typename traits_t<T>::signed_t ST;
2182   {
2183     char *buff;
2184     // create format specifiers before the debug output
2185     buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2186                             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2187                             traits_t<T>::spec, traits_t<T>::spec,
2188                             traits_t<ST>::spec, traits_t<T>::spec);
2189     KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2190     __kmp_str_free(&buff);
2191   }
2192 #endif
2193 
2194   if (__kmp_env_consistency_check) {
2195     if (incr == 0) {
2196       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2197                             loc);
2198     }
2199     if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2200       // The loop is illegal.
      // Some zero-trip loops are maintained by the compiler, e.g.:
2202       //   for(i=10;i<0;++i) // lower >= upper - run-time check
2203       //   for(i=0;i>10;--i) // lower <= upper - run-time check
2204       //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2205       //   for(i=10;i<0;--i) // incr < 0       - compile-time check
      // The compiler does not check the following illegal loops:
2207       //   for(i=0;i<10;i+=incr) // where incr<0
2208       //   for(i=10;i>0;i-=incr) // where incr<0
2209       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2210     }
2211   }
2212   __kmp_assert_valid_gtid(gtid);
2213   th = __kmp_threads[gtid];
2214   team = th->th.th_team;
2215   KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2216   nteams = th->th.th_teams_size.nteams;
2217   team_id = team->t.t_master_tid;
2218   KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2219 
2220   // compute global trip count
2221   if (incr == 1) {
2222     trip_count = *pupper - *plower + 1;
2223   } else if (incr == -1) {
2224     trip_count = *plower - *pupper + 1;
2225   } else if (incr > 0) {
2226     // upper-lower can exceed the limit of signed type
2227     trip_count = (UT)(*pupper - *plower) / incr + 1;
2228   } else {
2229     trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2230   }
2231 
2232   if (trip_count <= nteams) {
2233     KMP_DEBUG_ASSERT(
2234         __kmp_static == kmp_sch_static_greedy ||
2235         __kmp_static ==
2236             kmp_sch_static_balanced); // Unknown static scheduling type.
2237     // only some teams get single iteration, others get nothing
2238     if (team_id < trip_count) {
2239       *pupper = *plower = *plower + team_id * incr;
2240     } else {
2241       *plower = *pupper + incr; // zero-trip loop
2242     }
2243     if (plastiter != NULL)
2244       *plastiter = (team_id == trip_count - 1);
2245   } else {
2246     if (__kmp_static == kmp_sch_static_balanced) {
2247       UT chunk = trip_count / nteams;
2248       UT extras = trip_count % nteams;
2249       *plower +=
2250           incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2251       *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
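      // For illustration: trip_count=10, nteams=3 gives chunk=3, extras=1, so
      // team 0 gets 4 iterations and teams 1 and 2 get 3 each.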
2252       if (plastiter != NULL)
2253         *plastiter = (team_id == nteams - 1);
2254     } else {
2255       T chunk_inc_count =
2256           (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2257       T upper = *pupper;
2258       KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2259       // Unknown static scheduling type.
2260       *plower += team_id * chunk_inc_count;
2261       *pupper = *plower + chunk_inc_count - incr;
2262       // Check/correct bounds if needed
2263       if (incr > 0) {
2264         if (*pupper < *plower)
2265           *pupper = traits_t<T>::max_value;
2266         if (plastiter != NULL)
2267           *plastiter = *plower <= upper && *pupper > upper - incr;
2268         if (*pupper > upper)
2269           *pupper = upper; // tracker C73258
2270       } else {
2271         if (*pupper > *plower)
2272           *pupper = traits_t<T>::min_value;
2273         if (plastiter != NULL)
2274           *plastiter = *plower >= upper && *pupper < upper - incr;
2275         if (*pupper < upper)
2276           *pupper = upper; // tracker C73258
2277       }
2278     }
2279   }
2280 }
2281 
2282 //-----------------------------------------------------------------------------
2283 // Dispatch routines
2284 //    Transfer call to template< type T >
2285 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2286 //                         T lb, T ub, ST st, ST chunk )
2287 extern "C" {
2288 
2289 /*!
2290 @ingroup WORK_SHARING
2291 @{
2292 @param loc Source location
2293 @param gtid Global thread id
2294 @param schedule Schedule type
2295 @param lb  Lower bound
2296 @param ub  Upper bound
2297 @param st  Step (or increment if you prefer)
2298 @param chunk The chunk size to block with
2299 
2300 This function prepares the runtime to start a dynamically scheduled for loop,
2301 saving the loop arguments.
2302 These functions are all identical apart from the types of the arguments.
2303 */
2304 
2305 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2306                             enum sched_type schedule, kmp_int32 lb,
2307                             kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2308   KMP_DEBUG_ASSERT(__kmp_init_serial);
2309 #if OMPT_SUPPORT && OMPT_OPTIONAL
2310   OMPT_STORE_RETURN_ADDRESS(gtid);
2311 #endif
2312   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2313 }
2314 /*!
2315 See @ref __kmpc_dispatch_init_4
2316 */
2317 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2318                              enum sched_type schedule, kmp_uint32 lb,
2319                              kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2320   KMP_DEBUG_ASSERT(__kmp_init_serial);
2321 #if OMPT_SUPPORT && OMPT_OPTIONAL
2322   OMPT_STORE_RETURN_ADDRESS(gtid);
2323 #endif
2324   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2325 }
2326 
2327 /*!
2328 See @ref __kmpc_dispatch_init_4
2329 */
2330 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2331                             enum sched_type schedule, kmp_int64 lb,
2332                             kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2333   KMP_DEBUG_ASSERT(__kmp_init_serial);
2334 #if OMPT_SUPPORT && OMPT_OPTIONAL
2335   OMPT_STORE_RETURN_ADDRESS(gtid);
2336 #endif
2337   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2338 }
2339 
2340 /*!
2341 See @ref __kmpc_dispatch_init_4
2342 */
2343 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2344                              enum sched_type schedule, kmp_uint64 lb,
2345                              kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2346   KMP_DEBUG_ASSERT(__kmp_init_serial);
2347 #if OMPT_SUPPORT && OMPT_OPTIONAL
2348   OMPT_STORE_RETURN_ADDRESS(gtid);
2349 #endif
2350   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2351 }
2352 
2353 /*!
2354 See @ref __kmpc_dispatch_init_4
2355 
These functions differ from the __kmpc_dispatch_init set in that they are
called for the composite distribute parallel for construct. Thus, before
dispatching the regular iterations, the per-team iteration space must be
computed.
2359 
2360 These functions are all identical apart from the types of the arguments.
2361 */
2362 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2363                                  enum sched_type schedule, kmp_int32 *p_last,
2364                                  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2365                                  kmp_int32 chunk) {
2366   KMP_DEBUG_ASSERT(__kmp_init_serial);
2367 #if OMPT_SUPPORT && OMPT_OPTIONAL
2368   OMPT_STORE_RETURN_ADDRESS(gtid);
2369 #endif
2370   __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2371   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2372 }
2373 
2374 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2375                                   enum sched_type schedule, kmp_int32 *p_last,
2376                                   kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2377                                   kmp_int32 chunk) {
2378   KMP_DEBUG_ASSERT(__kmp_init_serial);
2379 #if OMPT_SUPPORT && OMPT_OPTIONAL
2380   OMPT_STORE_RETURN_ADDRESS(gtid);
2381 #endif
2382   __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2383   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2384 }
2385 
2386 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2387                                  enum sched_type schedule, kmp_int32 *p_last,
2388                                  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2389                                  kmp_int64 chunk) {
2390   KMP_DEBUG_ASSERT(__kmp_init_serial);
2391 #if OMPT_SUPPORT && OMPT_OPTIONAL
2392   OMPT_STORE_RETURN_ADDRESS(gtid);
2393 #endif
2394   __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2395   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2396 }
2397 
2398 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2399                                   enum sched_type schedule, kmp_int32 *p_last,
2400                                   kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2401                                   kmp_int64 chunk) {
2402   KMP_DEBUG_ASSERT(__kmp_init_serial);
2403 #if OMPT_SUPPORT && OMPT_OPTIONAL
2404   OMPT_STORE_RETURN_ADDRESS(gtid);
2405 #endif
2406   __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2407   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2408 }
2409 
2410 /*!
2411 @param loc Source code location
2412 @param gtid Global thread id
2413 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2414 otherwise
2415 @param p_lb   Pointer to the lower bound for the next chunk of work
2416 @param p_ub   Pointer to the upper bound for the next chunk of work
2417 @param p_st   Pointer to the stride for the next chunk of work
2418 @return one if there is work to be done, zero otherwise
2419 
2420 Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then the lb, ub and stride need not be modified.
2422 */
2423 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2424                            kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2425 #if OMPT_SUPPORT && OMPT_OPTIONAL
2426   OMPT_STORE_RETURN_ADDRESS(gtid);
2427 #endif
2428   return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2429 #if OMPT_SUPPORT && OMPT_OPTIONAL
2430                                         ,
2431                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2432 #endif
2433   );
2434 }
2435 
2436 /*!
2437 See @ref __kmpc_dispatch_next_4
2438 */
2439 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2440                             kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2441                             kmp_int32 *p_st) {
2442 #if OMPT_SUPPORT && OMPT_OPTIONAL
2443   OMPT_STORE_RETURN_ADDRESS(gtid);
2444 #endif
2445   return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2446 #if OMPT_SUPPORT && OMPT_OPTIONAL
2447                                          ,
2448                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2449 #endif
2450   );
2451 }
2452 
2453 /*!
2454 See @ref __kmpc_dispatch_next_4
2455 */
2456 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2457                            kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2458 #if OMPT_SUPPORT && OMPT_OPTIONAL
2459   OMPT_STORE_RETURN_ADDRESS(gtid);
2460 #endif
2461   return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2462 #if OMPT_SUPPORT && OMPT_OPTIONAL
2463                                         ,
2464                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2465 #endif
2466   );
2467 }
2468 
2469 /*!
2470 See @ref __kmpc_dispatch_next_4
2471 */
2472 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2473                             kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2474                             kmp_int64 *p_st) {
2475 #if OMPT_SUPPORT && OMPT_OPTIONAL
2476   OMPT_STORE_RETURN_ADDRESS(gtid);
2477 #endif
2478   return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2479 #if OMPT_SUPPORT && OMPT_OPTIONAL
2480                                          ,
2481                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2482 #endif
2483   );
2484 }
2485 
2486 /*!
2487 @param loc Source code location
2488 @param gtid Global thread id
2489 
2490 Mark the end of a dynamic loop.
2491 */
2492 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2493   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2494 }
2495 
2496 /*!
2497 See @ref __kmpc_dispatch_fini_4
2498 */
2499 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2500   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2501 }
2502 
2503 /*!
2504 See @ref __kmpc_dispatch_fini_4
2505 */
2506 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2507   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2508 }
2509 
2510 /*!
2511 See @ref __kmpc_dispatch_fini_4
2512 */
2513 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2514   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2515 }
2516 /*! @} */
2517 
2518 //-----------------------------------------------------------------------------
2519 // Non-template routines from kmp_dispatch.cpp used in other sources
2520 
2521 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2522   return value == checker;
2523 }
2524 
2525 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2526   return value != checker;
2527 }
2528 
2529 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2530   return value < checker;
2531 }
2532 
2533 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2534   return value >= checker;
2535 }
2536 
2537 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2538   return value <= checker;
2539 }
2540 
2541 kmp_uint32
2542 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2543              kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2544              void *obj // Higher-level synchronization object, or NULL.
2545 ) {
2546   // note: we may not belong to a team at this point
2547   volatile kmp_uint32 *spin = spinner;
2548   kmp_uint32 check = checker;
2549   kmp_uint32 spins;
2550   kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2551   kmp_uint32 r;
2552 
2553   KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2554   KMP_INIT_YIELD(spins);
2555   // main wait spin loop
2556   while (!f(r = TCR_4(*spin), check)) {
2557     KMP_FSYNC_SPIN_PREPARE(obj);
2558     /* GEH - remove this since it was accidentally introduced when kmp_wait was
2559        split. It causes problems with infinite recursion because of exit lock */
2560     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2561         __kmp_abort_thread(); */
2562     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2563   }
2564   KMP_FSYNC_SPIN_ACQUIRED(obj);
2565   return r;
2566 }
2567 
2568 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
2569                       kmp_uint32 (*pred)(void *, kmp_uint32),
2570                       void *obj // Higher-level synchronization object, or NULL.
2571 ) {
2572   // note: we may not belong to a team at this point
2573   void *spin = spinner;
2574   kmp_uint32 check = checker;
2575   kmp_uint32 spins;
2576   kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2577 
2578   KMP_FSYNC_SPIN_INIT(obj, spin);
2579   KMP_INIT_YIELD(spins);
2580   // main wait spin loop
2581   while (!f(spin, check)) {
2582     KMP_FSYNC_SPIN_PREPARE(obj);
    /* if we have waited a bit, or are oversubscribed, yield */
2584     /* pause is in the following code */
2585     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2586   }
2587   KMP_FSYNC_SPIN_ACQUIRED(obj);
2588 }
2589 
2590 } // extern "C"
2591 
2592 #ifdef KMP_GOMP_COMPAT
2593 
2594 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2595                                enum sched_type schedule, kmp_int32 lb,
2596                                kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2597                                int push_ws) {
2598   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2599                                  push_ws);
2600 }
2601 
2602 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2603                                 enum sched_type schedule, kmp_uint32 lb,
2604                                 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2605                                 int push_ws) {
2606   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2607                                   push_ws);
2608 }
2609 
2610 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2611                                enum sched_type schedule, kmp_int64 lb,
2612                                kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2613                                int push_ws) {
2614   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2615                                  push_ws);
2616 }
2617 
2618 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2619                                 enum sched_type schedule, kmp_uint64 lb,
2620                                 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2621                                 int push_ws) {
2622   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2623                                   push_ws);
2624 }
2625 
2626 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2627   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2628 }
2629 
2630 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2631   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2632 }
2633 
2634 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2635   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2636 }
2637 
2638 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2639   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2640 }
2641 
2642 #endif /* KMP_GOMP_COMPAT */
2643 
2644 /* ------------------------------------------------------------------------ */
2645