1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 /* Dynamic scheduling initialization and dispatch.
14  *
15  * NOTE: __kmp_nth is constant inside any dispatch loop; however, it may
16  *       change between parallel regions.  __kmp_max_nth is the largest
17  *       value __kmp_nth may take; 1 is the smallest.
18  */
19 
20 #include "kmp.h"
21 #include "kmp_error.h"
22 #include "kmp_i18n.h"
23 #include "kmp_itt.h"
24 #include "kmp_stats.h"
25 #include "kmp_str.h"
26 #if KMP_USE_X87CONTROL
27 #include <float.h>
28 #endif
29 #include "kmp_lock.h"
30 #include "kmp_dispatch.h"
31 #if KMP_USE_HIER_SCHED
32 #include "kmp_dispatch_hier.h"
33 #endif
34 
35 #if OMPT_SUPPORT
36 #include "ompt-specific.h"
37 #endif
38 
39 /* ------------------------------------------------------------------------ */
40 /* ------------------------------------------------------------------------ */
41 
42 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43   kmp_info_t *th;
44 
45   KMP_DEBUG_ASSERT(gtid_ref);
46 
47   if (__kmp_env_consistency_check) {
48     th = __kmp_threads[*gtid_ref];
49     if (th->th.th_root->r.r_active &&
50         (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51 #if KMP_USE_DYNAMIC_LOCK
52       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53 #else
54       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55 #endif
56     }
57   }
58 }
59 
60 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61   kmp_info_t *th;
62 
63   if (__kmp_env_consistency_check) {
64     th = __kmp_threads[*gtid_ref];
65     if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66       __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67     }
68   }
69 }
70 
71 // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
72 static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
73                                          bool use_hier = false) {
74   // Pick up the nonmonotonic/monotonic bits from the scheduling type
75   // TODO: make nonmonotonic when static_steal is fixed
76   int monotonicity = SCHEDULE_MONOTONIC;
77 
78   // Let the default be monotonic for executables
79   // compiled with OpenMP* 4.5 or earlier compilers
80   if (loc != NULL && loc->get_openmp_version() < 50)
81     monotonicity = SCHEDULE_MONOTONIC;
82 
83   if (use_hier || __kmp_force_monotonic)
84     monotonicity = SCHEDULE_MONOTONIC;
85   else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
86     monotonicity = SCHEDULE_NONMONOTONIC;
87   else if (SCHEDULE_HAS_MONOTONIC(schedule))
88     monotonicity = SCHEDULE_MONOTONIC;
89 
90   return monotonicity;
91 }
92 
93 #if KMP_STATIC_STEAL_ENABLED
94 enum { // values for steal_flag (possible states of private per-loop buffer)
95   UNUSED = 0,
96   CLAIMED = 1, // owner thread started initialization
97   READY = 2, // available for stealing
98   THIEF = 3 // finished by owner, or claimed by thief
99   // possible state changes:
100   // 0 -> 1 owner only, sync
101   // 0 -> 3 thief only, sync
102   // 1 -> 2 owner only, async
103   // 2 -> 3 owner only, async
104   // 3 -> 2 owner only, async
105   // 3 -> 0 last thread finishing the loop, async
106 };
107 #endif
108 
109 // Initialize a dispatch_private_info_template<T> buffer for a particular
110 // type of schedule and chunk.  The loop description is found in lb (lower bound),
111 // ub (upper bound), and st (stride).  nproc is the number of threads relevant
112 // to the scheduling (often the number of threads in a team, but not always if
113 // hierarchical scheduling is used).  tid is the id of the thread calling
114 // the function within the group of nproc threads.  It will have a value
115 // between 0 and nproc - 1.  This is often just the thread id within a team, but
116 // is not necessarily the case when using hierarchical scheduling.
117 // loc is the source file location of the corresponding loop
118 // gtid is the global thread id
119 template <typename T>
120 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
121                                    dispatch_private_info_template<T> *pr,
122                                    enum sched_type schedule, T lb, T ub,
123                                    typename traits_t<T>::signed_t st,
124 #if USE_ITT_BUILD
125                                    kmp_uint64 *cur_chunk,
126 #endif
127                                    typename traits_t<T>::signed_t chunk,
128                                    T nproc, T tid) {
129   typedef typename traits_t<T>::unsigned_t UT;
130   typedef typename traits_t<T>::floating_t DBL;
131 
132   int active;
133   T tc;
134   kmp_info_t *th;
135   kmp_team_t *team;
136   int monotonicity;
137   bool use_hier;
138 
139 #ifdef KMP_DEBUG
140   typedef typename traits_t<T>::signed_t ST;
141   {
142     char *buff;
143     // create format specifiers before the debug output
144     buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
145                             "pr:%%p lb:%%%s ub:%%%s st:%%%s "
146                             "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
147                             traits_t<T>::spec, traits_t<T>::spec,
148                             traits_t<ST>::spec, traits_t<ST>::spec,
149                             traits_t<T>::spec, traits_t<T>::spec);
150     KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
151     __kmp_str_free(&buff);
152   }
153 #endif
154   /* setup data */
155   th = __kmp_threads[gtid];
156   team = th->th.th_team;
157   active = !team->t.t_serialized;
158 
159 #if USE_ITT_BUILD
160   int itt_need_metadata_reporting =
161       __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
162       KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
163       team->t.t_active_level == 1;
164 #endif
165 
166 #if KMP_USE_HIER_SCHED
167   use_hier = pr->flags.use_hier;
168 #else
169   use_hier = false;
170 #endif
171 
172   /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
173   monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
174   schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
175 
176   /* Pick up the nomerge/ordered bits from the scheduling type */
177   if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
178     pr->flags.nomerge = TRUE;
179     schedule =
180         (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
181   } else {
182     pr->flags.nomerge = FALSE;
183   }
184   pr->type_size = traits_t<T>::type_size; // remember the size of variables
185   if (kmp_ord_lower & schedule) {
186     pr->flags.ordered = TRUE;
187     schedule =
188         (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
189   } else {
190     pr->flags.ordered = FALSE;
191   }
192   // Ordered overrides nonmonotonic
193   if (pr->flags.ordered) {
194     monotonicity = SCHEDULE_MONOTONIC;
195   }
196 
197   if (schedule == kmp_sch_static) {
198     schedule = __kmp_static;
199   } else {
200     if (schedule == kmp_sch_runtime) {
201       // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
202       // not specified)
203       schedule = team->t.t_sched.r_sched_type;
204       monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
205       schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
206       if (pr->flags.ordered) // correct monotonicity for ordered loop if needed
207         monotonicity = SCHEDULE_MONOTONIC;
208       // Resolve the schedule to a concrete type if needed (global controls
209       // are differentiated appropriately)
210       if (schedule == kmp_sch_guided_chunked) {
211         schedule = __kmp_guided;
212       } else if (schedule == kmp_sch_static) {
213         schedule = __kmp_static;
214       }
215       // Use the chunk size specified by OMP_SCHEDULE (or default if not
216       // specified)
217       chunk = team->t.t_sched.chunk;
218 #if USE_ITT_BUILD
219       if (cur_chunk)
220         *cur_chunk = chunk;
221 #endif
222 #ifdef KMP_DEBUG
223       {
224         char *buff;
225         // create format specifiers before the debug output
226         buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
227                                 "schedule:%%d chunk:%%%s\n",
228                                 traits_t<ST>::spec);
229         KD_TRACE(10, (buff, gtid, schedule, chunk));
230         __kmp_str_free(&buff);
231       }
232 #endif
233     } else {
234       if (schedule == kmp_sch_guided_chunked) {
235         schedule = __kmp_guided;
236       }
237       if (chunk <= 0) {
238         chunk = KMP_DEFAULT_CHUNK;
239       }
240     }
241 
242     if (schedule == kmp_sch_auto) {
243       // the mapping and differentiation are done in __kmp_do_serial_initialize()
244       schedule = __kmp_auto;
245 #ifdef KMP_DEBUG
246       {
247         char *buff;
248         // create format specifiers before the debug output
249         buff = __kmp_str_format(
250             "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
251             "schedule:%%d chunk:%%%s\n",
252             traits_t<ST>::spec);
253         KD_TRACE(10, (buff, gtid, schedule, chunk));
254         __kmp_str_free(&buff);
255       }
256 #endif
257     }
258 #if KMP_STATIC_STEAL_ENABLED
259     // map nonmonotonic:dynamic to static steal
260     if (schedule == kmp_sch_dynamic_chunked) {
261       if (monotonicity == SCHEDULE_NONMONOTONIC)
262         schedule = kmp_sch_static_steal;
263     }
264 #endif
265     /* guided analytical is not safe for too many threads */
266     if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
267       schedule = kmp_sch_guided_iterative_chunked;
268       KMP_WARNING(DispatchManyThreads);
269     }
270     if (schedule == kmp_sch_runtime_simd) {
271       // compiler provides simd_width in the chunk parameter
272       schedule = team->t.t_sched.r_sched_type;
273       monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
274       schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
275       // Resolve the schedule to a concrete type if needed (global controls
276       // are differentiated appropriately)
277       if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
278           schedule == __kmp_static) {
279         schedule = kmp_sch_static_balanced_chunked;
280       } else {
281         if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
282           schedule = kmp_sch_guided_simd;
283         }
284         chunk = team->t.t_sched.chunk * chunk;
285       }
286 #if USE_ITT_BUILD
287       if (cur_chunk)
288         *cur_chunk = chunk;
289 #endif
290 #ifdef KMP_DEBUG
291       {
292         char *buff;
293         // create format specifiers before the debug output
294         buff = __kmp_str_format(
295             "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
296             " chunk:%%%s\n",
297             traits_t<ST>::spec);
298         KD_TRACE(10, (buff, gtid, schedule, chunk));
299         __kmp_str_free(&buff);
300       }
301 #endif
302     }
303     pr->u.p.parm1 = chunk;
304   }
305   KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
306               "unknown scheduling type");
307 
308   pr->u.p.count = 0;
309 
310   if (__kmp_env_consistency_check) {
311     if (st == 0) {
312       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
313                             (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
314     }
315   }
316   // compute trip count
317   if (st == 1) { // most common case
318     if (ub >= lb) {
319       tc = ub - lb + 1;
320     } else { // ub < lb
321       tc = 0; // zero-trip
322     }
323   } else if (st < 0) {
324     if (lb >= ub) {
325       // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
326       // where the division needs to be unsigned regardless of the result type
327       tc = (UT)(lb - ub) / (-st) + 1;
328     } else { // lb < ub
329       tc = 0; // zero-trip
330     }
331   } else { // st > 0
332     if (ub >= lb) {
333       // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
334       // where the division needs to be unsigned regardless of the result type
335       tc = (UT)(ub - lb) / st + 1;
336     } else { // ub < lb
337       tc = 0; // zero-trip
338     }
339   }
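  // For illustration (not part of the original source): with st == 1, the loop
  // "for (i = 5; i <= 17; i++)" gives tc = 17 - 5 + 1 = 13; with st == 3 it
  // gives tc = (17 - 5) / 3 + 1 = 5 (i = 5, 8, 11, 14, 17); with st == -2,
  // "for (i = 10; i >= 1; i -= 2)" gives tc = (10 - 1) / 2 + 1 = 5.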
340 
341 #if KMP_STATS_ENABLED
342   if (KMP_MASTER_GTID(gtid)) {
343     KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
344   }
345 #endif
346 
347   pr->u.p.lb = lb;
348   pr->u.p.ub = ub;
349   pr->u.p.st = st;
350   pr->u.p.tc = tc;
351 
352 #if KMP_OS_WINDOWS
353   pr->u.p.last_upper = ub + st;
354 #endif /* KMP_OS_WINDOWS */
355 
356   /* NOTE: only the active parallel region(s) have active ordered sections */
357 
358   if (active) {
359     if (pr->flags.ordered) {
360       pr->ordered_bumped = 0;
361       pr->u.p.ordered_lower = 1;
362       pr->u.p.ordered_upper = 0;
363     }
364   }
365 
366   switch (schedule) {
367 #if KMP_STATIC_STEAL_ENABLED
368   case kmp_sch_static_steal: {
369     T ntc, init;
370 
371     KD_TRACE(100,
372              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
373               gtid));
374 
375     ntc = (tc % chunk ? 1 : 0) + tc / chunk;
376     if (nproc > 1 && ntc >= nproc) {
377       KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
378       T id = tid;
379       T small_chunk, extras;
380       kmp_uint32 old = UNUSED;
381       int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED);
382       if (traits_t<T>::type_size > 4) {
383         // AC: TODO: check if 16-byte CAS available and use it to
384         // improve performance (probably wait for explicit request
385         // before spending time on this).
386         // For now use dynamically allocated per-private-buffer lock,
387         // free memory in __kmp_dispatch_next when status==0.
388         pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
389         __kmp_init_lock(pr->u.p.steal_lock);
390       }
391       small_chunk = ntc / nproc;
392       extras = ntc % nproc;
393 
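      // Worked example (illustrative only): ntc = 10 chunks and nproc = 4 give
      // small_chunk = 2 and extras = 2, so the init/ub formulas below assign
      // chunk-index ranges [0,3), [3,6), [6,8), [8,10) to threads 0..3.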
394       init = id * small_chunk + (id < extras ? id : extras);
395       pr->u.p.count = init;
396       if (claimed) { // did we succeed in claiming our own buffer?
397         pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
398         // Other threads will inspect steal_flag when searching for a victim.
399         // READY means other threads may steal from this thread from now on.
400         KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
401       } else {
402         // another thread has stolen our whole range
403         KMP_DEBUG_ASSERT(pr->steal_flag == THIEF);
404         pr->u.p.ub = init; // mark that there are no iterations to work on
405       }
406       pr->u.p.parm2 = ntc; // save number of chunks
407       // parm3 is the number of times to attempt stealing, which is
408       // nproc (just a heuristic; could be optimized later on).
409       pr->u.p.parm3 = nproc;
410       pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
411       break;
412     } else {
413       /* too few chunks: switching to kmp_sch_dynamic_chunked */
414       schedule = kmp_sch_dynamic_chunked;
415       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
416                      "kmp_sch_dynamic_chunked\n",
417                      gtid));
418       goto dynamic_init;
419       break;
420     } // if
421   } // case
422 #endif
423   case kmp_sch_static_balanced: {
424     T init, limit;
425 
426     KD_TRACE(
427         100,
428         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
429          gtid));
430 
431     if (nproc > 1) {
432       T id = tid;
433 
434       if (tc < nproc) {
435         if (id < tc) {
436           init = id;
437           limit = id;
438           pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
439         } else {
440           pr->u.p.count = 1; /* means no more chunks to execute */
441           pr->u.p.parm1 = FALSE;
442           break;
443         }
444       } else {
445         T small_chunk = tc / nproc;
446         T extras = tc % nproc;
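        // Worked example (illustrative only): tc = 10 and nproc = 4 give
        // small_chunk = 2 and extras = 2, so the init/limit formulas below
        // assign iteration ranges [0,2], [3,5], [6,7], [8,9] to threads 0..3.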
447         init = id * small_chunk + (id < extras ? id : extras);
448         limit = init + small_chunk - (id < extras ? 0 : 1);
449         pr->u.p.parm1 = (id == nproc - 1);
450       }
451     } else {
452       if (tc > 0) {
453         init = 0;
454         limit = tc - 1;
455         pr->u.p.parm1 = TRUE;
456       } else {
457         // zero trip count
458         pr->u.p.count = 1; /* means no more chunks to execute */
459         pr->u.p.parm1 = FALSE;
460         break;
461       }
462     }
463 #if USE_ITT_BUILD
464     // Calculate chunk for metadata report
465     if (itt_need_metadata_reporting)
466       if (cur_chunk)
467         *cur_chunk = limit - init + 1;
468 #endif
469     if (st == 1) {
470       pr->u.p.lb = lb + init;
471       pr->u.p.ub = lb + limit;
472     } else {
473       // ub_tmp is the calculated upper bound; "ub" is the user-defined upper bound
474       T ub_tmp = lb + limit * st;
475       pr->u.p.lb = lb + init * st;
476       // adjust upper bound to "ub" if needed, so that MS lastprivate will match
477       // it exactly
478       if (st > 0) {
479         pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
480       } else {
481         pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
482       }
483     }
484     if (pr->flags.ordered) {
485       pr->u.p.ordered_lower = init;
486       pr->u.p.ordered_upper = limit;
487     }
488     break;
489   } // case
490   case kmp_sch_static_balanced_chunked: {
491     // similar to balanced, but chunk adjusted to multiple of simd width
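    // The per-thread iteration count is rounded up to a multiple of chunk by
    // the "& ~(chunk - 1)" mask below, which assumes chunk (the simd width
    // supplied by the compiler) is a power of two. Illustrative example:
    // tc = 100, nth = 8, chunk = 8: ceil(100 / 8) = 13, rounded up to 16.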
492     T nth = nproc;
493     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
494                    " -> falling-through to static_greedy\n",
495                    gtid));
496     schedule = kmp_sch_static_greedy;
497     if (nth > 1)
498       pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
499     else
500       pr->u.p.parm1 = tc;
501     break;
502   } // case
503   case kmp_sch_guided_simd:
504   case kmp_sch_guided_iterative_chunked: {
505     KD_TRACE(
506         100,
507         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
508          " case\n",
509          gtid));
510 
511     if (nproc > 1) {
512       if ((2L * chunk + 1) * nproc >= tc) {
513         /* chunk size too large, switch to dynamic */
514         schedule = kmp_sch_dynamic_chunked;
515         goto dynamic_init;
516       } else {
517         // when the remaining iterations drop below parm2, switch to dynamic
518         pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
519         *(double *)&pr->u.p.parm3 =
520             guided_flt_param / (double)nproc; // may occupy parm3 and parm4
521       }
522     } else {
523       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
524                      "kmp_sch_static_greedy\n",
525                      gtid));
526       schedule = kmp_sch_static_greedy;
527       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
528       KD_TRACE(
529           100,
530           ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
531            gtid));
532       pr->u.p.parm1 = tc;
533     } // if
534   } // case
535   break;
536   case kmp_sch_guided_analytical_chunked: {
537     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
538                    "kmp_sch_guided_analytical_chunked case\n",
539                    gtid));
540 
541     if (nproc > 1) {
542       if ((2L * chunk + 1) * nproc >= tc) {
543         /* chunk size too large, switch to dynamic */
544         schedule = kmp_sch_dynamic_chunked;
545         goto dynamic_init;
546       } else {
547         /* commonly used term: (2 nproc - 1)/(2 nproc) */
548         DBL x;
549 
550 #if KMP_USE_X87CONTROL
551         /* Linux* OS already has 64-bit computation by default for long double,
552            and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
553            Windows* OS on IA-32 architecture, we need to set precision to 64-bit
554            instead of the default 53-bit. Even though long double doesn't work
555            on Windows* OS on Intel(R) 64, the resulting lack of precision is not
556            expected to impact the correctness of the algorithm, but this has not
557            been mathematically proven. */
558         // save original FPCW and set precision to 64-bit, as
559         // Windows* OS on IA-32 architecture defaults to 53-bit
560         unsigned int oldFpcw = _control87(0, 0);
561         _control87(_PC_64, _MCW_PC); // 0,0x30000
562 #endif
563         /* value used for comparison in solver for cross-over point */
564         long double target = ((long double)chunk * 2 + 1) * nproc / tc;
565 
566         /* crossover point--chunk indexes equal to or greater than
567            this point switch to dynamic-style scheduling */
568         UT cross;
569 
570         /* commonly used term: (2 nproc - 1)/(2 nproc) */
571         x = 1.0 - 0.5 / (double)nproc;
572 
573 #ifdef KMP_DEBUG
574         { // test natural alignment
575           struct _test_a {
576             char a;
577             union {
578               char b;
579               DBL d;
580             };
581           } t;
582           ptrdiff_t natural_alignment =
583               (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
584           //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
585           // long)natural_alignment );
586           KMP_DEBUG_ASSERT(
587               (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
588         }
589 #endif // KMP_DEBUG
590 
591         /* save the term in thread private dispatch structure */
592         *(DBL *)&pr->u.p.parm3 = x;
593 
594         /* solve for the crossover point to the nearest integer i for which C_i
595            <= chunk */
596         {
597           UT left, right, mid;
598           long double p;
599 
600           /* estimate initial upper and lower bound */
601 
602           /* doesn't matter what value right is as long as it is positive, but
603              it affects performance of the solver */
604           right = 229;
605           p = __kmp_pow<UT>(x, right);
606           if (p > target) {
607             do {
608               p *= p;
609               right <<= 1;
610             } while (p > target && right < (1 << 27));
611             /* lower bound is previous (failed) estimate of upper bound */
612             left = right >> 1;
613           } else {
614             left = 0;
615           }
616 
617           /* bisection root-finding method */
618           while (left + 1 < right) {
619             mid = (left + right) / 2;
620             if (__kmp_pow<UT>(x, mid) > target) {
621               left = mid;
622             } else {
623               right = mid;
624             }
625           } // while
626           cross = right;
627         }
628         /* assert sanity of computed crossover point */
629         KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
630                    __kmp_pow<UT>(x, cross) <= target);
631 
632         /* save the crossover point in thread private dispatch structure */
633         pr->u.p.parm2 = cross;
634 
635 // C75803
636 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
637 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
638 #else
639 #define GUIDED_ANALYTICAL_WORKAROUND (x)
640 #endif
641         /* dynamic-style scheduling offset */
642         pr->u.p.count = tc -
643                         __kmp_dispatch_guided_remaining(
644                             tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
645                         cross * chunk;
646 #if KMP_USE_X87CONTROL
647         // restore FPCW
648         _control87(oldFpcw, _MCW_PC);
649 #endif
650       } // if
651     } else {
652       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
653                      "kmp_sch_static_greedy\n",
654                      gtid));
655       schedule = kmp_sch_static_greedy;
656       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
657       pr->u.p.parm1 = tc;
658     } // if
659   } // case
660   break;
661   case kmp_sch_static_greedy:
662     KD_TRACE(
663         100,
664         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
665          gtid));
666     pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
667     break;
668   case kmp_sch_static_chunked:
669   case kmp_sch_dynamic_chunked:
670   dynamic_init:
671     if (tc == 0)
672       break;
673     if (pr->u.p.parm1 <= 0)
674       pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
675     else if (pr->u.p.parm1 > tc)
676       pr->u.p.parm1 = tc;
677     // Store the total number of chunks to prevent integer overflow during
678     // bounds calculations in the get next chunk routine.
679     pr->u.p.parm2 = (tc / pr->u.p.parm1) + (tc % pr->u.p.parm1 ? 1 : 0);
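    // Illustrative example: tc = 10 with a chunk (parm1) of 3 gives
    // parm2 = 10 / 3 + 1 = 4 chunks (3 + 3 + 3 + 1 iterations).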
680     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
681                    "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
682                    gtid));
683     break;
684   case kmp_sch_trapezoidal: {
685     /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
686 
687     T parm1, parm2, parm3, parm4;
688     KD_TRACE(100,
689              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
690               gtid));
691 
692     parm1 = chunk;
693 
694     /* F : size of the first cycle */
695     parm2 = (tc / (2 * nproc));
696 
697     if (parm2 < 1) {
698       parm2 = 1;
699     }
700 
701     /* L : size of the last cycle.  Make sure the last cycle is not larger
702        than the first cycle. */
703     if (parm1 < 1) {
704       parm1 = 1;
705     } else if (parm1 > parm2) {
706       parm1 = parm2;
707     }
708 
709     /* N : number of cycles */
710     parm3 = (parm2 + parm1);
711     parm3 = (2 * tc + parm3 - 1) / parm3;
712 
713     if (parm3 < 2) {
714       parm3 = 2;
715     }
716 
717     /* sigma : decreasing incr of the trapezoid */
718     parm4 = (parm3 - 1);
719     parm4 = (parm2 - parm1) / parm4;
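    // Worked example (illustrative only): tc = 1000, nproc = 4, chunk = 1 give
    // parm2 = 125 (first chunk), parm1 = 1 (minimum chunk),
    // parm3 = (2000 + 126 - 1) / 126 = 16 chunks, parm4 = 124 / 15 = 8, so
    // successive chunk sizes are 125, 117, 109, ... decreasing by 8.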
720 
721     // pointless check, because parm4 >= 0 always
722     // if ( parm4 < 0 ) {
723     //    parm4 = 0;
724     //}
725 
726     pr->u.p.parm1 = parm1;
727     pr->u.p.parm2 = parm2;
728     pr->u.p.parm3 = parm3;
729     pr->u.p.parm4 = parm4;
730   } // case
731   break;
732 
733   default: {
734     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
735                 KMP_HNT(GetNewerLibrary), // Hint
736                 __kmp_msg_null // Variadic argument list terminator
737     );
738   } break;
739   } // switch
740   pr->schedule = schedule;
741 }
742 
743 #if KMP_USE_HIER_SCHED
744 template <typename T>
745 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
746                                              typename traits_t<T>::signed_t st);
747 template <>
748 inline void
749 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
750                                             kmp_int32 ub, kmp_int32 st) {
751   __kmp_dispatch_init_hierarchy<kmp_int32>(
752       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
753       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
754 }
755 template <>
756 inline void
757 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
758                                              kmp_uint32 ub, kmp_int32 st) {
759   __kmp_dispatch_init_hierarchy<kmp_uint32>(
760       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
761       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
762 }
763 template <>
764 inline void
765 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
766                                             kmp_int64 ub, kmp_int64 st) {
767   __kmp_dispatch_init_hierarchy<kmp_int64>(
768       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
769       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
770 }
771 template <>
772 inline void
773 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
774                                              kmp_uint64 ub, kmp_int64 st) {
775   __kmp_dispatch_init_hierarchy<kmp_uint64>(
776       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
777       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
778 }
779 
780 // free all the hierarchy scheduling memory associated with the team
781 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
782   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
783   for (int i = 0; i < num_disp_buff; ++i) {
784     // type does not matter here so use kmp_int32
785     auto sh =
786         reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
787             &team->t.t_disp_buffer[i]);
788     if (sh->hier) {
789       sh->hier->deallocate();
790       __kmp_free(sh->hier);
791     }
792   }
793 }
794 #endif
795 
796 // UT - unsigned flavor of T, ST - signed flavor of T,
797 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
798 template <typename T>
799 static void
800 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
801                     T ub, typename traits_t<T>::signed_t st,
802                     typename traits_t<T>::signed_t chunk, int push_ws) {
803   typedef typename traits_t<T>::unsigned_t UT;
804 
805   int active;
806   kmp_info_t *th;
807   kmp_team_t *team;
808   kmp_uint32 my_buffer_index;
809   dispatch_private_info_template<T> *pr;
810   dispatch_shared_info_template<T> volatile *sh;
811 
812   KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
813                    sizeof(dispatch_private_info));
814   KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
815                    sizeof(dispatch_shared_info));
816   __kmp_assert_valid_gtid(gtid);
817 
818   if (!TCR_4(__kmp_init_parallel))
819     __kmp_parallel_initialize();
820 
821   __kmp_resume_if_soft_paused();
822 
823 #if INCLUDE_SSC_MARKS
824   SSC_MARK_DISPATCH_INIT();
825 #endif
826 #ifdef KMP_DEBUG
827   typedef typename traits_t<T>::signed_t ST;
828   {
829     char *buff;
830     // create format specifiers before the debug output
831     buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
832                             "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
833                             traits_t<ST>::spec, traits_t<T>::spec,
834                             traits_t<T>::spec, traits_t<ST>::spec);
835     KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
836     __kmp_str_free(&buff);
837   }
838 #endif
839   /* setup data */
840   th = __kmp_threads[gtid];
841   team = th->th.th_team;
842   active = !team->t.t_serialized;
843   th->th.th_ident = loc;
844 
845   // Any half-decent optimizer will remove this test when the blocks are
846   // empty, since the macros expand to nothing when statistics are
847   // disabled.
848   if (schedule == __kmp_static) {
849     KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
850   } else {
851     KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
852   }
853 
854 #if KMP_USE_HIER_SCHED
855   // Initialize the scheduling hierarchy if requested in the OMP_SCHEDULE
856   // environment variable. Hierarchical scheduling does not work with
857   // ordered, so if ordered is detected, then revert to threaded scheduling.
858   bool ordered;
859   enum sched_type my_sched = schedule;
860   my_buffer_index = th->th.th_dispatch->th_disp_index;
861   pr = reinterpret_cast<dispatch_private_info_template<T> *>(
862       &th->th.th_dispatch
863            ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
864   my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
865   if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
866     my_sched =
867         (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
868   ordered = (kmp_ord_lower & my_sched);
869   if (pr->flags.use_hier) {
870     if (ordered) {
871       KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected.  "
872                      "Disabling hierarchical scheduling.\n",
873                      gtid));
874       pr->flags.use_hier = FALSE;
875     }
876   }
877   if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
878     // Don't use hierarchical for ordered parallel loops and don't
879     // use the runtime hierarchy if one was specified in the program
880     if (!ordered && !pr->flags.use_hier)
881       __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
882   }
883 #endif // KMP_USE_HIER_SCHED
884 
885 #if USE_ITT_BUILD
886   kmp_uint64 cur_chunk = chunk;
887   int itt_need_metadata_reporting =
888       __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
889       KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
890       team->t.t_active_level == 1;
891 #endif
892   if (!active) {
893     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
894         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
895   } else {
896     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
897                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
898 
899     my_buffer_index = th->th.th_dispatch->th_disp_index++;
900 
901     /* What happens when the number of threads changes? Do we need to resize the buffer? */
902     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
903         &th->th.th_dispatch
904              ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
905     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
906         &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
907     KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
908                   my_buffer_index));
909     if (sh->buffer_index != my_buffer_index) { // too many loops in progress?
910       KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d"
911                      " sh->buffer_index:%d\n",
912                      gtid, my_buffer_index, sh->buffer_index));
913       __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
914                              __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
915       // Note: KMP_WAIT() cannot be used here: buffer index and
916       // my_buffer_index are *always* 32-bit integers.
917       KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
918                      "sh->buffer_index:%d\n",
919                      gtid, my_buffer_index, sh->buffer_index));
920     }
921   }
922 
923   __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
924 #if USE_ITT_BUILD
925                                 &cur_chunk,
926 #endif
927                                 chunk, (T)th->th.th_team_nproc,
928                                 (T)th->th.th_info.ds.ds_tid);
929   if (active) {
930     if (pr->flags.ordered == 0) {
931       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
932       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
933     } else {
934       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
935       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
936     }
937     th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
938     th->th.th_dispatch->th_dispatch_sh_current =
939         CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
940 #if USE_ITT_BUILD
941     if (pr->flags.ordered) {
942       __kmp_itt_ordered_init(gtid);
943     }
944     // Report loop metadata
945     if (itt_need_metadata_reporting) {
946       // Only report metadata by primary thread of active team at level 1
947       kmp_uint64 schedtype = 0;
948       switch (schedule) {
949       case kmp_sch_static_chunked:
950       case kmp_sch_static_balanced: // Chunk is calculated in the switch above
951         break;
952       case kmp_sch_static_greedy:
953         cur_chunk = pr->u.p.parm1;
954         break;
955       case kmp_sch_dynamic_chunked:
956         schedtype = 1;
957         break;
958       case kmp_sch_guided_iterative_chunked:
959       case kmp_sch_guided_analytical_chunked:
960       case kmp_sch_guided_simd:
961         schedtype = 2;
962         break;
963       default:
964         // Should we put this case under "static"?
965         // case kmp_sch_static_steal:
966         schedtype = 3;
967         break;
968       }
969       __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
970     }
971 #if KMP_USE_HIER_SCHED
972     if (pr->flags.use_hier) {
973       pr->u.p.count = 0;
974       pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
975     }
976 #endif // KMP_USE_HIER_SCHED
977 #endif /* USE_ITT_BUILD */
978   }
979 
980 #ifdef KMP_DEBUG
981   {
982     char *buff;
983     // create format specifiers before the debug output
984     buff = __kmp_str_format(
985         "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
986         "lb:%%%s ub:%%%s"
987         " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
988         " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
989         traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
990         traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
991         traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
992         traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
993     KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
994                   pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
995                   pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
996                   pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
997     __kmp_str_free(&buff);
998   }
999 #endif
1000 #if OMPT_SUPPORT && OMPT_OPTIONAL
1001   if (ompt_enabled.ompt_callback_work) {
1002     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1003     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1004     ompt_callbacks.ompt_callback(ompt_callback_work)(
1005         ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
1006         &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
1007   }
1008 #endif
1009   KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
1010 }
1011 
1012 /* For ordered loops, either __kmp_dispatch_finish() should be called after
1013  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1014  * every chunk of iterations.  If the ordered section(s) were not executed
1015  * for this iteration (or every iteration in this chunk), we need to set the
1016  * ordered iteration counters so that the next thread can proceed. */
1017 template <typename UT>
1018 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
1019   typedef typename traits_t<UT>::signed_t ST;
1020   __kmp_assert_valid_gtid(gtid);
1021   kmp_info_t *th = __kmp_threads[gtid];
1022 
1023   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1024   if (!th->th.th_team->t.t_serialized) {
1025 
1026     dispatch_private_info_template<UT> *pr =
1027         reinterpret_cast<dispatch_private_info_template<UT> *>(
1028             th->th.th_dispatch->th_dispatch_pr_current);
1029     dispatch_shared_info_template<UT> volatile *sh =
1030         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1031             th->th.th_dispatch->th_dispatch_sh_current);
1032     KMP_DEBUG_ASSERT(pr);
1033     KMP_DEBUG_ASSERT(sh);
1034     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1035                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1036 
1037     if (pr->ordered_bumped) {
1038       KD_TRACE(
1039           1000,
1040           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1041            gtid));
1042       pr->ordered_bumped = 0;
1043     } else {
1044       UT lower = pr->u.p.ordered_lower;
1045 
1046 #ifdef KMP_DEBUG
1047       {
1048         char *buff;
1049         // create format specifiers before the debug output
1050         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1051                                 "ordered_iteration:%%%s lower:%%%s\n",
1052                                 traits_t<UT>::spec, traits_t<UT>::spec);
1053         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1054         __kmp_str_free(&buff);
1055       }
1056 #endif
1057 
1058       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1059                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1060       KMP_MB(); /* is this necessary? */
1061 #ifdef KMP_DEBUG
1062       {
1063         char *buff;
1064         // create format specifiers before the debug output
1065         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1066                                 "ordered_iteration:%%%s lower:%%%s\n",
1067                                 traits_t<UT>::spec, traits_t<UT>::spec);
1068         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1069         __kmp_str_free(&buff);
1070       }
1071 #endif
1072 
1073       test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1074     } // if
1075   } // if
1076   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1077 }
1078 
1079 #ifdef KMP_GOMP_COMPAT
1080 
1081 template <typename UT>
1082 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1083   typedef typename traits_t<UT>::signed_t ST;
1084   __kmp_assert_valid_gtid(gtid);
1085   kmp_info_t *th = __kmp_threads[gtid];
1086 
1087   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1088   if (!th->th.th_team->t.t_serialized) {
1089     dispatch_private_info_template<UT> *pr =
1090         reinterpret_cast<dispatch_private_info_template<UT> *>(
1091             th->th.th_dispatch->th_dispatch_pr_current);
1092     dispatch_shared_info_template<UT> volatile *sh =
1093         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1094             th->th.th_dispatch->th_dispatch_sh_current);
1095     KMP_DEBUG_ASSERT(pr);
1096     KMP_DEBUG_ASSERT(sh);
1097     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1098                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1099 
1100     UT lower = pr->u.p.ordered_lower;
1101     UT upper = pr->u.p.ordered_upper;
1102     UT inc = upper - lower + 1;
1103 
1104     if (pr->ordered_bumped == inc) {
1105       KD_TRACE(
1106           1000,
1107           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1108            gtid));
1109       pr->ordered_bumped = 0;
1110     } else {
1111       inc -= pr->ordered_bumped;
1112 
1113 #ifdef KMP_DEBUG
1114       {
1115         char *buff;
1116         // create format specifiers before the debug output
1117         buff = __kmp_str_format(
1118             "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1119             "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1120             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1121         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1122         __kmp_str_free(&buff);
1123       }
1124 #endif
1125 
1126       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1127                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1128 
1129       KMP_MB(); /* is this necessary? */
1130       KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1131                       "ordered_bumped to zero\n",
1132                       gtid));
1133       pr->ordered_bumped = 0;
1134 // TODO: check whether inc should be unsigned or signed
1135 #ifdef KMP_DEBUG
1136       {
1137         char *buff;
1138         // create format specifiers before the debug output
1139         buff = __kmp_str_format(
1140             "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1141             "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1142             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1143             traits_t<UT>::spec);
1144         KD_TRACE(1000,
1145                  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1146         __kmp_str_free(&buff);
1147       }
1148 #endif
1149 
1150       test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1151     }
1153   }
1154   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1155 }
1156 
1157 #endif /* KMP_GOMP_COMPAT */
1158 
1159 template <typename T>
1160 int __kmp_dispatch_next_algorithm(int gtid,
1161                                   dispatch_private_info_template<T> *pr,
1162                                   dispatch_shared_info_template<T> volatile *sh,
1163                                   kmp_int32 *p_last, T *p_lb, T *p_ub,
1164                                   typename traits_t<T>::signed_t *p_st, T nproc,
1165                                   T tid) {
1166   typedef typename traits_t<T>::unsigned_t UT;
1167   typedef typename traits_t<T>::signed_t ST;
1168   typedef typename traits_t<T>::floating_t DBL;
1169   int status = 0;
1170   bool last = false;
1171   T start;
1172   ST incr;
1173   UT limit, trip, init;
1174   kmp_info_t *th = __kmp_threads[gtid];
1175   kmp_team_t *team = th->th.th_team;
1176 
1177   KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1178                    &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1179   KMP_DEBUG_ASSERT(pr);
1180   KMP_DEBUG_ASSERT(sh);
1181   KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1182 #ifdef KMP_DEBUG
1183   {
1184     char *buff;
1185     // create format specifiers before the debug output
1186     buff =
1187         __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1188                          "sh:%%p nproc:%%%s tid:%%%s\n",
1189                          traits_t<T>::spec, traits_t<T>::spec);
1190     KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1191     __kmp_str_free(&buff);
1192   }
1193 #endif
1194 
1195   // zero trip count
1196   if (pr->u.p.tc == 0) {
1197     KD_TRACE(10,
1198              ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1199               "zero status:%d\n",
1200               gtid, status));
1201     return 0;
1202   }
1203 
1204   switch (pr->schedule) {
1205 #if KMP_STATIC_STEAL_ENABLED
1206   case kmp_sch_static_steal: {
1207     T chunk = pr->u.p.parm1;
1208     UT nchunks = pr->u.p.parm2;
1209     KD_TRACE(100,
1210              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1211               gtid));
1212 
1213     trip = pr->u.p.tc - 1;
1214 
1215     if (traits_t<T>::type_size > 4) {
1216       // use lock for 8-byte induction variable.
1217       // TODO (optional): check presence and use 16-byte CAS
1218       kmp_lock_t *lck = pr->u.p.steal_lock;
1219       KMP_DEBUG_ASSERT(lck != NULL);
1220       if (pr->u.p.count < (UT)pr->u.p.ub) {
1221         KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1222         __kmp_acquire_lock(lck, gtid);
1223         // try to get own chunk of iterations
1224         init = (pr->u.p.count)++;
1225         status = (init < (UT)pr->u.p.ub);
1226         __kmp_release_lock(lck, gtid);
1227       } else {
1228         status = 0; // no own chunks
1229       }
1230       if (!status) { // try to steal
1231         kmp_lock_t *lckv; // victim buffer's lock
1232         T while_limit = pr->u.p.parm3;
1233         T while_index = 0;
1234         int idx = (th->th.th_dispatch->th_disp_index - 1) %
1235                   __kmp_dispatch_num_buffers; // current loop index
1236         // note: victim thread can potentially execute another loop
1237         KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
1238         while ((!status) && (while_limit != ++while_index)) {
1239           dispatch_private_info_template<T> *v;
1240           T remaining;
1241           T victimId = pr->u.p.parm4;
1242           T oldVictimId = victimId ? victimId - 1 : nproc - 1;
1243           v = reinterpret_cast<dispatch_private_info_template<T> *>(
1244               &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1245           KMP_DEBUG_ASSERT(v);
1246           while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1247                  oldVictimId != victimId) {
1248             victimId = (victimId + 1) % nproc;
1249             v = reinterpret_cast<dispatch_private_info_template<T> *>(
1250                 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1251             KMP_DEBUG_ASSERT(v);
1252           }
1253           if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1254             continue; // try once more (nproc attempts in total)
1255           }
1256           if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1257             kmp_uint32 old = UNUSED;
1258             // try to steal whole range from inactive victim
1259             status = v->steal_flag.compare_exchange_strong(old, THIEF);
1260             if (status) {
1261               // initialize self buffer with victim's whole range of chunks
1262               T id = victimId;
1263               T small_chunk, extras;
1264               small_chunk = nchunks / nproc; // chunks per thread
1265               extras = nchunks % nproc;
1266               init = id * small_chunk + (id < extras ? id : extras);
1267               __kmp_acquire_lock(lck, gtid);
1268               pr->u.p.count = init + 1; // exclude one we execute immediately
1269               pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
1270               __kmp_release_lock(lck, gtid);
1271               pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
1272               // no need to reinitialize other thread invariants: lb, st, etc.
1273 #ifdef KMP_DEBUG
1274               {
1275                 char *buff;
1276                 // create format specifiers before the debug output
1277                 buff = __kmp_str_format(
1278                     "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1279                     "count:%%%s ub:%%%s\n",
1280                     traits_t<UT>::spec, traits_t<T>::spec);
1281                 KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
1282                 __kmp_str_free(&buff);
1283               }
1284 #endif
1285               // activate non-empty buffer and let others steal from us
1286               if (pr->u.p.count < (UT)pr->u.p.ub)
1287                 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1288               break;
1289             }
1290           }
1291           if (KMP_ATOMIC_LD_RLX(&v->steal_flag) != READY ||
1292               v->u.p.count >= (UT)v->u.p.ub) {
1293             pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
1294             continue; // no chunks to steal, try next victim
1295           }
1296           lckv = v->u.p.steal_lock;
1297           KMP_ASSERT(lckv != NULL);
1298           __kmp_acquire_lock(lckv, gtid);
1299           limit = v->u.p.ub; // keep initial ub
1300           if (v->u.p.count >= limit) {
1301             __kmp_release_lock(lckv, gtid);
1302             pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
1303             continue; // no chunks to steal, try next victim
1304           }
1305 
1306           // stealing succeeded; reduce victim's ub by 1/4 of the remaining chunks
1307           // TODO: is this heuristic good enough??
1308           remaining = limit - v->u.p.count;
1309           if (remaining > 7) {
1310             // steal 1/4 of remaining
1311             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1312             init = (v->u.p.ub -= (remaining >> 2));
1313           } else {
1314             // steal 1 chunk of 1..7 remaining
1315             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1316             init = (v->u.p.ub -= 1);
1317           }
1318           __kmp_release_lock(lckv, gtid);
1319 #ifdef KMP_DEBUG
1320           {
1321             char *buff;
1322             // create format specifiers before the debug output
1323             buff = __kmp_str_format(
1324                 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1325                 "count:%%%s ub:%%%s\n",
1326                 traits_t<UT>::spec, traits_t<UT>::spec);
1327             KD_TRACE(10, (buff, gtid, victimId, init, limit));
1328             __kmp_str_free(&buff);
1329           }
1330 #endif
1331           KMP_DEBUG_ASSERT(init + 1 <= limit);
1332           pr->u.p.parm4 = victimId; // remember victim to steal from
1333           status = 1;
1334           // now update own count and ub with stolen range excluding init chunk
1335           __kmp_acquire_lock(lck, gtid);
1336           pr->u.p.count = init + 1;
1337           pr->u.p.ub = limit;
1338           __kmp_release_lock(lck, gtid);
1339           // activate non-empty buffer and let others steal from us
1340           if (init + 1 < limit)
1341             KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1342         } // while (search for victim)
1343       } // if (try to find victim and steal)
1344     } else {
1345       // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1346       // as all operations on pair (count, ub) must be done atomically
1347       typedef union {
1348         struct {
1349           UT count;
1350           T ub;
1351         } p;
1352         kmp_int64 b;
1353       } union_i4;
1354       union_i4 vold, vnew;
1355       if (pr->u.p.count < (UT)pr->u.p.ub) {
1356         KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1357         vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1358         vnew.b = vold.b;
1359         vnew.p.count++; // get chunk from head of self range
1360         while (!KMP_COMPARE_AND_STORE_REL64(
1361             (volatile kmp_int64 *)&pr->u.p.count,
1362             *VOLATILE_CAST(kmp_int64 *) & vold.b,
1363             *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1364           KMP_CPU_PAUSE();
1365           vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1366           vnew.b = vold.b;
1367           vnew.p.count++;
1368         }
1369         init = vold.p.count;
1370         status = (init < (UT)vold.p.ub);
1371       } else {
1372         status = 0; // no own chunks
1373       }
1374       if (!status) { // try to steal
1375         T while_limit = pr->u.p.parm3;
1376         T while_index = 0;
1377         int idx = (th->th.th_dispatch->th_disp_index - 1) %
1378                   __kmp_dispatch_num_buffers; // current loop index
1379         // note: victim thread can potentially execute another loop
1380         KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
1381         while ((!status) && (while_limit != ++while_index)) {
1382           dispatch_private_info_template<T> *v;
1383           T remaining;
1384           T victimId = pr->u.p.parm4;
1385           T oldVictimId = victimId ? victimId - 1 : nproc - 1;
1386           v = reinterpret_cast<dispatch_private_info_template<T> *>(
1387               &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1388           KMP_DEBUG_ASSERT(v);
1389           while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1390                  oldVictimId != victimId) {
1391             victimId = (victimId + 1) % nproc;
1392             v = reinterpret_cast<dispatch_private_info_template<T> *>(
1393                 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1394             KMP_DEBUG_ASSERT(v);
1395           }
1396           if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1397             continue; // try once more (nproc attempts in total)
1398           }
1399           if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1400             kmp_uint32 old = UNUSED;
1401             // try to steal whole range from inactive victim
1402             status = v->steal_flag.compare_exchange_strong(old, THIEF);
1403             if (status) {
1404               // initialize self buffer with victim's whole range of chunks
1405               T id = victimId;
1406               T small_chunk, extras;
1407               small_chunk = nchunks / nproc; // chunks per thread
1408               extras = nchunks % nproc;
1409               init = id * small_chunk + (id < extras ? id : extras);
1410               vnew.p.count = init + 1;
1411               vnew.p.ub = init + small_chunk + (id < extras ? 1 : 0);
1412               // write pair (count, ub) at once atomically
1413 #if KMP_ARCH_X86
1414               KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b);
1415 #else
1416               *(volatile kmp_int64 *)(&pr->u.p.count) = vnew.b;
1417 #endif
1418               pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
1419               // no need to initialize other thread invariants: lb, st, etc.
1420 #ifdef KMP_DEBUG
1421               {
1422                 char *buff;
1423                 // create format specifiers before the debug output
1424                 buff = __kmp_str_format(
1425                     "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1426                     "count:%%%s ub:%%%s\n",
1427                     traits_t<UT>::spec, traits_t<T>::spec);
1428                 KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
1429                 __kmp_str_free(&buff);
1430               }
1431 #endif
1432               // activate non-empty buffer and let others steal from us
1433               if (pr->u.p.count < (UT)pr->u.p.ub)
1434                 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1435               break;
1436             }
1437           }
1438           while (1) { // CAS loop with check if victim still has enough chunks
1439             // many threads may be stealing concurrently from same victim
1440             vold.b = *(volatile kmp_int64 *)(&v->u.p.count);
1441             if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
1442                 vold.p.count >= (UT)vold.p.ub) {
1443               pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim id
1444               break; // no chunks to steal, try next victim
1445             }
1446             vnew.b = vold.b;
1447             remaining = vold.p.ub - vold.p.count;
1448             // try to steal 1/4 of remaining
1449             // TODO: is this heuristic good enough?
1450             if (remaining > 7) {
1451               vnew.p.ub -= remaining >> 2; // steal from tail of victim's range
1452             } else {
1453               vnew.p.ub -= 1; // steal 1 chunk of 1..7 remaining
1454             }
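                 // e.g. remaining = 16 -> steal 4 chunks (new ub = old ub - 4);
                 //      remaining = 5  -> steal a single chunk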
1455             KMP_DEBUG_ASSERT(vnew.p.ub * (UT)chunk <= trip);
1456             if (KMP_COMPARE_AND_STORE_REL64(
1457                     (volatile kmp_int64 *)&v->u.p.count,
1458                     *VOLATILE_CAST(kmp_int64 *) & vold.b,
1459                     *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1460               // stealing succeeded
1461 #ifdef KMP_DEBUG
1462               {
1463                 char *buff;
1464                 // create format specifiers before the debug output
1465                 buff = __kmp_str_format(
1466                     "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1467                     "count:%%%s ub:%%%s\n",
1468                     traits_t<T>::spec, traits_t<T>::spec);
1469                 KD_TRACE(10, (buff, gtid, victimId, vnew.p.ub, vold.p.ub));
1470                 __kmp_str_free(&buff);
1471               }
1472 #endif
1473               KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1474                                         vold.p.ub - vnew.p.ub);
1475               status = 1;
1476               pr->u.p.parm4 = victimId; // keep victim id
1477               // now update own count and ub
1478               init = vnew.p.ub;
1479               vold.p.count = init + 1;
1480 #if KMP_ARCH_X86
1481               KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1482 #else
1483               *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1484 #endif
1485               // activate non-empty buffer and let others steal from us
1486               if (vold.p.count < (UT)vold.p.ub)
1487                 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1488               break;
1489             } // if (check CAS result)
1490             KMP_CPU_PAUSE(); // CAS failed, retry the steal
1491           } // while (try to steal from particular victim)
1492         } // while (search for victim)
1493       } // if (try to find victim and steal)
1494     } // if (4-byte induction variable)
1495     if (!status) {
1496       *p_lb = 0;
1497       *p_ub = 0;
1498       if (p_st != NULL)
1499         *p_st = 0;
1500     } else {
1501       start = pr->u.p.lb;
1502       init *= chunk;
1503       limit = chunk + init - 1;
1504       incr = pr->u.p.st;
1505       KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1506 
1507       KMP_DEBUG_ASSERT(init <= trip);
1508       // keep track of done chunks for possible early exit from stealing
1509       // TODO: count executed chunks locally with rare update of shared location
1510       // test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1511       if ((last = (limit >= trip)) != 0)
1512         limit = trip;
1513       if (p_st != NULL)
1514         *p_st = incr;
1515 
1516       if (incr == 1) {
1517         *p_lb = start + init;
1518         *p_ub = start + limit;
1519       } else {
1520         *p_lb = start + init * incr;
1521         *p_ub = start + limit * incr;
1522       }
1523     } // if
1524     break;
1525   } // case
1526 #endif // KMP_STATIC_STEAL_ENABLED
1527   case kmp_sch_static_balanced: {
1528     KD_TRACE(
1529         10,
1530         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1531          gtid));
1532     /* check if thread has any iteration to do */
1533     if ((status = !pr->u.p.count) != 0) {
1534       pr->u.p.count = 1;
1535       *p_lb = pr->u.p.lb;
1536       *p_ub = pr->u.p.ub;
1537       last = (pr->u.p.parm1 != 0);
1538       if (p_st != NULL)
1539         *p_st = pr->u.p.st;
1540     } else { /* no iterations to do */
1541       pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1542     }
1543   } // case
1544   break;
1545   case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1546                                  merged here */
1547   case kmp_sch_static_chunked: {
1548     T parm1;
1549 
1550     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1551                    "kmp_sch_static_[greedy|chunked] case\n",
1552                    gtid));
1553     parm1 = pr->u.p.parm1;
1554 
1555     trip = pr->u.p.tc - 1;
1556     init = parm1 * (pr->u.p.count + tid);
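         // cyclic assignment: thread `tid` takes chunks tid, tid + nproc,
         // tid + 2 * nproc, ...; u.p.count records how many rounds this
         // thread has already completed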
1557 
1558     if ((status = (init <= trip)) != 0) {
1559       start = pr->u.p.lb;
1560       incr = pr->u.p.st;
1561       limit = parm1 + init - 1;
1562 
1563       if ((last = (limit >= trip)) != 0)
1564         limit = trip;
1565 
1566       if (p_st != NULL)
1567         *p_st = incr;
1568 
1569       pr->u.p.count += nproc;
1570 
1571       if (incr == 1) {
1572         *p_lb = start + init;
1573         *p_ub = start + limit;
1574       } else {
1575         *p_lb = start + init * incr;
1576         *p_ub = start + limit * incr;
1577       }
1578 
1579       if (pr->flags.ordered) {
1580         pr->u.p.ordered_lower = init;
1581         pr->u.p.ordered_upper = limit;
1582       } // if
1583     } // if
1584   } // case
1585   break;
1586 
1587   case kmp_sch_dynamic_chunked: {
1588     UT chunk_number;
1589     UT chunk_size = pr->u.p.parm1;
1590     UT nchunks = pr->u.p.parm2;
1591 
1592     KD_TRACE(
1593         100,
1594         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1595          gtid));
1596 
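         // claim the next chunk index from the shared counter; chunks are
         // handed out strictly in the order threads request them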
1597     chunk_number = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1598     status = (chunk_number < nchunks);
1599     if (!status) {
1600       *p_lb = 0;
1601       *p_ub = 0;
1602       if (p_st != NULL)
1603         *p_st = 0;
1604     } else {
1605       init = chunk_size * chunk_number;
1606       trip = pr->u.p.tc - 1;
1607       start = pr->u.p.lb;
1608       incr = pr->u.p.st;
1609 
1610       if ((last = (trip - init < (UT)chunk_size)))
1611         limit = trip;
1612       else
1613         limit = chunk_size + init - 1;
1614 
1615       if (p_st != NULL)
1616         *p_st = incr;
1617 
1618       if (incr == 1) {
1619         *p_lb = start + init;
1620         *p_ub = start + limit;
1621       } else {
1622         *p_lb = start + init * incr;
1623         *p_ub = start + limit * incr;
1624       }
1625 
1626       if (pr->flags.ordered) {
1627         pr->u.p.ordered_lower = init;
1628         pr->u.p.ordered_upper = limit;
1629       } // if
1630     } // if
1631   } // case
1632   break;
1633 
1634   case kmp_sch_guided_iterative_chunked: {
1635     T chunkspec = pr->u.p.parm1;
1636     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1637                    "iterative case\n",
1638                    gtid));
1639     trip = pr->u.p.tc;
1640     // Start atomic part of calculations
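         // Two regimes: while many iterations remain, claim a guided-size
         // chunk (a fraction of the remaining iterations, factor precomputed
         // in parm3) via CAS on the shared iteration counter; once fewer than
         // parm2 iterations remain, fall back to plain dynamic chunks of size
         // chunkspec.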
1641     while (1) {
1642       ST remaining; // signed, because can be < 0
1643       init = sh->u.s.iteration; // shared value
1644       remaining = trip - init;
1645       if (remaining <= 0) { // AC: need to compare with 0 first
1646         // nothing to do, don't try atomic op
1647         status = 0;
1648         break;
1649       }
1650       if ((T)remaining <
1651           pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1652         // use dynamic-style schedule
1653         // atomically increment iterations, get old value
1654         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1655                                  (ST)chunkspec);
1656         remaining = trip - init;
1657         if (remaining <= 0) {
1658           status = 0; // all iterations got by other threads
1659         } else {
1660           // got some iterations to work on
1661           status = 1;
1662           if ((T)remaining > chunkspec) {
1663             limit = init + chunkspec - 1;
1664           } else {
1665             last = true; // the last chunk
1666             limit = init + remaining - 1;
1667           } // if
1668         } // if
1669         break;
1670       } // if
1671       limit = init + (UT)((double)remaining *
1672                           *(double *)&pr->u.p.parm3); // divide by K*nproc
1673       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1674                                (ST)init, (ST)limit)) {
1675         // CAS was successful, chunk obtained
1676         status = 1;
1677         --limit;
1678         break;
1679       } // if
1680     } // while
1681     if (status != 0) {
1682       start = pr->u.p.lb;
1683       incr = pr->u.p.st;
1684       if (p_st != NULL)
1685         *p_st = incr;
1686       *p_lb = start + init * incr;
1687       *p_ub = start + limit * incr;
1688       if (pr->flags.ordered) {
1689         pr->u.p.ordered_lower = init;
1690         pr->u.p.ordered_upper = limit;
1691       } // if
1692     } else {
1693       *p_lb = 0;
1694       *p_ub = 0;
1695       if (p_st != NULL)
1696         *p_st = 0;
1697     } // if
1698   } // case
1699   break;
1700 
1701   case kmp_sch_guided_simd: {
1702     // same as iterative but curr-chunk adjusted to be multiple of given
1703     // chunk
1704     T chunk = pr->u.p.parm1;
1705     KD_TRACE(100,
1706              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1707               gtid));
1708     trip = pr->u.p.tc;
1709     // Start atomic part of calculations
1710     while (1) {
1711       ST remaining; // signed, because can be < 0
1712       init = sh->u.s.iteration; // shared value
1713       remaining = trip - init;
1714       if (remaining <= 0) { // AC: need to compare with 0 first
1715         status = 0; // nothing to do, don't try atomic op
1716         break;
1717       }
1718       KMP_DEBUG_ASSERT(init % chunk == 0);
1719       // compare with K*nproc*(chunk+1), K=2 by default
1720       if ((T)remaining < pr->u.p.parm2) {
1721         // use dynamic-style schedule
1722         // atomically increment iterations, get old value
1723         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1724                                  (ST)chunk);
1725         remaining = trip - init;
1726         if (remaining <= 0) {
1727           status = 0; // all iterations got by other threads
1728         } else {
1729           // got some iterations to work on
1730           status = 1;
1731           if ((T)remaining > chunk) {
1732             limit = init + chunk - 1;
1733           } else {
1734             last = true; // the last chunk
1735             limit = init + remaining - 1;
1736           } // if
1737         } // if
1738         break;
1739       } // if
1740       // divide by K*nproc
1741       UT span;
1742       __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3),
1743                          &span);
1744       UT rem = span % chunk;
1745       if (rem) // adjust so that span%chunk == 0
1746         span += chunk - rem;
1747       limit = init + span;
1748       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1749                                (ST)init, (ST)limit)) {
1750         // CAS was successful, chunk obtained
1751         status = 1;
1752         --limit;
1753         break;
1754       } // if
1755     } // while
1756     if (status != 0) {
1757       start = pr->u.p.lb;
1758       incr = pr->u.p.st;
1759       if (p_st != NULL)
1760         *p_st = incr;
1761       *p_lb = start + init * incr;
1762       *p_ub = start + limit * incr;
1763       if (pr->flags.ordered) {
1764         pr->u.p.ordered_lower = init;
1765         pr->u.p.ordered_upper = limit;
1766       } // if
1767     } else {
1768       *p_lb = 0;
1769       *p_ub = 0;
1770       if (p_st != NULL)
1771         *p_st = 0;
1772     } // if
1773   } // case
1774   break;
1775 
1776   case kmp_sch_guided_analytical_chunked: {
1777     T chunkspec = pr->u.p.parm1;
1778     UT chunkIdx;
1779 #if KMP_USE_X87CONTROL
1780     /* for storing original FPCW value for Windows* OS on
1781        IA-32 architecture 8-byte version */
1782     unsigned int oldFpcw;
1783     unsigned int fpcwSet = 0;
1784 #endif
1785     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1786                    "kmp_sch_guided_analytical_chunked case\n",
1787                    gtid));
1788 
1789     trip = pr->u.p.tc;
1790 
1791     KMP_DEBUG_ASSERT(nproc > 1);
1792     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1793 
1794     while (1) { /* this while loop is a safeguard against unexpected zero
1795                    chunk sizes */
1796       chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1797       if (chunkIdx >= (UT)pr->u.p.parm2) {
1798         --trip;
1799         /* use dynamic-style scheduling */
1800         init = chunkIdx * chunkspec + pr->u.p.count;
1801         /* need to verify init > 0 in case of overflow in the above
1802          * calculation */
1803         if ((status = (init > 0 && init <= trip)) != 0) {
1804           limit = init + chunkspec - 1;
1805 
1806           if ((last = (limit >= trip)) != 0)
1807             limit = trip;
1808         }
1809         break;
1810       } else {
1811 /* use exponential-style scheduling */
1812 /* The following check is to workaround the lack of long double precision on
1813    Windows* OS.
1814    This check works around the possible effect that init != 0 for chunkIdx == 0.
1815  */
1816 #if KMP_USE_X87CONTROL
1817         /* If we haven't already done so, save original
1818            FPCW and set precision to 64-bit, as Windows* OS
1819            on IA-32 architecture defaults to 53-bit */
1820         if (!fpcwSet) {
1821           oldFpcw = _control87(0, 0);
1822           _control87(_PC_64, _MCW_PC);
1823           fpcwSet = 0x30000;
1824         }
1825 #endif
1826         if (chunkIdx) {
1827           init = __kmp_dispatch_guided_remaining<T>(
1828               trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1829           KMP_DEBUG_ASSERT(init);
1830           init = trip - init;
1831         } else
1832           init = 0;
1833         limit = trip - __kmp_dispatch_guided_remaining<T>(
1834                            trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
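             // init and limit are the numbers of iterations covered by the
             // first chunkIdx and chunkIdx+1 chunks respectively, so this
             // thread's chunk is the half-open range [init, limit), i.e.
             // [init, limit-1] after the --limit below.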
1835         KMP_ASSERT(init <= limit);
1836         if (init < limit) {
1837           KMP_DEBUG_ASSERT(limit <= trip);
1838           --limit;
1839           status = 1;
1840           break;
1841         } // if
1842       } // if
1843     } // while (1)
1844 #if KMP_USE_X87CONTROL
1845     /* restore FPCW if necessary
1846        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1847     */
1848     if (fpcwSet && (oldFpcw & fpcwSet))
1849       _control87(oldFpcw, _MCW_PC);
1850 #endif
1851     if (status != 0) {
1852       start = pr->u.p.lb;
1853       incr = pr->u.p.st;
1854       if (p_st != NULL)
1855         *p_st = incr;
1856       *p_lb = start + init * incr;
1857       *p_ub = start + limit * incr;
1858       if (pr->flags.ordered) {
1859         pr->u.p.ordered_lower = init;
1860         pr->u.p.ordered_upper = limit;
1861       }
1862     } else {
1863       *p_lb = 0;
1864       *p_ub = 0;
1865       if (p_st != NULL)
1866         *p_st = 0;
1867     }
1868   } // case
1869   break;
1870 
1871   case kmp_sch_trapezoidal: {
1872     UT index;
1873     T parm2 = pr->u.p.parm2;
1874     T parm3 = pr->u.p.parm3;
1875     T parm4 = pr->u.p.parm4;
1876     KD_TRACE(100,
1877              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1878               gtid));
1879 
1880     index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1881 
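         // chunk sizes shrink linearly (chunk k spans parm2 - k*parm4
         // iterations), so init, the number of iterations preceding chunk
         // `index`, is the arithmetic series:
         //   sum_{k=0..index-1} (parm2 - k*parm4)
         //     = index * (2*parm2 - (index-1)*parm4) / 2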
1882     init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
1883     trip = pr->u.p.tc - 1;
1884 
1885     if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1886       *p_lb = 0;
1887       *p_ub = 0;
1888       if (p_st != NULL)
1889         *p_st = 0;
1890     } else {
1891       start = pr->u.p.lb;
1892       limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1893       incr = pr->u.p.st;
1894 
1895       if ((last = (limit >= trip)) != 0)
1896         limit = trip;
1897 
1898       if (p_st != NULL)
1899         *p_st = incr;
1900 
1901       if (incr == 1) {
1902         *p_lb = start + init;
1903         *p_ub = start + limit;
1904       } else {
1905         *p_lb = start + init * incr;
1906         *p_ub = start + limit * incr;
1907       }
1908 
1909       if (pr->flags.ordered) {
1910         pr->u.p.ordered_lower = init;
1911         pr->u.p.ordered_upper = limit;
1912       } // if
1913     } // if
1914   } // case
1915   break;
1916   default: {
1917     status = 0; // to avoid complaints on uninitialized variable use
1918     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1919                 KMP_HNT(GetNewerLibrary), // Hint
1920                 __kmp_msg_null // Variadic argument list terminator
1921     );
1922   } break;
1923   } // switch
1924   if (p_last)
1925     *p_last = last;
1926 #ifdef KMP_DEBUG
1927   if (pr->flags.ordered) {
1928     char *buff;
1929     // create format specifiers before the debug output
1930     buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1931                             "ordered_lower:%%%s ordered_upper:%%%s\n",
1932                             traits_t<UT>::spec, traits_t<UT>::spec);
1933     KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1934     __kmp_str_free(&buff);
1935   }
1936   {
1937     char *buff;
1938     // create format specifiers before the debug output
1939     buff = __kmp_str_format(
1940         "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1941         "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1942         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1943     KMP_DEBUG_ASSERT(p_last);
1944     KMP_DEBUG_ASSERT(p_st);
1945     KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1946     __kmp_str_free(&buff);
1947   }
1948 #endif
1949   return status;
1950 }
1951 
1952 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1953    work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1954    is not called. */
1955 #if OMPT_SUPPORT && OMPT_OPTIONAL
1956 #define OMPT_LOOP_END                                                          \
1957   if (status == 0) {                                                           \
1958     if (ompt_enabled.ompt_callback_work) {                                     \
1959       ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
1960       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
1961       ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
1962           ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
1963           &(task_info->task_data), 0, codeptr);                                \
1964     }                                                                          \
1965   }
1966 // TODO: implement count
1967 #else
1968 #define OMPT_LOOP_END // no-op
1969 #endif
1970 
1971 #if KMP_STATS_ENABLED
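     // Record the size of the chunk just handed out: t = (u - l) / i + 1 for a
     // forward chunk (mirrored when i < 0), or 0 if the chunk is empty or the
     // loop is finished.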
1972 #define KMP_STATS_LOOP_END                                                     \
1973   {                                                                            \
1974     kmp_int64 u, l, t, i;                                                      \
1975     l = (kmp_int64)(*p_lb);                                                    \
1976     u = (kmp_int64)(*p_ub);                                                    \
1977     i = (kmp_int64)(pr->u.p.st);                                               \
1978     if (status == 0) {                                                         \
1979       t = 0;                                                                   \
1980       KMP_POP_PARTITIONED_TIMER();                                             \
1981     } else if (i == 1) {                                                       \
1982       if (u >= l)                                                              \
1983         t = u - l + 1;                                                         \
1984       else                                                                     \
1985         t = 0;                                                                 \
1986     } else if (i < 0) {                                                        \
1987       if (l >= u)                                                              \
1988         t = (l - u) / (-i) + 1;                                                \
1989       else                                                                     \
1990         t = 0;                                                                 \
1991     } else {                                                                   \
1992       if (u >= l)                                                              \
1993         t = (u - l) / i + 1;                                                   \
1994       else                                                                     \
1995         t = 0;                                                                 \
1996     }                                                                          \
1997     KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
1998   }
1999 #else
2000 #define KMP_STATS_LOOP_END /* Nothing */
2001 #endif
2002 
2003 template <typename T>
2004 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
2005                                T *p_lb, T *p_ub,
2006                                typename traits_t<T>::signed_t *p_st
2007 #if OMPT_SUPPORT && OMPT_OPTIONAL
2008                                ,
2009                                void *codeptr
2010 #endif
2011 ) {
2012 
2013   typedef typename traits_t<T>::unsigned_t UT;
2014   typedef typename traits_t<T>::signed_t ST;
2015   // This is potentially slightly misleading: schedule(runtime) will appear here
2016   // even if the actual runtime schedule is static. (Which points out a
2017   // disadvantage of schedule(runtime): even when static scheduling is used, it
2018   // costs more than a compile-time choice to use static scheduling would.)
2019   KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
2020 
2021   int status;
2022   dispatch_private_info_template<T> *pr;
2023   __kmp_assert_valid_gtid(gtid);
2024   kmp_info_t *th = __kmp_threads[gtid];
2025   kmp_team_t *team = th->th.th_team;
2026 
2027   KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
2028   KD_TRACE(
2029       1000,
2030       ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
2031        gtid, p_lb, p_ub, p_st, p_last));
2032 
2033   if (team->t.t_serialized) {
2034     /* NOTE: serialize this dispatch because we are not at the active level */
2035     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2036         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
2037     KMP_DEBUG_ASSERT(pr);
2038 
2039     if ((status = (pr->u.p.tc != 0)) == 0) {
2040       *p_lb = 0;
2041       *p_ub = 0;
2042       //            if ( p_last != NULL )
2043       //                *p_last = 0;
2044       if (p_st != NULL)
2045         *p_st = 0;
2046       if (__kmp_env_consistency_check) {
2047         if (pr->pushed_ws != ct_none) {
2048           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2049         }
2050       }
2051     } else if (pr->flags.nomerge) {
2052       kmp_int32 last;
2053       T start;
2054       UT limit, trip, init;
2055       ST incr;
2056       T chunk = pr->u.p.parm1;
2057 
2058       KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
2059                      gtid));
2060 
2061       init = chunk * pr->u.p.count++;
2062       trip = pr->u.p.tc - 1;
2063 
2064       if ((status = (init <= trip)) == 0) {
2065         *p_lb = 0;
2066         *p_ub = 0;
2067         //                if ( p_last != NULL )
2068         //                    *p_last = 0;
2069         if (p_st != NULL)
2070           *p_st = 0;
2071         if (__kmp_env_consistency_check) {
2072           if (pr->pushed_ws != ct_none) {
2073             pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2074           }
2075         }
2076       } else {
2077         start = pr->u.p.lb;
2078         limit = chunk + init - 1;
2079         incr = pr->u.p.st;
2080 
2081         if ((last = (limit >= trip)) != 0) {
2082           limit = trip;
2083 #if KMP_OS_WINDOWS
2084           pr->u.p.last_upper = pr->u.p.ub;
2085 #endif /* KMP_OS_WINDOWS */
2086         }
2087         if (p_last != NULL)
2088           *p_last = last;
2089         if (p_st != NULL)
2090           *p_st = incr;
2091         if (incr == 1) {
2092           *p_lb = start + init;
2093           *p_ub = start + limit;
2094         } else {
2095           *p_lb = start + init * incr;
2096           *p_ub = start + limit * incr;
2097         }
2098 
2099         if (pr->flags.ordered) {
2100           pr->u.p.ordered_lower = init;
2101           pr->u.p.ordered_upper = limit;
2102 #ifdef KMP_DEBUG
2103           {
2104             char *buff;
2105             // create format specifiers before the debug output
2106             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2107                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
2108                                     traits_t<UT>::spec, traits_t<UT>::spec);
2109             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2110                             pr->u.p.ordered_upper));
2111             __kmp_str_free(&buff);
2112           }
2113 #endif
2114         } // if
2115       } // if
2116     } else {
2117       pr->u.p.tc = 0;
2118       *p_lb = pr->u.p.lb;
2119       *p_ub = pr->u.p.ub;
2120 #if KMP_OS_WINDOWS
2121       pr->u.p.last_upper = *p_ub;
2122 #endif /* KMP_OS_WINDOWS */
2123       if (p_last != NULL)
2124         *p_last = TRUE;
2125       if (p_st != NULL)
2126         *p_st = pr->u.p.st;
2127     } // if
2128 #ifdef KMP_DEBUG
2129     {
2130       char *buff;
2131       // create format specifiers before the debug output
2132       buff = __kmp_str_format(
2133           "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2134           "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
2135           traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2136       KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,
2137                     (p_last ? *p_last : 0), status));
2138       __kmp_str_free(&buff);
2139     }
2140 #endif
2141 #if INCLUDE_SSC_MARKS
2142     SSC_MARK_DISPATCH_NEXT();
2143 #endif
2144     OMPT_LOOP_END;
2145     KMP_STATS_LOOP_END;
2146     return status;
2147   } else {
2148     kmp_int32 last = 0;
2149     dispatch_shared_info_template<T> volatile *sh;
2150 
2151     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2152                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2153 
2154     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2155         th->th.th_dispatch->th_dispatch_pr_current);
2156     KMP_DEBUG_ASSERT(pr);
2157     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2158         th->th.th_dispatch->th_dispatch_sh_current);
2159     KMP_DEBUG_ASSERT(sh);
2160 
2161 #if KMP_USE_HIER_SCHED
2162     if (pr->flags.use_hier)
2163       status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2164     else
2165 #endif // KMP_USE_HIER_SCHED
2166       status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2167                                                 p_st, th->th.th_team_nproc,
2168                                                 th->th.th_info.ds.ds_tid);
2169     // status == 0: no more iterations to execute
2170     if (status == 0) {
2171       ST num_done;
2172       num_done = test_then_inc<ST>(&sh->u.s.num_done);
2173 #ifdef KMP_DEBUG
2174       {
2175         char *buff;
2176         // create format specifiers before the debug output
2177         buff = __kmp_str_format(
2178             "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2179             traits_t<ST>::spec);
2180         KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2181         __kmp_str_free(&buff);
2182       }
2183 #endif
2184 
2185 #if KMP_USE_HIER_SCHED
2186       pr->flags.use_hier = FALSE;
2187 #endif
2188       if (num_done == th->th.th_team_nproc - 1) {
2189 #if KMP_STATIC_STEAL_ENABLED
2190         if (pr->schedule == kmp_sch_static_steal) {
2191           int i;
2192           int idx = (th->th.th_dispatch->th_disp_index - 1) %
2193                     __kmp_dispatch_num_buffers; // current loop index
2194           // loop complete, safe to destroy locks used for stealing
2195           for (i = 0; i < th->th.th_team_nproc; ++i) {
2196             dispatch_private_info_template<T> *buf =
2197                 reinterpret_cast<dispatch_private_info_template<T> *>(
2198                     &team->t.t_dispatch[i].th_disp_buffer[idx]);
2199             KMP_ASSERT(buf->steal_flag == THIEF); // buffer must be inactive
2200             KMP_ATOMIC_ST_RLX(&buf->steal_flag, UNUSED);
2201             if (traits_t<T>::type_size > 4) {
2202               // destroy locks used for stealing
2203               kmp_lock_t *lck = buf->u.p.steal_lock;
2204               KMP_ASSERT(lck != NULL);
2205               __kmp_destroy_lock(lck);
2206               __kmp_free(lck);
2207               buf->u.p.steal_lock = NULL;
2208             }
2209           }
2210         }
2211 #endif
2212         /* NOTE: release shared buffer to be reused */
2213 
2214         KMP_MB(); /* Flush all pending memory write invalidates.  */
2215 
2216         sh->u.s.num_done = 0;
2217         sh->u.s.iteration = 0;
2218 
2219         /* TODO replace with general release procedure? */
2220         if (pr->flags.ordered) {
2221           sh->u.s.ordered_iteration = 0;
2222         }
2223 
2224         sh->buffer_index += __kmp_dispatch_num_buffers;
2225         KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2226                        gtid, sh->buffer_index));
2227 
2228         KMP_MB(); /* Flush all pending memory write invalidates.  */
2229 
2230       } // if
2231       if (__kmp_env_consistency_check) {
2232         if (pr->pushed_ws != ct_none) {
2233           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2234         }
2235       }
2236 
2237       th->th.th_dispatch->th_deo_fcn = NULL;
2238       th->th.th_dispatch->th_dxo_fcn = NULL;
2239       th->th.th_dispatch->th_dispatch_sh_current = NULL;
2240       th->th.th_dispatch->th_dispatch_pr_current = NULL;
2241     } // if (status == 0)
2242 #if KMP_OS_WINDOWS
2243     else if (last) {
2244       pr->u.p.last_upper = pr->u.p.ub;
2245     }
2246 #endif /* KMP_OS_WINDOWS */
2247     if (p_last != NULL && status != 0)
2248       *p_last = last;
2249   } // if
2250 
2251 #ifdef KMP_DEBUG
2252   {
2253     char *buff;
2254     // create format specifiers before the debug output
2255     buff = __kmp_str_format(
2256         "__kmp_dispatch_next: T#%%d normal case: "
2257         "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2258         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2259     KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2260                   (p_last ? *p_last : 0), status));
2261     __kmp_str_free(&buff);
2262   }
2263 #endif
2264 #if INCLUDE_SSC_MARKS
2265   SSC_MARK_DISPATCH_NEXT();
2266 #endif
2267   OMPT_LOOP_END;
2268   KMP_STATS_LOOP_END;
2269   return status;
2270 }
2271 
2272 template <typename T>
2273 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2274                                   kmp_int32 *plastiter, T *plower, T *pupper,
2275                                   typename traits_t<T>::signed_t incr) {
2276   typedef typename traits_t<T>::unsigned_t UT;
2277   kmp_uint32 team_id;
2278   kmp_uint32 nteams;
2279   UT trip_count;
2280   kmp_team_t *team;
2281   kmp_info_t *th;
2282 
2283   KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2284   KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2285 #ifdef KMP_DEBUG
2286   typedef typename traits_t<T>::signed_t ST;
2287   {
2288     char *buff;
2289     // create format specifiers before the debug output
2290     buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2291                             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2292                             traits_t<T>::spec, traits_t<T>::spec,
2293                             traits_t<ST>::spec, traits_t<T>::spec);
2294     KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2295     __kmp_str_free(&buff);
2296   }
2297 #endif
2298 
2299   if (__kmp_env_consistency_check) {
2300     if (incr == 0) {
2301       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2302                             loc);
2303     }
2304     if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2305       // The loop is illegal.
2306       // Some zero-trip loops are maintained by the compiler, e.g.:
2307       //   for(i=10;i<0;++i) // lower >= upper - run-time check
2308       //   for(i=0;i>10;--i) // lower <= upper - run-time check
2309       //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2310       //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2311       // Compiler does not check the following illegal loops:
2312       //   for(i=0;i<10;i+=incr) // where incr<0
2313       //   for(i=10;i>0;i-=incr) // where incr<0
2314       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2315     }
2316   }
2317   __kmp_assert_valid_gtid(gtid);
2318   th = __kmp_threads[gtid];
2319   team = th->th.th_team;
2320   KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2321   nteams = th->th.th_teams_size.nteams;
2322   team_id = team->t.t_master_tid;
2323   KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2324 
2325   // compute global trip count
2326   if (incr == 1) {
2327     trip_count = *pupper - *plower + 1;
2328   } else if (incr == -1) {
2329     trip_count = *plower - *pupper + 1;
2330   } else if (incr > 0) {
2331     // upper-lower can exceed the limit of signed type
2332     trip_count = (UT)(*pupper - *plower) / incr + 1;
2333   } else {
2334     trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2335   }
2336 
2337   if (trip_count <= nteams) {
2338     KMP_DEBUG_ASSERT(
2339         __kmp_static == kmp_sch_static_greedy ||
2340         __kmp_static ==
2341             kmp_sch_static_balanced); // Unknown static scheduling type.
2342     // only some teams get single iteration, others get nothing
2343     if (team_id < trip_count) {
2344       *pupper = *plower = *plower + team_id * incr;
2345     } else {
2346       *plower = *pupper + incr; // zero-trip loop
2347     }
2348     if (plastiter != NULL)
2349       *plastiter = (team_id == trip_count - 1);
2350   } else {
2351     if (__kmp_static == kmp_sch_static_balanced) {
2352       UT chunk = trip_count / nteams;
2353       UT extras = trip_count % nteams;
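           // the first `extras` teams get (chunk + 1) iterations, the rest get
           // `chunk`; e.g. trip_count = 10, nteams = 4 -> chunk = 2, extras = 2,
           // so the four teams receive 3, 3, 2 and 2 iterations respectively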
2354       *plower +=
2355           incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2356       *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2357       if (plastiter != NULL)
2358         *plastiter = (team_id == nteams - 1);
2359     } else {
2360       T chunk_inc_count =
2361           (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2362       T upper = *pupper;
2363       KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2364       // Unknown static scheduling type.
2365       *plower += team_id * chunk_inc_count;
2366       *pupper = *plower + chunk_inc_count - incr;
2367       // Check/correct bounds if needed
2368       if (incr > 0) {
2369         if (*pupper < *plower)
2370           *pupper = traits_t<T>::max_value;
2371         if (plastiter != NULL)
2372           *plastiter = *plower <= upper && *pupper > upper - incr;
2373         if (*pupper > upper)
2374           *pupper = upper; // tracker C73258
2375       } else {
2376         if (*pupper > *plower)
2377           *pupper = traits_t<T>::min_value;
2378         if (plastiter != NULL)
2379           *plastiter = *plower >= upper && *pupper < upper - incr;
2380         if (*pupper < upper)
2381           *pupper = upper; // tracker C73258
2382       }
2383     }
2384   }
2385 }
2386 
2387 //-----------------------------------------------------------------------------
2388 // Dispatch routines
2389 //    Transfer call to template< type T >
2390 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2391 //                         T lb, T ub, ST st, ST chunk )
2392 extern "C" {
2393 
2394 /*!
2395 @ingroup WORK_SHARING
2396 @{
2397 @param loc Source location
2398 @param gtid Global thread id
2399 @param schedule Schedule type
2400 @param lb  Lower bound
2401 @param ub  Upper bound
2402 @param st  Step (or increment if you prefer)
2403 @param chunk The chunk size to block with
2404 
2405 This function prepares the runtime to start a dynamically scheduled for loop,
2406 saving the loop arguments.
2407 These functions are all identical apart from the types of the arguments.
2408 */
2409 
2410 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2411                             enum sched_type schedule, kmp_int32 lb,
2412                             kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2413   KMP_DEBUG_ASSERT(__kmp_init_serial);
2414 #if OMPT_SUPPORT && OMPT_OPTIONAL
2415   OMPT_STORE_RETURN_ADDRESS(gtid);
2416 #endif
2417   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2418 }
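     /* Illustrative sketch (not part of the library) of how a compiler might
        lower "#pragma omp for schedule(dynamic, 4)" over iterations 0..n-1 onto
        these entry points; `loc`, `gtid`, `n` and `body()` stand in for
        whatever the outlining machinery actually provides:

          __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked,
                                 0, n - 1, 1, 4);
          kmp_int32 last, lb, ub, st;
          while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
            for (kmp_int32 i = lb; i <= ub; i += st)
              body(i); // hypothetical loop body
          }
     */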
2419 /*!
2420 See @ref __kmpc_dispatch_init_4
2421 */
2422 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2423                              enum sched_type schedule, kmp_uint32 lb,
2424                              kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2425   KMP_DEBUG_ASSERT(__kmp_init_serial);
2426 #if OMPT_SUPPORT && OMPT_OPTIONAL
2427   OMPT_STORE_RETURN_ADDRESS(gtid);
2428 #endif
2429   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2430 }
2431 
2432 /*!
2433 See @ref __kmpc_dispatch_init_4
2434 */
2435 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2436                             enum sched_type schedule, kmp_int64 lb,
2437                             kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2438   KMP_DEBUG_ASSERT(__kmp_init_serial);
2439 #if OMPT_SUPPORT && OMPT_OPTIONAL
2440   OMPT_STORE_RETURN_ADDRESS(gtid);
2441 #endif
2442   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2443 }
2444 
2445 /*!
2446 See @ref __kmpc_dispatch_init_4
2447 */
2448 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2449                              enum sched_type schedule, kmp_uint64 lb,
2450                              kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2451   KMP_DEBUG_ASSERT(__kmp_init_serial);
2452 #if OMPT_SUPPORT && OMPT_OPTIONAL
2453   OMPT_STORE_RETURN_ADDRESS(gtid);
2454 #endif
2455   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2456 }
2457 
2458 /*!
2459 See @ref __kmpc_dispatch_init_4
2460 
2461 These functions differ from the __kmpc_dispatch_init set in that they are
2462 called for the composite distribute parallel for construct. Thus, before the
2463 regular iterations are dispatched, the per-team iteration space must be calculated.
2464 
2465 These functions are all identical apart from the types of the arguments.
2466 */
2467 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2468                                  enum sched_type schedule, kmp_int32 *p_last,
2469                                  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2470                                  kmp_int32 chunk) {
2471   KMP_DEBUG_ASSERT(__kmp_init_serial);
2472 #if OMPT_SUPPORT && OMPT_OPTIONAL
2473   OMPT_STORE_RETURN_ADDRESS(gtid);
2474 #endif
2475   __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2476   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2477 }
2478 
2479 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2480                                   enum sched_type schedule, kmp_int32 *p_last,
2481                                   kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2482                                   kmp_int32 chunk) {
2483   KMP_DEBUG_ASSERT(__kmp_init_serial);
2484 #if OMPT_SUPPORT && OMPT_OPTIONAL
2485   OMPT_STORE_RETURN_ADDRESS(gtid);
2486 #endif
2487   __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2488   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2489 }
2490 
2491 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2492                                  enum sched_type schedule, kmp_int32 *p_last,
2493                                  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2494                                  kmp_int64 chunk) {
2495   KMP_DEBUG_ASSERT(__kmp_init_serial);
2496 #if OMPT_SUPPORT && OMPT_OPTIONAL
2497   OMPT_STORE_RETURN_ADDRESS(gtid);
2498 #endif
2499   __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2500   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2501 }
2502 
2503 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2504                                   enum sched_type schedule, kmp_int32 *p_last,
2505                                   kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2506                                   kmp_int64 chunk) {
2507   KMP_DEBUG_ASSERT(__kmp_init_serial);
2508 #if OMPT_SUPPORT && OMPT_OPTIONAL
2509   OMPT_STORE_RETURN_ADDRESS(gtid);
2510 #endif
2511   __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2512   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2513 }
2514 
2515 /*!
2516 @param loc Source code location
2517 @param gtid Global thread id
2518 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2519 otherwise
2520 @param p_lb   Pointer to the lower bound for the next chunk of work
2521 @param p_ub   Pointer to the upper bound for the next chunk of work
2522 @param p_st   Pointer to the stride for the next chunk of work
2523 @return one if there is work to be done, zero otherwise
2524 
2525 Get the next dynamically allocated chunk of work for this thread.
2526 If there is no more work, then lb, ub and stride need not be modified.
2527 */
2528 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2529                            kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2530 #if OMPT_SUPPORT && OMPT_OPTIONAL
2531   OMPT_STORE_RETURN_ADDRESS(gtid);
2532 #endif
2533   return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2534 #if OMPT_SUPPORT && OMPT_OPTIONAL
2535                                         ,
2536                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2537 #endif
2538   );
2539 }
2540 
2541 /*!
2542 See @ref __kmpc_dispatch_next_4
2543 */
2544 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2545                             kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2546                             kmp_int32 *p_st) {
2547 #if OMPT_SUPPORT && OMPT_OPTIONAL
2548   OMPT_STORE_RETURN_ADDRESS(gtid);
2549 #endif
2550   return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2551 #if OMPT_SUPPORT && OMPT_OPTIONAL
2552                                          ,
2553                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2554 #endif
2555   );
2556 }
2557 
2558 /*!
2559 See @ref __kmpc_dispatch_next_4
2560 */
2561 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2562                            kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2563 #if OMPT_SUPPORT && OMPT_OPTIONAL
2564   OMPT_STORE_RETURN_ADDRESS(gtid);
2565 #endif
2566   return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2567 #if OMPT_SUPPORT && OMPT_OPTIONAL
2568                                         ,
2569                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2570 #endif
2571   );
2572 }
2573 
2574 /*!
2575 See @ref __kmpc_dispatch_next_4
2576 */
2577 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2578                             kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2579                             kmp_int64 *p_st) {
2580 #if OMPT_SUPPORT && OMPT_OPTIONAL
2581   OMPT_STORE_RETURN_ADDRESS(gtid);
2582 #endif
2583   return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2584 #if OMPT_SUPPORT && OMPT_OPTIONAL
2585                                          ,
2586                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2587 #endif
2588   );
2589 }
2590 
2591 /*!
2592 @param loc Source code location
2593 @param gtid Global thread id
2594 
2595 Mark the end of a dynamic loop.
2596 */
2597 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2598   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2599 }
2600 
2601 /*!
2602 See @ref __kmpc_dispatch_fini_4
2603 */
2604 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2605   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2606 }
2607 
2608 /*!
2609 See @ref __kmpc_dispatch_fini_4
2610 */
2611 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2612   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2613 }
2614 
2615 /*!
2616 See @ref __kmpc_dispatch_fini_4
2617 */
2618 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2619   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2620 }
2621 /*! @} */
2622 
2623 //-----------------------------------------------------------------------------
2624 // Non-template routines from kmp_dispatch.cpp used in other sources
2625 
2626 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2627   return value == checker;
2628 }
2629 
2630 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2631   return value != checker;
2632 }
2633 
2634 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2635   return value < checker;
2636 }
2637 
2638 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2639   return value >= checker;
2640 }
2641 
2642 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2643   return value <= checker;
2644 }
2645 
2646 kmp_uint32
2647 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2648              kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2649              void *obj // Higher-level synchronization object, or NULL.
2650 ) {
2651   // note: we may not belong to a team at this point
2652   volatile kmp_uint32 *spin = spinner;
2653   kmp_uint32 check = checker;
2654   kmp_uint32 spins;
2655   kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2656   kmp_uint32 r;
2657 
2658   KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2659   KMP_INIT_YIELD(spins);
2660   // main wait spin loop
2661   while (!f(r = TCR_4(*spin), check)) {
2662     KMP_FSYNC_SPIN_PREPARE(obj);
2663     /* GEH - remove this since it was accidentally introduced when kmp_wait was
2664        split. It causes problems with infinite recursion because of exit lock */
2665     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2666         __kmp_abort_thread(); */
2667     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2668   }
2669   KMP_FSYNC_SPIN_ACQUIRED(obj);
2670   return r;
2671 }
2672 
2673 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
2674                       kmp_uint32 (*pred)(void *, kmp_uint32),
2675                       void *obj // Higher-level synchronization object, or NULL.
2676 ) {
2677   // note: we may not belong to a team at this point
2678   void *spin = spinner;
2679   kmp_uint32 check = checker;
2680   kmp_uint32 spins;
2681   kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2682 
2683   KMP_FSYNC_SPIN_INIT(obj, spin);
2684   KMP_INIT_YIELD(spins);
2685   // main wait spin loop
2686   while (!f(spin, check)) {
2687     KMP_FSYNC_SPIN_PREPARE(obj);
2688     /* if we have waited a bit, or are oversubscribed, yield */
2689     /* pause is in the following code */
2690     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2691   }
2692   KMP_FSYNC_SPIN_ACQUIRED(obj);
2693 }
2694 
2695 } // extern "C"
2696 
2697 #ifdef KMP_GOMP_COMPAT
2698 
2699 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2700                                enum sched_type schedule, kmp_int32 lb,
2701                                kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2702                                int push_ws) {
2703   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2704                                  push_ws);
2705 }
2706 
2707 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2708                                 enum sched_type schedule, kmp_uint32 lb,
2709                                 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2710                                 int push_ws) {
2711   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2712                                   push_ws);
2713 }
2714 
2715 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2716                                enum sched_type schedule, kmp_int64 lb,
2717                                kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2718                                int push_ws) {
2719   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2720                                  push_ws);
2721 }
2722 
2723 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2724                                 enum sched_type schedule, kmp_uint64 lb,
2725                                 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2726                                 int push_ws) {
2727   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2728                                   push_ws);
2729 }
2730 
2731 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2732   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2733 }
2734 
2735 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2736   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2737 }
2738 
2739 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2740   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2741 }
2742 
2743 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2744   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2745 }
2746 
2747 #endif /* KMP_GOMP_COMPAT */
2748 
2749 /* ------------------------------------------------------------------------ */
2750