1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 /* Dynamic scheduling initialization and dispatch.
14  *
 * NOTE: __kmp_nth is constant inside any dispatch loop, but it may change
 *       between parallel regions.  __kmp_max_nth is the largest value
 *       __kmp_nth may take; 1 is the smallest.
18  */
19 
20 #include "kmp.h"
21 #include "kmp_error.h"
22 #include "kmp_i18n.h"
23 #include "kmp_itt.h"
24 #include "kmp_stats.h"
25 #include "kmp_str.h"
26 #if KMP_USE_X87CONTROL
27 #include <float.h>
28 #endif
29 #include "kmp_lock.h"
30 #include "kmp_dispatch.h"
31 #if KMP_USE_HIER_SCHED
32 #include "kmp_dispatch_hier.h"
33 #endif
34 
35 #if OMPT_SUPPORT
36 #include "ompt-specific.h"
37 #endif
38 
39 /* ------------------------------------------------------------------------ */
40 /* ------------------------------------------------------------------------ */
41 
42 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43   kmp_info_t *th;
44 
45   KMP_DEBUG_ASSERT(gtid_ref);
46 
47   if (__kmp_env_consistency_check) {
48     th = __kmp_threads[*gtid_ref];
49     if (th->th.th_root->r.r_active &&
50         (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51 #if KMP_USE_DYNAMIC_LOCK
52       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53 #else
54       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55 #endif
56     }
57   }
58 }
59 
60 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61   kmp_info_t *th;
62 
63   if (__kmp_env_consistency_check) {
64     th = __kmp_threads[*gtid_ref];
65     if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66       __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67     }
68   }
69 }
70 
71 // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
72 static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
73                                          bool use_hier = false) {
74   // Pick up the nonmonotonic/monotonic bits from the scheduling type
75   // TODO: make nonmonotonic when static_steal is fixed
76   int monotonicity = SCHEDULE_MONOTONIC;
77 
  // Default to monotonic for executables compiled with OpenMP* 4.5 or
  // earlier compilers
80   if (loc->get_openmp_version() < 50)
81     monotonicity = SCHEDULE_MONOTONIC;
82 
83   if (use_hier || __kmp_force_monotonic)
84     monotonicity = SCHEDULE_MONOTONIC;
85   else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
86     monotonicity = SCHEDULE_NONMONOTONIC;
87   else if (SCHEDULE_HAS_MONOTONIC(schedule))
88     monotonicity = SCHEDULE_MONOTONIC;
89 
90   return monotonicity;
91 }
92 
// Initialize a dispatch_private_info_template<T> buffer for a particular
// schedule and chunk.  The loop description is found in lb (lower bound),
95 // ub (upper bound), and st (stride).  nproc is the number of threads relevant
96 // to the scheduling (often the number of threads in a team, but not always if
97 // hierarchical scheduling is used).  tid is the id of the thread calling
98 // the function within the group of nproc threads.  It will have a value
99 // between 0 and nproc - 1.  This is often just the thread id within a team, but
100 // is not necessarily the case when using hierarchical scheduling.
101 // loc is the source file location of the corresponding loop
102 // gtid is the global thread id
103 template <typename T>
104 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
105                                    dispatch_private_info_template<T> *pr,
106                                    enum sched_type schedule, T lb, T ub,
107                                    typename traits_t<T>::signed_t st,
108 #if USE_ITT_BUILD
109                                    kmp_uint64 *cur_chunk,
110 #endif
111                                    typename traits_t<T>::signed_t chunk,
112                                    T nproc, T tid) {
113   typedef typename traits_t<T>::unsigned_t UT;
114   typedef typename traits_t<T>::floating_t DBL;
115 
116   int active;
117   T tc;
118   kmp_info_t *th;
119   kmp_team_t *team;
120   int monotonicity;
121   bool use_hier;
122 
123 #ifdef KMP_DEBUG
124   typedef typename traits_t<T>::signed_t ST;
125   {
126     char *buff;
127     // create format specifiers before the debug output
128     buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
129                             "pr:%%p lb:%%%s ub:%%%s st:%%%s "
130                             "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
131                             traits_t<T>::spec, traits_t<T>::spec,
132                             traits_t<ST>::spec, traits_t<ST>::spec,
133                             traits_t<T>::spec, traits_t<T>::spec);
134     KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
135     __kmp_str_free(&buff);
136   }
137 #endif
138   /* setup data */
139   th = __kmp_threads[gtid];
140   team = th->th.th_team;
141   active = !team->t.t_serialized;
142 
143 #if USE_ITT_BUILD
144   int itt_need_metadata_reporting =
145       __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
146       KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
147       team->t.t_active_level == 1;
148 #endif
149 
150 #if KMP_USE_HIER_SCHED
151   use_hier = pr->flags.use_hier;
152 #else
153   use_hier = false;
154 #endif
155 
156   /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
157   monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
158   schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
159 
160   /* Pick up the nomerge/ordered bits from the scheduling type */
161   if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
162     pr->flags.nomerge = TRUE;
163     schedule =
164         (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
165   } else {
166     pr->flags.nomerge = FALSE;
167   }
168   pr->type_size = traits_t<T>::type_size; // remember the size of variables
169   if (kmp_ord_lower & schedule) {
170     pr->flags.ordered = TRUE;
171     schedule =
172         (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
173   } else {
174     pr->flags.ordered = FALSE;
175   }
176   // Ordered overrides nonmonotonic
177   if (pr->flags.ordered) {
178     monotonicity = SCHEDULE_MONOTONIC;
179   }
180 
181   if (schedule == kmp_sch_static) {
182     schedule = __kmp_static;
183   } else {
184     if (schedule == kmp_sch_runtime) {
185       // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
186       // not specified)
187       schedule = team->t.t_sched.r_sched_type;
188       monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
189       schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
190       // Detail the schedule if needed (global controls are differentiated
191       // appropriately)
192       if (schedule == kmp_sch_guided_chunked) {
193         schedule = __kmp_guided;
194       } else if (schedule == kmp_sch_static) {
195         schedule = __kmp_static;
196       }
197       // Use the chunk size specified by OMP_SCHEDULE (or default if not
198       // specified)
199       chunk = team->t.t_sched.chunk;
200 #if USE_ITT_BUILD
201       if (cur_chunk)
202         *cur_chunk = chunk;
203 #endif
204 #ifdef KMP_DEBUG
205       {
206         char *buff;
207         // create format specifiers before the debug output
208         buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
209                                 "schedule:%%d chunk:%%%s\n",
210                                 traits_t<ST>::spec);
211         KD_TRACE(10, (buff, gtid, schedule, chunk));
212         __kmp_str_free(&buff);
213       }
214 #endif
215     } else {
216       if (schedule == kmp_sch_guided_chunked) {
217         schedule = __kmp_guided;
218       }
219       if (chunk <= 0) {
220         chunk = KMP_DEFAULT_CHUNK;
221       }
222     }
223 
224     if (schedule == kmp_sch_auto) {
      // auto is mapped to a concrete schedule in __kmp_do_serial_initialize()
226       schedule = __kmp_auto;
227 #ifdef KMP_DEBUG
228       {
229         char *buff;
230         // create format specifiers before the debug output
231         buff = __kmp_str_format(
232             "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
233             "schedule:%%d chunk:%%%s\n",
234             traits_t<ST>::spec);
235         KD_TRACE(10, (buff, gtid, schedule, chunk));
236         __kmp_str_free(&buff);
237       }
238 #endif
239     }
240 #if KMP_STATIC_STEAL_ENABLED
241     // map nonmonotonic:dynamic to static steal
242     if (schedule == kmp_sch_dynamic_chunked) {
243       if (monotonicity == SCHEDULE_NONMONOTONIC)
244         schedule = kmp_sch_static_steal;
245     }
246 #endif
247     /* guided analytical not safe for too many threads */
248     if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
249       schedule = kmp_sch_guided_iterative_chunked;
250       KMP_WARNING(DispatchManyThreads);
251     }
252     if (schedule == kmp_sch_runtime_simd) {
253       // compiler provides simd_width in the chunk parameter
254       schedule = team->t.t_sched.r_sched_type;
255       monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
256       schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
257       // Detail the schedule if needed (global controls are differentiated
258       // appropriately)
259       if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
260           schedule == __kmp_static) {
261         schedule = kmp_sch_static_balanced_chunked;
262       } else {
263         if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
264           schedule = kmp_sch_guided_simd;
265         }
266         chunk = team->t.t_sched.chunk * chunk;
267       }
268 #if USE_ITT_BUILD
269       if (cur_chunk)
270         *cur_chunk = chunk;
271 #endif
272 #ifdef KMP_DEBUG
273       {
274         char *buff;
275         // create format specifiers before the debug output
276         buff = __kmp_str_format(
277             "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
278             " chunk:%%%s\n",
279             traits_t<ST>::spec);
280         KD_TRACE(10, (buff, gtid, schedule, chunk));
281         __kmp_str_free(&buff);
282       }
283 #endif
284     }
285     pr->u.p.parm1 = chunk;
286   }
287   KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
288               "unknown scheduling type");
289 
290   pr->u.p.count = 0;
291 
292   if (__kmp_env_consistency_check) {
293     if (st == 0) {
294       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
295                             (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
296     }
297   }
298   // compute trip count
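  // For illustration: lb=5, ub=17, st=4 gives tc = (17-5)/4 + 1 = 4
  // iterations {5, 9, 13, 17}; lb=10, ub=1, st=3 is a zero-trip loop (tc=0).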
299   if (st == 1) { // most common case
300     if (ub >= lb) {
301       tc = ub - lb + 1;
302     } else { // ub < lb
303       tc = 0; // zero-trip
304     }
305   } else if (st < 0) {
306     if (lb >= ub) {
307       // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
308       // where the division needs to be unsigned regardless of the result type
309       tc = (UT)(lb - ub) / (-st) + 1;
310     } else { // lb < ub
311       tc = 0; // zero-trip
312     }
313   } else { // st > 0
314     if (ub >= lb) {
315       // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
316       // where the division needs to be unsigned regardless of the result type
317       tc = (UT)(ub - lb) / st + 1;
318     } else { // ub < lb
319       tc = 0; // zero-trip
320     }
321   }
322 
323 #if KMP_STATS_ENABLED
324   if (KMP_MASTER_GTID(gtid)) {
325     KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
326   }
327 #endif
328 
329   pr->u.p.lb = lb;
330   pr->u.p.ub = ub;
331   pr->u.p.st = st;
332   pr->u.p.tc = tc;
333 
334 #if KMP_OS_WINDOWS
335   pr->u.p.last_upper = ub + st;
336 #endif /* KMP_OS_WINDOWS */
337 
  /* NOTE: only active parallel regions have active ordered sections */
339 
340   if (active) {
341     if (pr->flags.ordered) {
342       pr->ordered_bumped = 0;
343       pr->u.p.ordered_lower = 1;
344       pr->u.p.ordered_upper = 0;
345     }
346   }
347 
348   switch (schedule) {
349 #if (KMP_STATIC_STEAL_ENABLED)
350   case kmp_sch_static_steal: {
351     T ntc, init;
352 
353     KD_TRACE(100,
354              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
355               gtid));
356 
357     ntc = (tc % chunk ? 1 : 0) + tc / chunk;
358     if (nproc > 1 && ntc >= nproc) {
359       KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
360       T id = tid;
361       T small_chunk, extras;
362 
363       small_chunk = ntc / nproc;
364       extras = ntc % nproc;
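      // Each thread owns the contiguous range of chunk indices [init, ub):
      // the first 'extras' threads get small_chunk+1 chunks, the rest get
      // small_chunk.  E.g. ntc=10, nproc=4 gives threads 3, 3, 2 and 2 chunks.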
365 
366       init = id * small_chunk + (id < extras ? id : extras);
367       pr->u.p.count = init;
368       pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
369 
370       pr->u.p.parm2 = lb;
      // parm3 is the number of times to attempt stealing, which is
      // proportional to the number of chunks per thread, capped at nproc.
374       pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc);
375       pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
376       pr->u.p.st = st;
377       if (traits_t<T>::type_size > 4) {
378         // AC: TODO: check if 16-byte CAS available and use it to
379         // improve performance (probably wait for explicit request
380         // before spending time on this).
381         // For now use dynamically allocated per-thread lock,
382         // free memory in __kmp_dispatch_next when status==0.
383         KMP_DEBUG_ASSERT(pr->u.p.th_steal_lock == NULL);
384         pr->u.p.th_steal_lock =
385             (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
386         __kmp_init_lock(pr->u.p.th_steal_lock);
387       }
388       break;
389     } else {
390       /* too few chunks: switching to kmp_sch_dynamic_chunked */
391       schedule = kmp_sch_dynamic_chunked;
392       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
393                      "kmp_sch_dynamic_chunked\n",
394                      gtid));
395       goto dynamic_init;
396       break;
397     } // if
398   } // case
399 #endif
400   case kmp_sch_static_balanced: {
401     T init, limit;
402 
403     KD_TRACE(
404         100,
405         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
406          gtid));
407 
408     if (nproc > 1) {
409       T id = tid;
410 
411       if (tc < nproc) {
412         if (id < tc) {
413           init = id;
414           limit = id;
415           pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
416         } else {
417           pr->u.p.count = 1; /* means no more chunks to execute */
418           pr->u.p.parm1 = FALSE;
419           break;
420         }
421       } else {
422         T small_chunk = tc / nproc;
423         T extras = tc % nproc;
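        // Split the iterations as evenly as possible: the first 'extras'
        // threads get small_chunk+1 iterations, the rest get small_chunk.
        // E.g. tc=10, nproc=4 assigns iterations [0,2], [3,5], [6,7], [8,9].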
424         init = id * small_chunk + (id < extras ? id : extras);
425         limit = init + small_chunk - (id < extras ? 0 : 1);
426         pr->u.p.parm1 = (id == nproc - 1);
427       }
428     } else {
429       if (tc > 0) {
430         init = 0;
431         limit = tc - 1;
432         pr->u.p.parm1 = TRUE;
433       } else {
434         // zero trip count
435         pr->u.p.count = 1; /* means no more chunks to execute */
436         pr->u.p.parm1 = FALSE;
437         break;
438       }
439     }
440 #if USE_ITT_BUILD
441     // Calculate chunk for metadata report
442     if (itt_need_metadata_reporting)
443       if (cur_chunk)
444         *cur_chunk = limit - init + 1;
445 #endif
446     if (st == 1) {
447       pr->u.p.lb = lb + init;
448       pr->u.p.ub = lb + limit;
449     } else {
      // ub_tmp is the calculated upper bound; "ub" is the user-defined bound
451       T ub_tmp = lb + limit * st;
452       pr->u.p.lb = lb + init * st;
453       // adjust upper bound to "ub" if needed, so that MS lastprivate will match
454       // it exactly
455       if (st > 0) {
456         pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
457       } else {
458         pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
459       }
460     }
461     if (pr->flags.ordered) {
462       pr->u.p.ordered_lower = init;
463       pr->u.p.ordered_upper = limit;
464     }
465     break;
466   } // case
467   case kmp_sch_static_balanced_chunked: {
468     // similar to balanced, but chunk adjusted to multiple of simd width
469     T nth = nproc;
470     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
471                    " -> falling-through to static_greedy\n",
472                    gtid));
473     schedule = kmp_sch_static_greedy;
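    // Round each thread's share, ceil(tc/nth), up to a multiple of chunk (the
    // simd width); the "& ~(chunk - 1)" mask assumes chunk is a power of two.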
474     if (nth > 1)
475       pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
476     else
477       pr->u.p.parm1 = tc;
478     break;
479   } // case
480   case kmp_sch_guided_simd:
481   case kmp_sch_guided_iterative_chunked: {
482     KD_TRACE(
483         100,
484         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
485          " case\n",
486          gtid));
487 
488     if (nproc > 1) {
489       if ((2L * chunk + 1) * nproc >= tc) {
490         /* chunk size too large, switch to dynamic */
491         schedule = kmp_sch_dynamic_chunked;
492         goto dynamic_init;
493       } else {
        // when the remaining iterations drop below parm2, switch to dynamic
495         pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
496         *(double *)&pr->u.p.parm3 =
497             guided_flt_param / (double)nproc; // may occupy parm3 and parm4
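        // parm3 (possibly spilling into parm4) holds a double: the fraction
        // of the remaining iterations handed out per request, scaling as
        // 1/nproc.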
498       }
499     } else {
500       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
501                      "kmp_sch_static_greedy\n",
502                      gtid));
503       schedule = kmp_sch_static_greedy;
504       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
505       KD_TRACE(
506           100,
507           ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
508            gtid));
509       pr->u.p.parm1 = tc;
510     } // if
511   } // case
512   break;
513   case kmp_sch_guided_analytical_chunked: {
514     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
515                    "kmp_sch_guided_analytical_chunked case\n",
516                    gtid));
517 
518     if (nproc > 1) {
519       if ((2L * chunk + 1) * nproc >= tc) {
520         /* chunk size too large, switch to dynamic */
521         schedule = kmp_sch_dynamic_chunked;
522         goto dynamic_init;
523       } else {
524         /* commonly used term: (2 nproc - 1)/(2 nproc) */
525         DBL x;
526 
527 #if KMP_USE_X87CONTROL
528         /* Linux* OS already has 64-bit computation by default for long double,
529            and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
530            Windows* OS on IA-32 architecture, we need to set precision to 64-bit
531            instead of the default 53-bit. Even though long double doesn't work
532            on Windows* OS on Intel(R) 64, the resulting lack of precision is not
533            expected to impact the correctness of the algorithm, but this has not
534            been mathematically proven. */
535         // save original FPCW and set precision to 64-bit, as
536         // Windows* OS on IA-32 architecture defaults to 53-bit
537         unsigned int oldFpcw = _control87(0, 0);
538         _control87(_PC_64, _MCW_PC); // 0,0x30000
539 #endif
540         /* value used for comparison in solver for cross-over point */
541         long double target = ((long double)chunk * 2 + 1) * nproc / tc;
542 
543         /* crossover point--chunk indexes equal to or greater than
544            this point switch to dynamic-style scheduling */
545         UT cross;
546 
547         /* commonly used term: (2 nproc - 1)/(2 nproc) */
548         x = 1.0 - 0.5 / (double)nproc;
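        // Roughly tc * x^i iterations remain after i guided chunks, so the
        // crossover is the smallest i at which the guided chunk size,
        // about tc * x^i / (2 * nproc), would fall below the specified chunk,
        // i.e. the smallest i with x^i <= target.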
549 
550 #ifdef KMP_DEBUG
551         { // test natural alignment
552           struct _test_a {
553             char a;
554             union {
555               char b;
556               DBL d;
557             };
558           } t;
559           ptrdiff_t natural_alignment =
560               (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
561           //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
562           // long)natural_alignment );
563           KMP_DEBUG_ASSERT(
564               (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
565         }
566 #endif // KMP_DEBUG
567 
568         /* save the term in thread private dispatch structure */
569         *(DBL *)&pr->u.p.parm3 = x;
570 
571         /* solve for the crossover point to the nearest integer i for which C_i
572            <= chunk */
573         {
574           UT left, right, mid;
575           long double p;
576 
577           /* estimate initial upper and lower bound */
578 
579           /* doesn't matter what value right is as long as it is positive, but
580              it affects performance of the solver */
581           right = 229;
582           p = __kmp_pow<UT>(x, right);
583           if (p > target) {
584             do {
585               p *= p;
586               right <<= 1;
587             } while (p > target && right < (1 << 27));
588             /* lower bound is previous (failed) estimate of upper bound */
589             left = right >> 1;
590           } else {
591             left = 0;
592           }
593 
594           /* bisection root-finding method */
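          // Invariant (barring the 1<<27 cap above): pow(x, left) > target
          // and pow(x, right) <= target, so the bisection converges on the
          // smallest exponent whose power does not exceed target.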
595           while (left + 1 < right) {
596             mid = (left + right) / 2;
597             if (__kmp_pow<UT>(x, mid) > target) {
598               left = mid;
599             } else {
600               right = mid;
601             }
602           } // while
603           cross = right;
604         }
605         /* assert sanity of computed crossover point */
606         KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
607                    __kmp_pow<UT>(x, cross) <= target);
608 
609         /* save the crossover point in thread private dispatch structure */
610         pr->u.p.parm2 = cross;
611 
612 // C75803
613 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
614 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
615 #else
616 #define GUIDED_ANALYTICAL_WORKAROUND (x)
617 #endif
618         /* dynamic-style scheduling offset */
619         pr->u.p.count = tc -
620                         __kmp_dispatch_guided_remaining(
621                             tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
622                         cross * chunk;
623 #if KMP_USE_X87CONTROL
624         // restore FPCW
625         _control87(oldFpcw, _MCW_PC);
626 #endif
627       } // if
628     } else {
629       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
630                      "kmp_sch_static_greedy\n",
631                      gtid));
632       schedule = kmp_sch_static_greedy;
633       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
634       pr->u.p.parm1 = tc;
635     } // if
636   } // case
637   break;
638   case kmp_sch_static_greedy:
639     KD_TRACE(
640         100,
641         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
642          gtid));
643     pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
644     break;
645   case kmp_sch_static_chunked:
646   case kmp_sch_dynamic_chunked:
647   dynamic_init:
648     if (pr->u.p.parm1 <= 0)
649       pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
650     else if (pr->u.p.parm1 > tc)
651       pr->u.p.parm1 = tc;
652     // Store the total number of chunks to prevent integer overflow during
653     // bounds calculations in the get next chunk routine.
654     pr->u.p.parm2 = (tc / pr->u.p.parm1) + (tc % pr->u.p.parm1 ? 1 : 0);
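    // E.g. tc=10 with a chunk (parm1) of 3 gives parm2 = 4 chunks (3+3+3+1).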
655     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
656                    "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
657                    gtid));
658     break;
659   case kmp_sch_trapezoidal: {
660     /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
661 
662     T parm1, parm2, parm3, parm4;
663     KD_TRACE(100,
664              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
665               gtid));
666 
667     parm1 = chunk;
668 
669     /* F : size of the first cycle */
670     parm2 = (tc / (2 * nproc));
671 
672     if (parm2 < 1) {
673       parm2 = 1;
674     }
675 
676     /* L : size of the last cycle.  Make sure the last cycle is not larger
677        than the first cycle. */
678     if (parm1 < 1) {
679       parm1 = 1;
680     } else if (parm1 > parm2) {
681       parm1 = parm2;
682     }
683 
684     /* N : number of cycles */
685     parm3 = (parm2 + parm1);
686     parm3 = (2 * tc + parm3 - 1) / parm3;
687 
688     if (parm3 < 2) {
689       parm3 = 2;
690     }
691 
692     /* sigma : decreasing incr of the trapezoid */
693     parm4 = (parm3 - 1);
694     parm4 = (parm2 - parm1) / parm4;
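    // For illustration, tc=100, nproc=2, chunk=2 gives parm2=25, parm1=2,
    // parm3=(200+26)/27=8 and parm4=23/7=3, i.e. chunk sizes
    // 25,22,19,16,13,10,7,4 (sum 116 >= tc).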
695 
696     // pointless check, because parm4 >= 0 always
697     // if ( parm4 < 0 ) {
698     //    parm4 = 0;
699     //}
700 
701     pr->u.p.parm1 = parm1;
702     pr->u.p.parm2 = parm2;
703     pr->u.p.parm3 = parm3;
704     pr->u.p.parm4 = parm4;
705   } // case
706   break;
707 
708   default: {
709     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
710                 KMP_HNT(GetNewerLibrary), // Hint
711                 __kmp_msg_null // Variadic argument list terminator
712     );
713   } break;
714   } // switch
715   pr->schedule = schedule;
716 }
717 
718 #if KMP_USE_HIER_SCHED
719 template <typename T>
720 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
721                                              typename traits_t<T>::signed_t st);
722 template <>
723 inline void
724 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
725                                             kmp_int32 ub, kmp_int32 st) {
726   __kmp_dispatch_init_hierarchy<kmp_int32>(
727       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
728       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
729 }
730 template <>
731 inline void
732 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
733                                              kmp_uint32 ub, kmp_int32 st) {
734   __kmp_dispatch_init_hierarchy<kmp_uint32>(
735       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
736       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
737 }
738 template <>
739 inline void
740 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
741                                             kmp_int64 ub, kmp_int64 st) {
742   __kmp_dispatch_init_hierarchy<kmp_int64>(
743       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
744       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
745 }
746 template <>
747 inline void
748 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
749                                              kmp_uint64 ub, kmp_int64 st) {
750   __kmp_dispatch_init_hierarchy<kmp_uint64>(
751       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
752       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
753 }
754 
755 // free all the hierarchy scheduling memory associated with the team
756 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
757   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
758   for (int i = 0; i < num_disp_buff; ++i) {
759     // type does not matter here so use kmp_int32
760     auto sh =
761         reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
762             &team->t.t_disp_buffer[i]);
763     if (sh->hier) {
764       sh->hier->deallocate();
765       __kmp_free(sh->hier);
766     }
767   }
768 }
769 #endif
770 
771 // UT - unsigned flavor of T, ST - signed flavor of T,
772 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
773 template <typename T>
774 static void
775 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
776                     T ub, typename traits_t<T>::signed_t st,
777                     typename traits_t<T>::signed_t chunk, int push_ws) {
778   typedef typename traits_t<T>::unsigned_t UT;
779 
780   int active;
781   kmp_info_t *th;
782   kmp_team_t *team;
783   kmp_uint32 my_buffer_index;
784   dispatch_private_info_template<T> *pr;
785   dispatch_shared_info_template<T> volatile *sh;
786 
787   KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
788                    sizeof(dispatch_private_info));
789   KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
790                    sizeof(dispatch_shared_info));
791   __kmp_assert_valid_gtid(gtid);
792 
793   if (!TCR_4(__kmp_init_parallel))
794     __kmp_parallel_initialize();
795 
796   __kmp_resume_if_soft_paused();
797 
798 #if INCLUDE_SSC_MARKS
799   SSC_MARK_DISPATCH_INIT();
800 #endif
801 #ifdef KMP_DEBUG
802   typedef typename traits_t<T>::signed_t ST;
803   {
804     char *buff;
805     // create format specifiers before the debug output
806     buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
807                             "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
808                             traits_t<ST>::spec, traits_t<T>::spec,
809                             traits_t<T>::spec, traits_t<ST>::spec);
810     KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
811     __kmp_str_free(&buff);
812   }
813 #endif
814   /* setup data */
815   th = __kmp_threads[gtid];
816   team = th->th.th_team;
817   active = !team->t.t_serialized;
818   th->th.th_ident = loc;
819 
  // Any half-decent optimizer will remove this test when the blocks are empty
  // since the macros expand to nothing when statistics are disabled.
823   if (schedule == __kmp_static) {
824     KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
825   } else {
826     KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
827   }
828 
829 #if KMP_USE_HIER_SCHED
  // Initialize the scheduling hierarchy if requested in the OMP_SCHEDULE
  // environment variable.  Hierarchical scheduling does not work with
  // ordered, so if ordered is detected, revert to threaded scheduling.
833   bool ordered;
834   enum sched_type my_sched = schedule;
835   my_buffer_index = th->th.th_dispatch->th_disp_index;
836   pr = reinterpret_cast<dispatch_private_info_template<T> *>(
837       &th->th.th_dispatch
838            ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
839   my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
840   if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
841     my_sched =
842         (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
843   ordered = (kmp_ord_lower & my_sched);
844   if (pr->flags.use_hier) {
845     if (ordered) {
846       KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected.  "
847                      "Disabling hierarchical scheduling.\n",
848                      gtid));
849       pr->flags.use_hier = FALSE;
850     }
851   }
852   if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
853     // Don't use hierarchical for ordered parallel loops and don't
854     // use the runtime hierarchy if one was specified in the program
855     if (!ordered && !pr->flags.use_hier)
856       __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
857   }
858 #endif // KMP_USE_HIER_SCHED
859 
860 #if USE_ITT_BUILD
861   kmp_uint64 cur_chunk = chunk;
862   int itt_need_metadata_reporting =
863       __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
864       KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
865       team->t.t_active_level == 1;
866 #endif
867   if (!active) {
868     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
869         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
870   } else {
871     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
872                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
873 
874     my_buffer_index = th->th.th_dispatch->th_disp_index++;
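    // Each loop takes the next slot in a ring of __kmp_dispatch_num_buffers
    // buffers; the wait further below ensures that the selected slot has been
    // released by the loop that used it previously.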
875 
876     /* What happens when number of threads changes, need to resize buffer? */
877     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
878         &th->th.th_dispatch
879              ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
880     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
881         &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
882     KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
883                   my_buffer_index));
884   }
885 
886   __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
887 #if USE_ITT_BUILD
888                                 &cur_chunk,
889 #endif
890                                 chunk, (T)th->th.th_team_nproc,
891                                 (T)th->th.th_info.ds.ds_tid);
892   if (active) {
893     if (pr->flags.ordered == 0) {
894       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
895       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
896     } else {
897       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
898       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
899     }
900   }
901 
902   if (active) {
    /* Wait until the shared buffer's index reaches my_buffer_index, i.e. the
     * buffer is free for this loop to use */
905 
906     KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
907                    "sh->buffer_index:%d\n",
908                    gtid, my_buffer_index, sh->buffer_index));
909     __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
910                            __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    // Note: KMP_WAIT() cannot be used here: buffer index and
    // my_buffer_index are *always* 32-bit integers.
913     KMP_MB(); /* is this necessary? */
914     KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
915                    "sh->buffer_index:%d\n",
916                    gtid, my_buffer_index, sh->buffer_index));
917 
918     th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
919     th->th.th_dispatch->th_dispatch_sh_current =
920         CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
921 #if USE_ITT_BUILD
922     if (pr->flags.ordered) {
923       __kmp_itt_ordered_init(gtid);
924     }
925     // Report loop metadata
926     if (itt_need_metadata_reporting) {
927       // Only report metadata by primary thread of active team at level 1
928       kmp_uint64 schedtype = 0;
929       switch (schedule) {
930       case kmp_sch_static_chunked:
931       case kmp_sch_static_balanced: // Chunk is calculated in the switch above
932         break;
933       case kmp_sch_static_greedy:
934         cur_chunk = pr->u.p.parm1;
935         break;
936       case kmp_sch_dynamic_chunked:
937         schedtype = 1;
938         break;
939       case kmp_sch_guided_iterative_chunked:
940       case kmp_sch_guided_analytical_chunked:
941       case kmp_sch_guided_simd:
942         schedtype = 2;
943         break;
944       default:
945         // Should we put this case under "static"?
946         // case kmp_sch_static_steal:
947         schedtype = 3;
948         break;
949       }
950       __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
951     }
952 #if KMP_USE_HIER_SCHED
953     if (pr->flags.use_hier) {
954       pr->u.p.count = 0;
955       pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
956     }
#endif // KMP_USE_HIER_SCHED
958 #endif /* USE_ITT_BUILD */
959   }
960 
961 #ifdef KMP_DEBUG
962   {
963     char *buff;
964     // create format specifiers before the debug output
965     buff = __kmp_str_format(
966         "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
967         "lb:%%%s ub:%%%s"
968         " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
969         " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
970         traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
971         traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
972         traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
973         traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
974     KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
975                   pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
976                   pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
977                   pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
978     __kmp_str_free(&buff);
979   }
980 #endif
981 #if (KMP_STATIC_STEAL_ENABLED)
  // It cannot be guaranteed that after execution of a loop with some other
  // schedule kind all the parm3 variables will contain the same value. Even if
  // all parm3 were the same, there would still be bad cases, such as using 0
  // and 1 rather than a program-lifetime increment, so a dedicated variable is
  // required: 'static_steal_counter' is used.
987   if (pr->schedule == kmp_sch_static_steal) {
    // Other threads will inspect this variable when searching for a victim.
    // It acts as a flag showing that other threads may steal from this thread
    // from now on.
991     volatile T *p = &pr->u.p.static_steal_counter;
992     *p = *p + 1;
993   }
994 #endif // ( KMP_STATIC_STEAL_ENABLED )
995 
996 #if OMPT_SUPPORT && OMPT_OPTIONAL
997   if (ompt_enabled.ompt_callback_work) {
998     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
999     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1000     ompt_callbacks.ompt_callback(ompt_callback_work)(
1001         ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
1002         &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
1003   }
1004 #endif
1005   KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
1006 }
1007 
1008 /* For ordered loops, either __kmp_dispatch_finish() should be called after
1009  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1010  * every chunk of iterations.  If the ordered section(s) were not executed
1011  * for this iteration (or every iteration in this chunk), we need to set the
1012  * ordered iteration counters so that the next thread can proceed. */
1013 template <typename UT>
1014 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
1015   typedef typename traits_t<UT>::signed_t ST;
1016   __kmp_assert_valid_gtid(gtid);
1017   kmp_info_t *th = __kmp_threads[gtid];
1018 
1019   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1020   if (!th->th.th_team->t.t_serialized) {
1021 
1022     dispatch_private_info_template<UT> *pr =
1023         reinterpret_cast<dispatch_private_info_template<UT> *>(
1024             th->th.th_dispatch->th_dispatch_pr_current);
1025     dispatch_shared_info_template<UT> volatile *sh =
1026         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1027             th->th.th_dispatch->th_dispatch_sh_current);
1028     KMP_DEBUG_ASSERT(pr);
1029     KMP_DEBUG_ASSERT(sh);
1030     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1031                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1032 
1033     if (pr->ordered_bumped) {
1034       KD_TRACE(
1035           1000,
1036           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1037            gtid));
1038       pr->ordered_bumped = 0;
1039     } else {
1040       UT lower = pr->u.p.ordered_lower;
1041 
1042 #ifdef KMP_DEBUG
1043       {
1044         char *buff;
1045         // create format specifiers before the debug output
1046         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1047                                 "ordered_iteration:%%%s lower:%%%s\n",
1048                                 traits_t<UT>::spec, traits_t<UT>::spec);
1049         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1050         __kmp_str_free(&buff);
1051       }
1052 #endif
1053 
1054       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1055                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1056       KMP_MB(); /* is this necessary? */
1057 #ifdef KMP_DEBUG
1058       {
1059         char *buff;
1060         // create format specifiers before the debug output
1061         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1062                                 "ordered_iteration:%%%s lower:%%%s\n",
1063                                 traits_t<UT>::spec, traits_t<UT>::spec);
1064         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1065         __kmp_str_free(&buff);
1066       }
1067 #endif
1068 
1069       test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1070     } // if
1071   } // if
1072   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1073 }
1074 
1075 #ifdef KMP_GOMP_COMPAT
1076 
1077 template <typename UT>
1078 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1079   typedef typename traits_t<UT>::signed_t ST;
1080   __kmp_assert_valid_gtid(gtid);
1081   kmp_info_t *th = __kmp_threads[gtid];
1082 
1083   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1084   if (!th->th.th_team->t.t_serialized) {
1085     //        int cid;
1086     dispatch_private_info_template<UT> *pr =
1087         reinterpret_cast<dispatch_private_info_template<UT> *>(
1088             th->th.th_dispatch->th_dispatch_pr_current);
1089     dispatch_shared_info_template<UT> volatile *sh =
1090         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1091             th->th.th_dispatch->th_dispatch_sh_current);
1092     KMP_DEBUG_ASSERT(pr);
1093     KMP_DEBUG_ASSERT(sh);
1094     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1095                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1096 
1097     //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1098     UT lower = pr->u.p.ordered_lower;
1099     UT upper = pr->u.p.ordered_upper;
1100     UT inc = upper - lower + 1;
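    // inc = number of ordered iterations in this chunk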
1101 
1102     if (pr->ordered_bumped == inc) {
1103       KD_TRACE(
1104           1000,
1105           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1106            gtid));
1107       pr->ordered_bumped = 0;
1108     } else {
1109       inc -= pr->ordered_bumped;
1110 
1111 #ifdef KMP_DEBUG
1112       {
1113         char *buff;
1114         // create format specifiers before the debug output
1115         buff = __kmp_str_format(
1116             "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1117             "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1118             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1119         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1120         __kmp_str_free(&buff);
1121       }
1122 #endif
1123 
1124       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1125                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1126 
1127       KMP_MB(); /* is this necessary? */
1128       KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1129                       "ordered_bumped to zero\n",
1130                       gtid));
1131       pr->ordered_bumped = 0;
1132 //!!!!! TODO check if the inc should be unsigned, or signed???
1133 #ifdef KMP_DEBUG
1134       {
1135         char *buff;
1136         // create format specifiers before the debug output
1137         buff = __kmp_str_format(
1138             "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1139             "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1140             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1141             traits_t<UT>::spec);
1142         KD_TRACE(1000,
1143                  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1144         __kmp_str_free(&buff);
1145       }
1146 #endif
1147 
1148       test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1149     }
1150     //        }
1151   }
1152   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1153 }
1154 
1155 #endif /* KMP_GOMP_COMPAT */
1156 
1157 template <typename T>
1158 int __kmp_dispatch_next_algorithm(int gtid,
1159                                   dispatch_private_info_template<T> *pr,
1160                                   dispatch_shared_info_template<T> volatile *sh,
1161                                   kmp_int32 *p_last, T *p_lb, T *p_ub,
1162                                   typename traits_t<T>::signed_t *p_st, T nproc,
1163                                   T tid) {
1164   typedef typename traits_t<T>::unsigned_t UT;
1165   typedef typename traits_t<T>::signed_t ST;
1166   typedef typename traits_t<T>::floating_t DBL;
1167   int status = 0;
1168   bool last = false;
1169   T start;
1170   ST incr;
1171   UT limit, trip, init;
1172   kmp_info_t *th = __kmp_threads[gtid];
1173   kmp_team_t *team = th->th.th_team;
1174 
1175   KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1176                    &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1177   KMP_DEBUG_ASSERT(pr);
1178   KMP_DEBUG_ASSERT(sh);
1179   KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1180 #ifdef KMP_DEBUG
1181   {
1182     char *buff;
1183     // create format specifiers before the debug output
1184     buff =
1185         __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1186                          "sh:%%p nproc:%%%s tid:%%%s\n",
1187                          traits_t<T>::spec, traits_t<T>::spec);
1188     KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1189     __kmp_str_free(&buff);
1190   }
1191 #endif
1192 
1193   // zero trip count
1194   if (pr->u.p.tc == 0) {
1195     KD_TRACE(10,
1196              ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1197               "zero status:%d\n",
1198               gtid, status));
1199     return 0;
1200   }
1201 
1202   switch (pr->schedule) {
1203 #if (KMP_STATIC_STEAL_ENABLED)
1204   case kmp_sch_static_steal: {
1205     T chunk = pr->u.p.parm1;
1206 
1207     KD_TRACE(100,
1208              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1209               gtid));
1210 
1211     trip = pr->u.p.tc - 1;
1212 
1213     if (traits_t<T>::type_size > 4) {
1214       // use lock for 8-byte and CAS for 4-byte induction
1215       // variable. TODO (optional): check and use 16-byte CAS
1216       kmp_lock_t *lck = pr->u.p.th_steal_lock;
1217       KMP_DEBUG_ASSERT(lck != NULL);
1218       if (pr->u.p.count < (UT)pr->u.p.ub) {
1219         __kmp_acquire_lock(lck, gtid);
1220         // try to get own chunk of iterations
1221         init = (pr->u.p.count)++;
1222         status = (init < (UT)pr->u.p.ub);
1223         __kmp_release_lock(lck, gtid);
1224       } else {
1225         status = 0; // no own chunks
1226       }
1227       if (!status) { // try to steal
1228         kmp_info_t **other_threads = team->t.t_threads;
1229         T while_limit = pr->u.p.parm3;
1230         T while_index = 0;
1231         T id = pr->u.p.static_steal_counter; // loop id
1232         int idx = (th->th.th_dispatch->th_disp_index - 1) %
1233                   __kmp_dispatch_num_buffers; // current loop index
1234         // note: victim thread can potentially execute another loop
1235         // TODO: algorithm of searching for a victim
1236         // should be cleaned up and measured
1237         while ((!status) && (while_limit != ++while_index)) {
1238           dispatch_private_info_template<T> *victim;
1239           T remaining;
1240           T victimIdx = pr->u.p.parm4;
1241           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1242           victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1243               &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1244           KMP_DEBUG_ASSERT(victim);
1245           while ((victim == pr || id != victim->u.p.static_steal_counter) &&
1246                  oldVictimIdx != victimIdx) {
1247             victimIdx = (victimIdx + 1) % nproc;
1248             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1249                 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1250             KMP_DEBUG_ASSERT(victim);
1251           }
1252           if (victim == pr || id != victim->u.p.static_steal_counter) {
1253             continue; // try once more (nproc attempts in total)
1254             // no victim is ready yet to participate in stealing
1255             // because no victim passed kmp_init_dispatch yet
1256           }
1257           if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1258             pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1259             continue; // not enough chunks to steal, goto next victim
1260           }
1261 
1262           lck = victim->u.p.th_steal_lock;
1263           KMP_ASSERT(lck != NULL);
1264           __kmp_acquire_lock(lck, gtid);
1265           limit = victim->u.p.ub; // keep initial ub
1266           if (victim->u.p.count >= limit ||
1267               (remaining = limit - victim->u.p.count) < 2) {
1268             __kmp_release_lock(lck, gtid);
1269             pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1270             continue; // not enough chunks to steal
1271           }
1272           // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or
1273           // by 1
1274           if (remaining > 3) {
1275             // steal 1/4 of remaining
1276             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1277             init = (victim->u.p.ub -= (remaining >> 2));
1278           } else {
1279             // steal 1 chunk of 2 or 3 remaining
1280             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1281             init = (victim->u.p.ub -= 1);
1282           }
1283           __kmp_release_lock(lck, gtid);
1284 
1285           KMP_DEBUG_ASSERT(init + 1 <= limit);
1286           pr->u.p.parm4 = victimIdx; // remember victim to steal from
1287           status = 1;
1288           while_index = 0;
          // now update own count and ub with the stolen range, excluding the
          // init chunk claimed by this call
1290           __kmp_acquire_lock(pr->u.p.th_steal_lock, gtid);
1291           pr->u.p.count = init + 1;
1292           pr->u.p.ub = limit;
1293           __kmp_release_lock(pr->u.p.th_steal_lock, gtid);
1294         } // while (search for victim)
1295       } // if (try to find victim and steal)
1296     } else {
1297       // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1298       typedef union {
1299         struct {
1300           UT count;
1301           T ub;
1302         } p;
1303         kmp_int64 b;
1304       } union_i4;
1305       // All operations on 'count' or 'ub' must be combined atomically
1306       // together.
1307       {
1308         union_i4 vold, vnew;
1309         vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1310         vnew = vold;
1311         vnew.p.count++;
1312         while (!KMP_COMPARE_AND_STORE_ACQ64(
1313             (volatile kmp_int64 *)&pr->u.p.count,
1314             *VOLATILE_CAST(kmp_int64 *) & vold.b,
1315             *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1316           KMP_CPU_PAUSE();
1317           vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1318           vnew = vold;
1319           vnew.p.count++;
1320         }
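        // vold holds the snapshot the successful CAS was based on, so
        // vold.p.count is the chunk index just claimed and vold.p.ub the
        // upper bound from that same snapshot.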
1321         vnew = vold;
1322         init = vnew.p.count;
1323         status = (init < (UT)vnew.p.ub);
1324       }
1325 
1326       if (!status) {
1327         kmp_info_t **other_threads = team->t.t_threads;
1328         T while_limit = pr->u.p.parm3;
1329         T while_index = 0;
1330         T id = pr->u.p.static_steal_counter; // loop id
1331         int idx = (th->th.th_dispatch->th_disp_index - 1) %
1332                   __kmp_dispatch_num_buffers; // current loop index
1333         // note: victim thread can potentially execute another loop
1334         // TODO: algorithm of searching for a victim
1335         // should be cleaned up and measured
1336         while ((!status) && (while_limit != ++while_index)) {
1337           dispatch_private_info_template<T> *victim;
1338           union_i4 vold, vnew;
1339           T remaining;
1340           T victimIdx = pr->u.p.parm4;
1341           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1342           victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1343               &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1344           KMP_DEBUG_ASSERT(victim);
1345           while ((victim == pr || id != victim->u.p.static_steal_counter) &&
1346                  oldVictimIdx != victimIdx) {
1347             victimIdx = (victimIdx + 1) % nproc;
1348             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1349                 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1350             KMP_DEBUG_ASSERT(victim);
1351           }
1352           if (victim == pr || id != victim->u.p.static_steal_counter) {
1353             continue; // try once more (nproc attempts in total)
1354             // no victim is ready yet to participate in stealing
1355             // because no victim passed kmp_init_dispatch yet
1356           }
1357           pr->u.p.parm4 = victimIdx; // new victim found
1358           while (1) { // CAS loop if victim has enough chunks to steal
1359             vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1360             vnew = vold;
1361 
1362             KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1363             if (vnew.p.count >= (UT)vnew.p.ub ||
1364                 (remaining = vnew.p.ub - vnew.p.count) < 2) {
1365               pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1366               break; // not enough chunks to steal, goto next victim
1367             }
1368             if (remaining > 3) {
1369               // try to steal 1/4 of remaining
1370               vnew.p.ub -= remaining >> 2;
1371             } else {
1372               vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1373             }
1374             KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1375             // TODO: Should this be acquire or release?
1376             if (KMP_COMPARE_AND_STORE_ACQ64(
1377                     (volatile kmp_int64 *)&victim->u.p.count,
1378                     *VOLATILE_CAST(kmp_int64 *) & vold.b,
1379                     *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1380               // stealing succeeded
1381               KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1382                                         vold.p.ub - vnew.p.ub);
1383               status = 1;
1384               while_index = 0;
1385               // now update own count and ub
1386               init = vnew.p.ub;
1387               vold.p.count = init + 1;
1388 #if KMP_ARCH_X86
1389               KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1390 #else
1391               *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1392 #endif
1393               break;
1394             } // if (check CAS result)
1395             KMP_CPU_PAUSE(); // CAS failed, repeatedly attempt
1396           } // while (try to steal from particular victim)
1397         } // while (search for victim)
1398       } // if (try to find victim and steal)
1399     } // if (4-byte induction variable)
1400     if (!status) {
1401       *p_lb = 0;
1402       *p_ub = 0;
1403       if (p_st != NULL)
1404         *p_st = 0;
1405     } else {
1406       start = pr->u.p.parm2;
1407       init *= chunk;
1408       limit = chunk + init - 1;
1409       incr = pr->u.p.st;
1410       KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1411 
1412       KMP_DEBUG_ASSERT(init <= trip);
1413       if ((last = (limit >= trip)) != 0)
1414         limit = trip;
1415       if (p_st != NULL)
1416         *p_st = incr;
1417 
1418       if (incr == 1) {
1419         *p_lb = start + init;
1420         *p_ub = start + limit;
1421       } else {
1422         *p_lb = start + init * incr;
1423         *p_ub = start + limit * incr;
1424       }
1425 
1426       if (pr->flags.ordered) {
1427         pr->u.p.ordered_lower = init;
1428         pr->u.p.ordered_upper = limit;
1429       } // if
1430     } // if
1431     break;
1432   } // case
1433 #endif // ( KMP_STATIC_STEAL_ENABLED )
1434   case kmp_sch_static_balanced: {
1435     KD_TRACE(
1436         10,
1437         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1438          gtid));
    /* check if the thread has any iterations to do */
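    /* pr->u.p.count acts as a one-shot flag here: 0 until the thread claims
       its single precomputed [lb, ub] range, 1 afterwards */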
1440     if ((status = !pr->u.p.count) != 0) {
1441       pr->u.p.count = 1;
1442       *p_lb = pr->u.p.lb;
1443       *p_ub = pr->u.p.ub;
1444       last = (pr->u.p.parm1 != 0);
1445       if (p_st != NULL)
1446         *p_st = pr->u.p.st;
1447     } else { /* no iterations to do */
1448       pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1449     }
1450   } // case
1451   break;
1452   case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1453                                  merged here */
1454   case kmp_sch_static_chunked: {
1455     T parm1;
1456 
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_static_[greedy|chunked] case\n",
                   gtid));
1460     parm1 = pr->u.p.parm1;
1461 
1462     trip = pr->u.p.tc - 1;
1463     init = parm1 * (pr->u.p.count + tid);
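    // chunks are dealt out round-robin: thread tid takes chunks tid,
    // tid + nproc, tid + 2*nproc, ... (count advances by nproc further below)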
1464 
1465     if ((status = (init <= trip)) != 0) {
1466       start = pr->u.p.lb;
1467       incr = pr->u.p.st;
1468       limit = parm1 + init - 1;
1469 
1470       if ((last = (limit >= trip)) != 0)
1471         limit = trip;
1472 
1473       if (p_st != NULL)
1474         *p_st = incr;
1475 
1476       pr->u.p.count += nproc;
1477 
1478       if (incr == 1) {
1479         *p_lb = start + init;
1480         *p_ub = start + limit;
1481       } else {
1482         *p_lb = start + init * incr;
1483         *p_ub = start + limit * incr;
1484       }
1485 
1486       if (pr->flags.ordered) {
1487         pr->u.p.ordered_lower = init;
1488         pr->u.p.ordered_upper = limit;
1489       } // if
1490     } // if
1491   } // case
1492   break;
1493 
1494   case kmp_sch_dynamic_chunked: {
1495     UT chunk_number;
1496     UT chunk_size = pr->u.p.parm1;
1497     UT nchunks = pr->u.p.parm2;
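    // parm2 holds the total number of chunks in the loop (precomputed at init
    // time), so a claimed chunk_number >= nchunks means the loop is exhausted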
1498 
1499     KD_TRACE(
1500         100,
1501         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1502          gtid));
1503 
1504     chunk_number = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1505     status = (chunk_number < nchunks);
1506     if (!status) {
1507       *p_lb = 0;
1508       *p_ub = 0;
1509       if (p_st != NULL)
1510         *p_st = 0;
1511     } else {
1512       init = chunk_size * chunk_number;
1513       trip = pr->u.p.tc - 1;
1514       start = pr->u.p.lb;
1515       incr = pr->u.p.st;
1516 
1517       if ((last = (trip - init < (UT)chunk_size)))
1518         limit = trip;
1519       else
1520         limit = chunk_size + init - 1;
1521 
1522       if (p_st != NULL)
1523         *p_st = incr;
1524 
1525       if (incr == 1) {
1526         *p_lb = start + init;
1527         *p_ub = start + limit;
1528       } else {
1529         *p_lb = start + init * incr;
1530         *p_ub = start + limit * incr;
1531       }
1532 
1533       if (pr->flags.ordered) {
1534         pr->u.p.ordered_lower = init;
1535         pr->u.p.ordered_upper = limit;
1536       } // if
1537     } // if
1538   } // case
1539   break;
1540 
1541   case kmp_sch_guided_iterative_chunked: {
1542     T chunkspec = pr->u.p.parm1;
1543     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1544                    "iterative case\n",
1545                    gtid));
1546     trip = pr->u.p.tc;
1547     // Start atomic part of calculations
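    // Guided scheme: while many iterations remain, each successful grab takes
    // roughly remaining/(K*nproc) iterations (parm3 stores that reciprocal
    // factor bit-wise as a double); once remaining drops below parm2
    // (~K*nproc*(chunk+1), K=2 by default), fall back to plain dynamic chunks
    // of size chunkspec.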
1548     while (1) {
      ST remaining; // signed, because it can be < 0
1550       init = sh->u.s.iteration; // shared value
1551       remaining = trip - init;
1552       if (remaining <= 0) { // AC: need to compare with 0 first
1553         // nothing to do, don't try atomic op
1554         status = 0;
1555         break;
1556       }
1557       if ((T)remaining <
1558           pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1559         // use dynamic-style schedule
1560         // atomically increment iterations, get old value
1561         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1562                                  (ST)chunkspec);
1563         remaining = trip - init;
1564         if (remaining <= 0) {
1565           status = 0; // all iterations got by other threads
1566         } else {
1567           // got some iterations to work on
1568           status = 1;
1569           if ((T)remaining > chunkspec) {
1570             limit = init + chunkspec - 1;
1571           } else {
1572             last = true; // the last chunk
1573             limit = init + remaining - 1;
1574           } // if
1575         } // if
1576         break;
1577       } // if
1578       limit = init + (UT)((double)remaining *
1579                           *(double *)&pr->u.p.parm3); // divide by K*nproc
1580       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1581                                (ST)init, (ST)limit)) {
1582         // CAS was successful, chunk obtained
1583         status = 1;
1584         --limit;
1585         break;
1586       } // if
1587     } // while
1588     if (status != 0) {
1589       start = pr->u.p.lb;
1590       incr = pr->u.p.st;
1591       if (p_st != NULL)
1592         *p_st = incr;
1593       *p_lb = start + init * incr;
1594       *p_ub = start + limit * incr;
1595       if (pr->flags.ordered) {
1596         pr->u.p.ordered_lower = init;
1597         pr->u.p.ordered_upper = limit;
1598       } // if
1599     } else {
1600       *p_lb = 0;
1601       *p_ub = 0;
1602       if (p_st != NULL)
1603         *p_st = 0;
1604     } // if
1605   } // case
1606   break;
1607 
1608   case kmp_sch_guided_simd: {
    // same as the iterative case, but the current chunk is adjusted to be a
    // multiple of the given chunk size
1611     T chunk = pr->u.p.parm1;
1612     KD_TRACE(100,
1613              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1614               gtid));
1615     trip = pr->u.p.tc;
1616     // Start atomic part of calculations
1617     while (1) {
      ST remaining; // signed, because it can be < 0
1619       init = sh->u.s.iteration; // shared value
1620       remaining = trip - init;
1621       if (remaining <= 0) { // AC: need to compare with 0 first
1622         status = 0; // nothing to do, don't try atomic op
1623         break;
1624       }
1625       KMP_DEBUG_ASSERT(init % chunk == 0);
1626       // compare with K*nproc*(chunk+1), K=2 by default
1627       if ((T)remaining < pr->u.p.parm2) {
1628         // use dynamic-style schedule
1629         // atomically increment iterations, get old value
1630         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1631                                  (ST)chunk);
1632         remaining = trip - init;
1633         if (remaining <= 0) {
1634           status = 0; // all iterations got by other threads
1635         } else {
1636           // got some iterations to work on
1637           status = 1;
1638           if ((T)remaining > chunk) {
1639             limit = init + chunk - 1;
1640           } else {
1641             last = true; // the last chunk
1642             limit = init + remaining - 1;
1643           } // if
1644         } // if
1645         break;
1646       } // if
1647       // divide by K*nproc
1648       UT span;
1649       __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3),
1650                          &span);
1651       UT rem = span % chunk;
1652       if (rem) // adjust so that span%chunk == 0
1653         span += chunk - rem;
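      // e.g. chunk == 8 and span == 21: rem == 5, so span is rounded up to 24
      // to keep the grab boundary chunk-aligned (illustrative values)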
1654       limit = init + span;
1655       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1656                                (ST)init, (ST)limit)) {
1657         // CAS was successful, chunk obtained
1658         status = 1;
1659         --limit;
1660         break;
1661       } // if
1662     } // while
1663     if (status != 0) {
1664       start = pr->u.p.lb;
1665       incr = pr->u.p.st;
1666       if (p_st != NULL)
1667         *p_st = incr;
1668       *p_lb = start + init * incr;
1669       *p_ub = start + limit * incr;
1670       if (pr->flags.ordered) {
1671         pr->u.p.ordered_lower = init;
1672         pr->u.p.ordered_upper = limit;
1673       } // if
1674     } else {
1675       *p_lb = 0;
1676       *p_ub = 0;
1677       if (p_st != NULL)
1678         *p_st = 0;
1679     } // if
1680   } // case
1681   break;
1682 
1683   case kmp_sch_guided_analytical_chunked: {
1684     T chunkspec = pr->u.p.parm1;
1685     UT chunkIdx;
1686 #if KMP_USE_X87CONTROL
    /* for storing the original FPCW value on Windows* OS for the
       IA-32 architecture (8-byte version) */
1689     unsigned int oldFpcw;
1690     unsigned int fpcwSet = 0;
1691 #endif
1692     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1693                    "kmp_sch_guided_analytical_chunked case\n",
1694                    gtid));
1695 
1696     trip = pr->u.p.tc;
1697 
1698     KMP_DEBUG_ASSERT(nproc > 1);
1699     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1700 
1701     while (1) { /* this while loop is a safeguard against unexpected zero
1702                    chunk sizes */
1703       chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1704       if (chunkIdx >= (UT)pr->u.p.parm2) {
1705         --trip;
1706         /* use dynamic-style scheduling */
1707         init = chunkIdx * chunkspec + pr->u.p.count;
1708         /* need to verify init > 0 in case of overflow in the above
1709          * calculation */
1710         if ((status = (init > 0 && init <= trip)) != 0) {
1711           limit = init + chunkspec - 1;
1712 
1713           if ((last = (limit >= trip)) != 0)
1714             limit = trip;
1715         }
1716         break;
1717       } else {
1718 /* use exponential-style scheduling */
/* The following check works around the lack of long double precision on
   Windows* OS, which can otherwise cause init != 0 for chunkIdx == 0. */
1723 #if KMP_USE_X87CONTROL
1724         /* If we haven't already done so, save original
1725            FPCW and set precision to 64-bit, as Windows* OS
1726            on IA-32 architecture defaults to 53-bit */
1727         if (!fpcwSet) {
1728           oldFpcw = _control87(0, 0);
1729           _control87(_PC_64, _MCW_PC);
1730           fpcwSet = 0x30000;
1731         }
1732 #endif
1733         if (chunkIdx) {
1734           init = __kmp_dispatch_guided_remaining<T>(
1735               trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1736           KMP_DEBUG_ASSERT(init);
1737           init = trip - init;
1738         } else
1739           init = 0;
1740         limit = trip - __kmp_dispatch_guided_remaining<T>(
1741                            trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1742         KMP_ASSERT(init <= limit);
1743         if (init < limit) {
1744           KMP_DEBUG_ASSERT(limit <= trip);
1745           --limit;
1746           status = 1;
1747           break;
1748         } // if
1749       } // if
1750     } // while (1)
1751 #if KMP_USE_X87CONTROL
1752     /* restore FPCW if necessary
1753        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1754     */
1755     if (fpcwSet && (oldFpcw & fpcwSet))
1756       _control87(oldFpcw, _MCW_PC);
1757 #endif
1758     if (status != 0) {
1759       start = pr->u.p.lb;
1760       incr = pr->u.p.st;
1761       if (p_st != NULL)
1762         *p_st = incr;
1763       *p_lb = start + init * incr;
1764       *p_ub = start + limit * incr;
1765       if (pr->flags.ordered) {
1766         pr->u.p.ordered_lower = init;
1767         pr->u.p.ordered_upper = limit;
1768       }
1769     } else {
1770       *p_lb = 0;
1771       *p_ub = 0;
1772       if (p_st != NULL)
1773         *p_st = 0;
1774     }
1775   } // case
1776   break;
1777 
1778   case kmp_sch_trapezoidal: {
1779     UT index;
1780     T parm2 = pr->u.p.parm2;
1781     T parm3 = pr->u.p.parm3;
1782     T parm4 = pr->u.p.parm4;
1783     KD_TRACE(100,
1784              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1785               gtid));
1786 
1787     index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
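    // Chunk sizes form a decreasing arithmetic sequence (first chunk ~ parm2,
    // common difference parm4, parm3 chunks in total), so chunk `index` starts
    // at the partial sum  index * (2*parm2 - (index-1)*parm4) / 2  computed
    // below and ends one before the next partial sum.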
1788 
1789     init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
1790     trip = pr->u.p.tc - 1;
1791 
1792     if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1793       *p_lb = 0;
1794       *p_ub = 0;
1795       if (p_st != NULL)
1796         *p_st = 0;
1797     } else {
1798       start = pr->u.p.lb;
1799       limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1800       incr = pr->u.p.st;
1801 
1802       if ((last = (limit >= trip)) != 0)
1803         limit = trip;
1804 
1805       if (p_st != NULL)
1806         *p_st = incr;
1807 
1808       if (incr == 1) {
1809         *p_lb = start + init;
1810         *p_ub = start + limit;
1811       } else {
1812         *p_lb = start + init * incr;
1813         *p_ub = start + limit * incr;
1814       }
1815 
1816       if (pr->flags.ordered) {
1817         pr->u.p.ordered_lower = init;
1818         pr->u.p.ordered_upper = limit;
1819       } // if
1820     } // if
1821   } // case
1822   break;
1823   default: {
1824     status = 0; // to avoid complaints on uninitialized variable use
1825     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1826                 KMP_HNT(GetNewerLibrary), // Hint
1827                 __kmp_msg_null // Variadic argument list terminator
1828     );
1829   } break;
1830   } // switch
1831   if (p_last)
1832     *p_last = last;
1833 #ifdef KMP_DEBUG
1834   if (pr->flags.ordered) {
1835     char *buff;
1836     // create format specifiers before the debug output
1837     buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1838                             "ordered_lower:%%%s ordered_upper:%%%s\n",
1839                             traits_t<UT>::spec, traits_t<UT>::spec);
1840     KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1841     __kmp_str_free(&buff);
1842   }
1843   {
1844     char *buff;
1845     // create format specifiers before the debug output
1846     buff = __kmp_str_format(
1847         "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1848         "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1849         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1850     KMP_DEBUG_ASSERT(p_last);
1851     KMP_DEBUG_ASSERT(p_st);
1852     KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1853     __kmp_str_free(&buff);
1854   }
1855 #endif
1856   return status;
1857 }
1858 
1859 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1860    work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1861    is not called. */
1862 #if OMPT_SUPPORT && OMPT_OPTIONAL
1863 #define OMPT_LOOP_END                                                          \
1864   if (status == 0) {                                                           \
1865     if (ompt_enabled.ompt_callback_work) {                                     \
1866       ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
1867       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
1868       ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
1869           ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
1870           &(task_info->task_data), 0, codeptr);                                \
1871     }                                                                          \
1872   }
1873 // TODO: implement count
1874 #else
1875 #define OMPT_LOOP_END // no-op
1876 #endif
1877 
1878 #if KMP_STATS_ENABLED
1879 #define KMP_STATS_LOOP_END                                                     \
1880   {                                                                            \
1881     kmp_int64 u, l, t, i;                                                      \
1882     l = (kmp_int64)(*p_lb);                                                    \
1883     u = (kmp_int64)(*p_ub);                                                    \
1884     i = (kmp_int64)(pr->u.p.st);                                               \
1885     if (status == 0) {                                                         \
1886       t = 0;                                                                   \
1887       KMP_POP_PARTITIONED_TIMER();                                             \
1888     } else if (i == 1) {                                                       \
1889       if (u >= l)                                                              \
1890         t = u - l + 1;                                                         \
1891       else                                                                     \
1892         t = 0;                                                                 \
1893     } else if (i < 0) {                                                        \
1894       if (l >= u)                                                              \
1895         t = (l - u) / (-i) + 1;                                                \
1896       else                                                                     \
1897         t = 0;                                                                 \
1898     } else {                                                                   \
1899       if (u >= l)                                                              \
1900         t = (u - l) / i + 1;                                                   \
1901       else                                                                     \
1902         t = 0;                                                                 \
1903     }                                                                          \
1904     KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
1905   }
1906 #else
1907 #define KMP_STATS_LOOP_END /* Nothing */
1908 #endif
1909 
1910 template <typename T>
1911 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1912                                T *p_lb, T *p_ub,
1913                                typename traits_t<T>::signed_t *p_st
1914 #if OMPT_SUPPORT && OMPT_OPTIONAL
1915                                ,
1916                                void *codeptr
1917 #endif
1918 ) {
1919 
1920   typedef typename traits_t<T>::unsigned_t UT;
1921   typedef typename traits_t<T>::signed_t ST;
1922   // This is potentially slightly misleading, schedule(runtime) will appear here
1923   // even if the actual runtime schedule is static. (Which points out a
1924   // disadvantage of schedule(runtime): even when static scheduling is used it
1925   // costs more than a compile time choice to use static scheduling would.)
1926   KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
1927 
1928   int status;
1929   dispatch_private_info_template<T> *pr;
1930   __kmp_assert_valid_gtid(gtid);
1931   kmp_info_t *th = __kmp_threads[gtid];
1932   kmp_team_t *team = th->th.th_team;
1933 
1934   KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1935   KD_TRACE(
1936       1000,
1937       ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
1938        gtid, p_lb, p_ub, p_st, p_last));
1939 
1940   if (team->t.t_serialized) {
1941     /* NOTE: serialize this dispatch because we are not at the active level */
1942     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1943         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1944     KMP_DEBUG_ASSERT(pr);
1945 
1946     if ((status = (pr->u.p.tc != 0)) == 0) {
1947       *p_lb = 0;
1948       *p_ub = 0;
1949       //            if ( p_last != NULL )
1950       //                *p_last = 0;
1951       if (p_st != NULL)
1952         *p_st = 0;
1953       if (__kmp_env_consistency_check) {
1954         if (pr->pushed_ws != ct_none) {
1955           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1956         }
1957       }
1958     } else if (pr->flags.nomerge) {
1959       kmp_int32 last;
1960       T start;
1961       UT limit, trip, init;
1962       ST incr;
1963       T chunk = pr->u.p.parm1;
1964 
1965       KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1966                      gtid));
1967 
1968       init = chunk * pr->u.p.count++;
1969       trip = pr->u.p.tc - 1;
1970 
1971       if ((status = (init <= trip)) == 0) {
1972         *p_lb = 0;
1973         *p_ub = 0;
1974         //                if ( p_last != NULL )
1975         //                    *p_last = 0;
1976         if (p_st != NULL)
1977           *p_st = 0;
1978         if (__kmp_env_consistency_check) {
1979           if (pr->pushed_ws != ct_none) {
1980             pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1981           }
1982         }
1983       } else {
1984         start = pr->u.p.lb;
1985         limit = chunk + init - 1;
1986         incr = pr->u.p.st;
1987 
1988         if ((last = (limit >= trip)) != 0) {
1989           limit = trip;
1990 #if KMP_OS_WINDOWS
1991           pr->u.p.last_upper = pr->u.p.ub;
1992 #endif /* KMP_OS_WINDOWS */
1993         }
1994         if (p_last != NULL)
1995           *p_last = last;
1996         if (p_st != NULL)
1997           *p_st = incr;
1998         if (incr == 1) {
1999           *p_lb = start + init;
2000           *p_ub = start + limit;
2001         } else {
2002           *p_lb = start + init * incr;
2003           *p_ub = start + limit * incr;
2004         }
2005 
2006         if (pr->flags.ordered) {
2007           pr->u.p.ordered_lower = init;
2008           pr->u.p.ordered_upper = limit;
2009 #ifdef KMP_DEBUG
2010           {
2011             char *buff;
2012             // create format specifiers before the debug output
2013             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2014                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
2015                                     traits_t<UT>::spec, traits_t<UT>::spec);
2016             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2017                             pr->u.p.ordered_upper));
2018             __kmp_str_free(&buff);
2019           }
2020 #endif
2021         } // if
2022       } // if
2023     } else {
2024       pr->u.p.tc = 0;
2025       *p_lb = pr->u.p.lb;
2026       *p_ub = pr->u.p.ub;
2027 #if KMP_OS_WINDOWS
2028       pr->u.p.last_upper = *p_ub;
2029 #endif /* KMP_OS_WINDOWS */
2030       if (p_last != NULL)
2031         *p_last = TRUE;
2032       if (p_st != NULL)
2033         *p_st = pr->u.p.st;
2034     } // if
2035 #ifdef KMP_DEBUG
2036     {
2037       char *buff;
2038       // create format specifiers before the debug output
2039       buff = __kmp_str_format(
2040           "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2041           "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
2042           traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2043       KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,
2044                     (p_last ? *p_last : 0), status));
2045       __kmp_str_free(&buff);
2046     }
2047 #endif
2048 #if INCLUDE_SSC_MARKS
2049     SSC_MARK_DISPATCH_NEXT();
2050 #endif
2051     OMPT_LOOP_END;
2052     KMP_STATS_LOOP_END;
2053     return status;
2054   } else {
2055     kmp_int32 last = 0;
2056     dispatch_shared_info_template<T> volatile *sh;
2057 
2058     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2059                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2060 
2061     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2062         th->th.th_dispatch->th_dispatch_pr_current);
2063     KMP_DEBUG_ASSERT(pr);
2064     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2065         th->th.th_dispatch->th_dispatch_sh_current);
2066     KMP_DEBUG_ASSERT(sh);
2067 
2068 #if KMP_USE_HIER_SCHED
2069     if (pr->flags.use_hier)
2070       status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2071     else
2072 #endif // KMP_USE_HIER_SCHED
2073       status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2074                                                 p_st, th->th.th_team_nproc,
2075                                                 th->th.th_info.ds.ds_tid);
2076     // status == 0: no more iterations to execute
2077     if (status == 0) {
2078       UT num_done;
2079 
2080       num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2081 #ifdef KMP_DEBUG
2082       {
2083         char *buff;
2084         // create format specifiers before the debug output
2085         buff = __kmp_str_format(
2086             "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2087             traits_t<UT>::spec);
2088         KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2089         __kmp_str_free(&buff);
2090       }
2091 #endif
2092 
2093 #if KMP_USE_HIER_SCHED
2094       pr->flags.use_hier = FALSE;
2095 #endif
2096       if ((ST)num_done == th->th.th_team_nproc - 1) {
2097 #if (KMP_STATIC_STEAL_ENABLED)
2098         if (pr->schedule == kmp_sch_static_steal &&
2099             traits_t<T>::type_size > 4) {
2100           int i;
2101           int idx = (th->th.th_dispatch->th_disp_index - 1) %
2102                     __kmp_dispatch_num_buffers; // current loop index
2103           kmp_info_t **other_threads = team->t.t_threads;
2104           // loop complete, safe to destroy locks used for stealing
2105           for (i = 0; i < th->th.th_team_nproc; ++i) {
2106             dispatch_private_info_template<T> *buf =
2107                 reinterpret_cast<dispatch_private_info_template<T> *>(
2108                     &other_threads[i]->th.th_dispatch->th_disp_buffer[idx]);
2109             kmp_lock_t *lck = buf->u.p.th_steal_lock;
2110             KMP_ASSERT(lck != NULL);
2111             __kmp_destroy_lock(lck);
2112             __kmp_free(lck);
2113             buf->u.p.th_steal_lock = NULL;
2114           }
2115         }
2116 #endif
2117         /* NOTE: release this buffer to be reused */
2118 
2119         KMP_MB(); /* Flush all pending memory write invalidates.  */
2120 
2121         sh->u.s.num_done = 0;
2122         sh->u.s.iteration = 0;
2123 
2124         /* TODO replace with general release procedure? */
2125         if (pr->flags.ordered) {
2126           sh->u.s.ordered_iteration = 0;
2127         }
2128 
2129         KMP_MB(); /* Flush all pending memory write invalidates.  */
2130 
2131         sh->buffer_index += __kmp_dispatch_num_buffers;
2132         KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2133                        gtid, sh->buffer_index));
2134 
2135         KMP_MB(); /* Flush all pending memory write invalidates.  */
2136 
2137       } // if
2138       if (__kmp_env_consistency_check) {
2139         if (pr->pushed_ws != ct_none) {
2140           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2141         }
2142       }
2143 
2144       th->th.th_dispatch->th_deo_fcn = NULL;
2145       th->th.th_dispatch->th_dxo_fcn = NULL;
2146       th->th.th_dispatch->th_dispatch_sh_current = NULL;
2147       th->th.th_dispatch->th_dispatch_pr_current = NULL;
2148     } // if (status == 0)
2149 #if KMP_OS_WINDOWS
2150     else if (last) {
2151       pr->u.p.last_upper = pr->u.p.ub;
2152     }
2153 #endif /* KMP_OS_WINDOWS */
2154     if (p_last != NULL && status != 0)
2155       *p_last = last;
2156   } // if
2157 
2158 #ifdef KMP_DEBUG
2159   {
2160     char *buff;
2161     // create format specifiers before the debug output
2162     buff = __kmp_str_format(
2163         "__kmp_dispatch_next: T#%%d normal case: "
2164         "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2165         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2166     KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2167                   (p_last ? *p_last : 0), status));
2168     __kmp_str_free(&buff);
2169   }
2170 #endif
2171 #if INCLUDE_SSC_MARKS
2172   SSC_MARK_DISPATCH_NEXT();
2173 #endif
2174   OMPT_LOOP_END;
2175   KMP_STATS_LOOP_END;
2176   return status;
2177 }
2178 
2179 template <typename T>
2180 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2181                                   kmp_int32 *plastiter, T *plower, T *pupper,
2182                                   typename traits_t<T>::signed_t incr) {
2183   typedef typename traits_t<T>::unsigned_t UT;
2184   kmp_uint32 team_id;
2185   kmp_uint32 nteams;
2186   UT trip_count;
2187   kmp_team_t *team;
2188   kmp_info_t *th;
2189 
2190   KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2191   KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2192 #ifdef KMP_DEBUG
2193   typedef typename traits_t<T>::signed_t ST;
2194   {
2195     char *buff;
2196     // create format specifiers before the debug output
2197     buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2198                             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2199                             traits_t<T>::spec, traits_t<T>::spec,
2200                             traits_t<ST>::spec, traits_t<T>::spec);
2201     KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2202     __kmp_str_free(&buff);
2203   }
2204 #endif
2205 
2206   if (__kmp_env_consistency_check) {
2207     if (incr == 0) {
2208       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2209                             loc);
2210     }
2211     if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2212       // The loop is illegal.
      // Some zero-trip loops are handled by the compiler, e.g.:
2214       //   for(i=10;i<0;++i) // lower >= upper - run-time check
2215       //   for(i=0;i>10;--i) // lower <= upper - run-time check
2216       //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2217       //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2218       // Compiler does not check the following illegal loops:
2219       //   for(i=0;i<10;i+=incr) // where incr<0
2220       //   for(i=10;i>0;i-=incr) // where incr<0
2221       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2222     }
2223   }
2224   __kmp_assert_valid_gtid(gtid);
2225   th = __kmp_threads[gtid];
2226   team = th->th.th_team;
2227   KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2228   nteams = th->th.th_teams_size.nteams;
2229   team_id = team->t.t_master_tid;
2230   KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2231 
2232   // compute global trip count
2233   if (incr == 1) {
2234     trip_count = *pupper - *plower + 1;
2235   } else if (incr == -1) {
2236     trip_count = *plower - *pupper + 1;
2237   } else if (incr > 0) {
2238     // upper-lower can exceed the limit of signed type
2239     trip_count = (UT)(*pupper - *plower) / incr + 1;
2240   } else {
2241     trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2242   }
2243 
2244   if (trip_count <= nteams) {
2245     KMP_DEBUG_ASSERT(
2246         __kmp_static == kmp_sch_static_greedy ||
2247         __kmp_static ==
2248             kmp_sch_static_balanced); // Unknown static scheduling type.
    // only some teams get a single iteration, others get nothing
2250     if (team_id < trip_count) {
2251       *pupper = *plower = *plower + team_id * incr;
2252     } else {
2253       *plower = *pupper + incr; // zero-trip loop
2254     }
2255     if (plastiter != NULL)
2256       *plastiter = (team_id == trip_count - 1);
2257   } else {
2258     if (__kmp_static == kmp_sch_static_balanced) {
2259       UT chunk = trip_count / nteams;
2260       UT extras = trip_count % nteams;
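      // e.g. trip_count == 10, nteams == 4: chunk == 2, extras == 2, so teams
      // 0 and 1 take 3 iterations each and teams 2 and 3 take 2 each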
2261       *plower +=
2262           incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2263       *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2264       if (plastiter != NULL)
2265         *plastiter = (team_id == nteams - 1);
2266     } else {
2267       T chunk_inc_count =
2268           (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2269       T upper = *pupper;
2270       KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2271       // Unknown static scheduling type.
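      // e.g. trip_count == 10, nteams == 4, incr == 1: chunk_inc_count == 3,
      // so teams 0..2 take 3 iterations each and team 3 is clipped to the
      // final single iteration by the bound correction below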
2272       *plower += team_id * chunk_inc_count;
2273       *pupper = *plower + chunk_inc_count - incr;
2274       // Check/correct bounds if needed
2275       if (incr > 0) {
2276         if (*pupper < *plower)
2277           *pupper = traits_t<T>::max_value;
2278         if (plastiter != NULL)
2279           *plastiter = *plower <= upper && *pupper > upper - incr;
2280         if (*pupper > upper)
2281           *pupper = upper; // tracker C73258
2282       } else {
2283         if (*pupper > *plower)
2284           *pupper = traits_t<T>::min_value;
2285         if (plastiter != NULL)
2286           *plastiter = *plower >= upper && *pupper < upper - incr;
2287         if (*pupper < upper)
2288           *pupper = upper; // tracker C73258
2289       }
2290     }
2291   }
2292 }
2293 
2294 //-----------------------------------------------------------------------------
2295 // Dispatch routines
2296 //    Transfer call to template< type T >
2297 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2298 //                         T lb, T ub, ST st, ST chunk )
2299 extern "C" {
2300 
2301 /*!
2302 @ingroup WORK_SHARING
2303 @{
2304 @param loc Source location
2305 @param gtid Global thread id
2306 @param schedule Schedule type
2307 @param lb  Lower bound
2308 @param ub  Upper bound
2309 @param st  Step (or increment if you prefer)
2310 @param chunk The chunk size to block with
2311 
2312 This function prepares the runtime to start a dynamically scheduled for loop,
2313 saving the loop arguments.
2314 These functions are all identical apart from the types of the arguments.
2315 */
2316 
2317 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2318                             enum sched_type schedule, kmp_int32 lb,
2319                             kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2320   KMP_DEBUG_ASSERT(__kmp_init_serial);
2321 #if OMPT_SUPPORT && OMPT_OPTIONAL
2322   OMPT_STORE_RETURN_ADDRESS(gtid);
2323 #endif
2324   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2325 }
2326 /*!
2327 See @ref __kmpc_dispatch_init_4
2328 */
2329 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2330                              enum sched_type schedule, kmp_uint32 lb,
2331                              kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2332   KMP_DEBUG_ASSERT(__kmp_init_serial);
2333 #if OMPT_SUPPORT && OMPT_OPTIONAL
2334   OMPT_STORE_RETURN_ADDRESS(gtid);
2335 #endif
2336   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2337 }
2338 
2339 /*!
2340 See @ref __kmpc_dispatch_init_4
2341 */
2342 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2343                             enum sched_type schedule, kmp_int64 lb,
2344                             kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2345   KMP_DEBUG_ASSERT(__kmp_init_serial);
2346 #if OMPT_SUPPORT && OMPT_OPTIONAL
2347   OMPT_STORE_RETURN_ADDRESS(gtid);
2348 #endif
2349   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2350 }
2351 
2352 /*!
2353 See @ref __kmpc_dispatch_init_4
2354 */
2355 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2356                              enum sched_type schedule, kmp_uint64 lb,
2357                              kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2358   KMP_DEBUG_ASSERT(__kmp_init_serial);
2359 #if OMPT_SUPPORT && OMPT_OPTIONAL
2360   OMPT_STORE_RETURN_ADDRESS(gtid);
2361 #endif
2362   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2363 }
2364 
2365 /*!
2366 See @ref __kmpc_dispatch_init_4
2367 
These differ from the __kmpc_dispatch_init set of functions in that they are
called for the composite "distribute parallel for" construct, so the per-team
iteration space must be computed before the regular iterations are dispatched.
2371 
2372 These functions are all identical apart from the types of the arguments.
2373 */
2374 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2375                                  enum sched_type schedule, kmp_int32 *p_last,
2376                                  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2377                                  kmp_int32 chunk) {
2378   KMP_DEBUG_ASSERT(__kmp_init_serial);
2379 #if OMPT_SUPPORT && OMPT_OPTIONAL
2380   OMPT_STORE_RETURN_ADDRESS(gtid);
2381 #endif
2382   __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2383   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2384 }
2385 
2386 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2387                                   enum sched_type schedule, kmp_int32 *p_last,
2388                                   kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2389                                   kmp_int32 chunk) {
2390   KMP_DEBUG_ASSERT(__kmp_init_serial);
2391 #if OMPT_SUPPORT && OMPT_OPTIONAL
2392   OMPT_STORE_RETURN_ADDRESS(gtid);
2393 #endif
2394   __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2395   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2396 }
2397 
2398 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2399                                  enum sched_type schedule, kmp_int32 *p_last,
2400                                  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2401                                  kmp_int64 chunk) {
2402   KMP_DEBUG_ASSERT(__kmp_init_serial);
2403 #if OMPT_SUPPORT && OMPT_OPTIONAL
2404   OMPT_STORE_RETURN_ADDRESS(gtid);
2405 #endif
2406   __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2407   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2408 }
2409 
2410 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2411                                   enum sched_type schedule, kmp_int32 *p_last,
2412                                   kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2413                                   kmp_int64 chunk) {
2414   KMP_DEBUG_ASSERT(__kmp_init_serial);
2415 #if OMPT_SUPPORT && OMPT_OPTIONAL
2416   OMPT_STORE_RETURN_ADDRESS(gtid);
2417 #endif
2418   __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2419   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2420 }
2421 
2422 /*!
2423 @param loc Source code location
2424 @param gtid Global thread id
2425 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2426 otherwise
2427 @param p_lb   Pointer to the lower bound for the next chunk of work
2428 @param p_ub   Pointer to the upper bound for the next chunk of work
2429 @param p_st   Pointer to the stride for the next chunk of work
2430 @return one if there is work to be done, zero otherwise
2431 
2432 Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then lb, ub and stride need not be modified.
2434 */
2435 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2436                            kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2437 #if OMPT_SUPPORT && OMPT_OPTIONAL
2438   OMPT_STORE_RETURN_ADDRESS(gtid);
2439 #endif
2440   return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2441 #if OMPT_SUPPORT && OMPT_OPTIONAL
2442                                         ,
2443                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2444 #endif
2445   );
2446 }
2447 
2448 /*!
2449 See @ref __kmpc_dispatch_next_4
2450 */
2451 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2452                             kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2453                             kmp_int32 *p_st) {
2454 #if OMPT_SUPPORT && OMPT_OPTIONAL
2455   OMPT_STORE_RETURN_ADDRESS(gtid);
2456 #endif
2457   return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2458 #if OMPT_SUPPORT && OMPT_OPTIONAL
2459                                          ,
2460                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2461 #endif
2462   );
2463 }
2464 
2465 /*!
2466 See @ref __kmpc_dispatch_next_4
2467 */
2468 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2469                            kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2470 #if OMPT_SUPPORT && OMPT_OPTIONAL
2471   OMPT_STORE_RETURN_ADDRESS(gtid);
2472 #endif
2473   return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2474 #if OMPT_SUPPORT && OMPT_OPTIONAL
2475                                         ,
2476                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2477 #endif
2478   );
2479 }
2480 
2481 /*!
2482 See @ref __kmpc_dispatch_next_4
2483 */
2484 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2485                             kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2486                             kmp_int64 *p_st) {
2487 #if OMPT_SUPPORT && OMPT_OPTIONAL
2488   OMPT_STORE_RETURN_ADDRESS(gtid);
2489 #endif
2490   return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2491 #if OMPT_SUPPORT && OMPT_OPTIONAL
2492                                          ,
2493                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2494 #endif
2495   );
2496 }
2497 
2498 /*!
2499 @param loc Source code location
2500 @param gtid Global thread id
2501 
2502 Mark the end of a dynamic loop.
2503 */
2504 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2505   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2506 }
2507 
2508 /*!
2509 See @ref __kmpc_dispatch_fini_4
2510 */
2511 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2512   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2513 }
2514 
2515 /*!
2516 See @ref __kmpc_dispatch_fini_4
2517 */
2518 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2519   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2520 }
2521 
2522 /*!
2523 See @ref __kmpc_dispatch_fini_4
2524 */
2525 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2526   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2527 }
2528 /*! @} */
2529 
2530 //-----------------------------------------------------------------------------
2531 // Non-template routines from kmp_dispatch.cpp used in other sources
2532 
2533 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2534   return value == checker;
2535 }
2536 
2537 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2538   return value != checker;
2539 }
2540 
2541 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2542   return value < checker;
2543 }
2544 
2545 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2546   return value >= checker;
2547 }
2548 
2549 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2550   return value <= checker;
2551 }
2552 
2553 kmp_uint32
2554 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2555              kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2556              void *obj // Higher-level synchronization object, or NULL.
2557 ) {
2558   // note: we may not belong to a team at this point
2559   volatile kmp_uint32 *spin = spinner;
2560   kmp_uint32 check = checker;
2561   kmp_uint32 spins;
2562   kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2563   kmp_uint32 r;
2564 
2565   KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2566   KMP_INIT_YIELD(spins);
2567   // main wait spin loop
2568   while (!f(r = TCR_4(*spin), check)) {
2569     KMP_FSYNC_SPIN_PREPARE(obj);
2570     /* GEH - remove this since it was accidentally introduced when kmp_wait was
2571        split. It causes problems with infinite recursion because of exit lock */
2572     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2573         __kmp_abort_thread(); */
2574     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2575   }
2576   KMP_FSYNC_SPIN_ACQUIRED(obj);
2577   return r;
2578 }
2579 
2580 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
2581                       kmp_uint32 (*pred)(void *, kmp_uint32),
2582                       void *obj // Higher-level synchronization object, or NULL.
2583 ) {
2584   // note: we may not belong to a team at this point
2585   void *spin = spinner;
2586   kmp_uint32 check = checker;
2587   kmp_uint32 spins;
2588   kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2589 
2590   KMP_FSYNC_SPIN_INIT(obj, spin);
2591   KMP_INIT_YIELD(spins);
2592   // main wait spin loop
2593   while (!f(spin, check)) {
2594     KMP_FSYNC_SPIN_PREPARE(obj);
    /* if we have waited a bit, or are oversubscribed, yield */
2596     /* pause is in the following code */
2597     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2598   }
2599   KMP_FSYNC_SPIN_ACQUIRED(obj);
2600 }
2601 
2602 } // extern "C"
2603 
2604 #ifdef KMP_GOMP_COMPAT
2605 
2606 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2607                                enum sched_type schedule, kmp_int32 lb,
2608                                kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2609                                int push_ws) {
2610   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2611                                  push_ws);
2612 }
2613 
2614 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2615                                 enum sched_type schedule, kmp_uint32 lb,
2616                                 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2617                                 int push_ws) {
2618   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2619                                   push_ws);
2620 }
2621 
2622 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2623                                enum sched_type schedule, kmp_int64 lb,
2624                                kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2625                                int push_ws) {
2626   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2627                                  push_ws);
2628 }
2629 
2630 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2631                                 enum sched_type schedule, kmp_uint64 lb,
2632                                 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2633                                 int push_ws) {
2634   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2635                                   push_ws);
2636 }
2637 
2638 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2639   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2640 }
2641 
2642 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2643   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2644 }
2645 
2646 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2647   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2648 }
2649 
2650 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2651   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2652 }
2653 
2654 #endif /* KMP_GOMP_COMPAT */
2655 
2656 /* ------------------------------------------------------------------------ */
2657