1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 /* Dynamic scheduling initialization and dispatch.
14  *
 * NOTE: __kmp_nth is constant inside any dispatch loop, but it may change
 *       between parallel regions.  __kmp_max_nth is the largest value
 *       __kmp_nth may take; 1 is the smallest.
18  */
19 
20 #include "kmp.h"
21 #include "kmp_error.h"
22 #include "kmp_i18n.h"
23 #include "kmp_itt.h"
24 #include "kmp_stats.h"
25 #include "kmp_str.h"
26 #if KMP_USE_X87CONTROL
27 #include <float.h>
28 #endif
29 #include "kmp_lock.h"
30 #include "kmp_dispatch.h"
31 #if KMP_USE_HIER_SCHED
32 #include "kmp_dispatch_hier.h"
33 #endif
34 
35 #if OMPT_SUPPORT
36 #include "ompt-specific.h"
37 #endif
38 
39 /* ------------------------------------------------------------------------ */
40 /* ------------------------------------------------------------------------ */
41 
42 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43   kmp_info_t *th;
44 
45   KMP_DEBUG_ASSERT(gtid_ref);
46 
47   if (__kmp_env_consistency_check) {
48     th = __kmp_threads[*gtid_ref];
49     if (th->th.th_root->r.r_active &&
50         (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51 #if KMP_USE_DYNAMIC_LOCK
52       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53 #else
54       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55 #endif
56     }
57   }
58 }
59 
60 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61   kmp_info_t *th;
62 
63   if (__kmp_env_consistency_check) {
64     th = __kmp_threads[*gtid_ref];
65     if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66       __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67     }
68   }
69 }
70 
71 // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
72 static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
73                                          bool use_hier = false) {
74   // Pick up the nonmonotonic/monotonic bits from the scheduling type
75   // TODO: make nonmonotonic when static_steal is fixed
76   int monotonicity = SCHEDULE_MONOTONIC;
77 
  // Let the default be monotonic for executables compiled by compilers
  // supporting OpenMP* 4.5 or earlier
80   if (loc->get_openmp_version() < 50)
81     monotonicity = SCHEDULE_MONOTONIC;
82 
83   if (use_hier || __kmp_force_monotonic)
84     monotonicity = SCHEDULE_MONOTONIC;
85   else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
86     monotonicity = SCHEDULE_NONMONOTONIC;
87   else if (SCHEDULE_HAS_MONOTONIC(schedule))
88     monotonicity = SCHEDULE_MONOTONIC;
89 
90   return monotonicity;
91 }
92 
93 // Initialize a dispatch_private_info_template<T> buffer for a particular
// type of schedule and chunk. The loop description is found in lb (lower bound),
95 // ub (upper bound), and st (stride).  nproc is the number of threads relevant
96 // to the scheduling (often the number of threads in a team, but not always if
97 // hierarchical scheduling is used).  tid is the id of the thread calling
98 // the function within the group of nproc threads.  It will have a value
99 // between 0 and nproc - 1.  This is often just the thread id within a team, but
100 // is not necessarily the case when using hierarchical scheduling.
101 // loc is the source file location of the corresponding loop
102 // gtid is the global thread id
103 template <typename T>
104 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
105                                    dispatch_private_info_template<T> *pr,
106                                    enum sched_type schedule, T lb, T ub,
107                                    typename traits_t<T>::signed_t st,
108 #if USE_ITT_BUILD
109                                    kmp_uint64 *cur_chunk,
110 #endif
111                                    typename traits_t<T>::signed_t chunk,
112                                    T nproc, T tid) {
113   typedef typename traits_t<T>::unsigned_t UT;
114   typedef typename traits_t<T>::floating_t DBL;
115 
116   int active;
117   T tc;
118   kmp_info_t *th;
119   kmp_team_t *team;
120   int monotonicity;
121   bool use_hier;
122 
123 #ifdef KMP_DEBUG
124   typedef typename traits_t<T>::signed_t ST;
125   {
126     char *buff;
127     // create format specifiers before the debug output
128     buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
129                             "pr:%%p lb:%%%s ub:%%%s st:%%%s "
130                             "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
131                             traits_t<T>::spec, traits_t<T>::spec,
132                             traits_t<ST>::spec, traits_t<ST>::spec,
133                             traits_t<T>::spec, traits_t<T>::spec);
134     KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
135     __kmp_str_free(&buff);
136   }
137 #endif
138   /* setup data */
139   th = __kmp_threads[gtid];
140   team = th->th.th_team;
141   active = !team->t.t_serialized;
142 
143 #if USE_ITT_BUILD
144   int itt_need_metadata_reporting =
145       __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
146       KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
147       team->t.t_active_level == 1;
148 #endif
149 
150 #if KMP_USE_HIER_SCHED
151   use_hier = pr->flags.use_hier;
152 #else
153   use_hier = false;
154 #endif
155 
156   /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
157   monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
158   schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
159 
160   /* Pick up the nomerge/ordered bits from the scheduling type */
161   if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
162     pr->flags.nomerge = TRUE;
163     schedule =
164         (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
165   } else {
166     pr->flags.nomerge = FALSE;
167   }
168   pr->type_size = traits_t<T>::type_size; // remember the size of variables
169   if (kmp_ord_lower & schedule) {
170     pr->flags.ordered = TRUE;
171     schedule =
172         (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
173   } else {
174     pr->flags.ordered = FALSE;
175   }
176   // Ordered overrides nonmonotonic
177   if (pr->flags.ordered) {
178     monotonicity = SCHEDULE_MONOTONIC;
179   }
180 
181   if (schedule == kmp_sch_static) {
182     schedule = __kmp_static;
183   } else {
184     if (schedule == kmp_sch_runtime) {
185       // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
186       // not specified)
187       schedule = team->t.t_sched.r_sched_type;
188       monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
189       schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
190       // Detail the schedule if needed (global controls are differentiated
191       // appropriately)
192       if (schedule == kmp_sch_guided_chunked) {
193         schedule = __kmp_guided;
194       } else if (schedule == kmp_sch_static) {
195         schedule = __kmp_static;
196       }
197       // Use the chunk size specified by OMP_SCHEDULE (or default if not
198       // specified)
199       chunk = team->t.t_sched.chunk;
200 #if USE_ITT_BUILD
201       if (cur_chunk)
202         *cur_chunk = chunk;
203 #endif
204 #ifdef KMP_DEBUG
205       {
206         char *buff;
207         // create format specifiers before the debug output
208         buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
209                                 "schedule:%%d chunk:%%%s\n",
210                                 traits_t<ST>::spec);
211         KD_TRACE(10, (buff, gtid, schedule, chunk));
212         __kmp_str_free(&buff);
213       }
214 #endif
215     } else {
216       if (schedule == kmp_sch_guided_chunked) {
217         schedule = __kmp_guided;
218       }
219       if (chunk <= 0) {
220         chunk = KMP_DEFAULT_CHUNK;
221       }
222     }
223 
224     if (schedule == kmp_sch_auto) {
      // mapping and differentiation are done in __kmp_do_serial_initialize()
226       schedule = __kmp_auto;
227 #ifdef KMP_DEBUG
228       {
229         char *buff;
230         // create format specifiers before the debug output
231         buff = __kmp_str_format(
232             "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
233             "schedule:%%d chunk:%%%s\n",
234             traits_t<ST>::spec);
235         KD_TRACE(10, (buff, gtid, schedule, chunk));
236         __kmp_str_free(&buff);
237       }
238 #endif
239     }
240 #if KMP_STATIC_STEAL_ENABLED
241     // map nonmonotonic:dynamic to static steal
242     if (schedule == kmp_sch_dynamic_chunked) {
243       if (monotonicity == SCHEDULE_NONMONOTONIC)
244         schedule = kmp_sch_static_steal;
245     }
246 #endif
    /* guided analytical is not safe for too many threads */
248     if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
249       schedule = kmp_sch_guided_iterative_chunked;
250       KMP_WARNING(DispatchManyThreads);
251     }
252     if (schedule == kmp_sch_runtime_simd) {
253       // compiler provides simd_width in the chunk parameter
254       schedule = team->t.t_sched.r_sched_type;
255       monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
256       schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
257       // Detail the schedule if needed (global controls are differentiated
258       // appropriately)
259       if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
260           schedule == __kmp_static) {
261         schedule = kmp_sch_static_balanced_chunked;
262       } else {
263         if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
264           schedule = kmp_sch_guided_simd;
265         }
266         chunk = team->t.t_sched.chunk * chunk;
267       }
268 #if USE_ITT_BUILD
269       if (cur_chunk)
270         *cur_chunk = chunk;
271 #endif
272 #ifdef KMP_DEBUG
273       {
274         char *buff;
275         // create format specifiers before the debug output
276         buff = __kmp_str_format(
277             "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
278             " chunk:%%%s\n",
279             traits_t<ST>::spec);
280         KD_TRACE(10, (buff, gtid, schedule, chunk));
281         __kmp_str_free(&buff);
282       }
283 #endif
284     }
285     pr->u.p.parm1 = chunk;
286   }
287   KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
288               "unknown scheduling type");
289 
290   pr->u.p.count = 0;
291 
292   if (__kmp_env_consistency_check) {
293     if (st == 0) {
294       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
295                             (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
296     }
297   }
298   // compute trip count
299   if (st == 1) { // most common case
300     if (ub >= lb) {
301       tc = ub - lb + 1;
302     } else { // ub < lb
303       tc = 0; // zero-trip
304     }
305   } else if (st < 0) {
306     if (lb >= ub) {
307       // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
308       // where the division needs to be unsigned regardless of the result type
309       tc = (UT)(lb - ub) / (-st) + 1;
310     } else { // lb < ub
311       tc = 0; // zero-trip
312     }
313   } else { // st > 0
314     if (ub >= lb) {
315       // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
316       // where the division needs to be unsigned regardless of the result type
317       tc = (UT)(ub - lb) / st + 1;
318     } else { // ub < lb
319       tc = 0; // zero-trip
320     }
321   }
322 
323 #if KMP_STATS_ENABLED
324   if (KMP_MASTER_GTID(gtid)) {
325     KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
326   }
327 #endif
328 
329   pr->u.p.lb = lb;
330   pr->u.p.ub = ub;
331   pr->u.p.st = st;
332   pr->u.p.tc = tc;
333 
334 #if KMP_OS_WINDOWS
335   pr->u.p.last_upper = ub + st;
336 #endif /* KMP_OS_WINDOWS */
337 
  /* NOTE: only the active parallel region(s) have active ordered sections */
339 
340   if (active) {
341     if (pr->flags.ordered) {
342       pr->ordered_bumped = 0;
343       pr->u.p.ordered_lower = 1;
344       pr->u.p.ordered_upper = 0;
345     }
346   }
347 
348   switch (schedule) {
349 #if (KMP_STATIC_STEAL_ENABLED)
350   case kmp_sch_static_steal: {
351     T ntc, init;
352 
353     KD_TRACE(100,
354              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
355               gtid));
356 
357     ntc = (tc % chunk ? 1 : 0) + tc / chunk;
358     if (nproc > 1 && ntc >= nproc) {
359       KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
360       T id = tid;
361       T small_chunk, extras;
362 
363       small_chunk = ntc / nproc;
364       extras = ntc % nproc;
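      // ntc = ceil(tc / chunk) chunks are split as evenly as possible: every
      // thread owns small_chunk chunks and the first 'extras' threads own one
      // extra. init/ub below are this thread's first and one-past-last chunk
      // indices.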
365 
366       init = id * small_chunk + (id < extras ? id : extras);
367       pr->u.p.count = init;
368       pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
369 
370       pr->u.p.parm2 = lb;
      // parm3 is the number of times to attempt stealing, which is
      // proportional to the number of chunks per thread, capped at nproc.
374       pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc);
375       pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
376       pr->u.p.st = st;
377       if (traits_t<T>::type_size > 4) {
378         // AC: TODO: check if 16-byte CAS available and use it to
379         // improve performance (probably wait for explicit request
380         // before spending time on this).
381         // For now use dynamically allocated per-thread lock,
382         // free memory in __kmp_dispatch_next when status==0.
383         KMP_DEBUG_ASSERT(pr->u.p.th_steal_lock == NULL);
384         pr->u.p.th_steal_lock =
385             (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
386         __kmp_init_lock(pr->u.p.th_steal_lock);
387       }
388       break;
389     } else {
390       /* too few chunks: switching to kmp_sch_dynamic_chunked */
391       schedule = kmp_sch_dynamic_chunked;
392       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
393                      "kmp_sch_dynamic_chunked\n",
394                      gtid));
395       goto dynamic_init;
396       break;
397     } // if
398   } // case
399 #endif
400   case kmp_sch_static_balanced: {
401     T init, limit;
402 
403     KD_TRACE(
404         100,
405         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
406          gtid));
407 
408     if (nproc > 1) {
409       T id = tid;
410 
411       if (tc < nproc) {
412         if (id < tc) {
413           init = id;
414           limit = id;
415           pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
416         } else {
417           pr->u.p.count = 1; /* means no more chunks to execute */
418           pr->u.p.parm1 = FALSE;
419           break;
420         }
421       } else {
422         T small_chunk = tc / nproc;
423         T extras = tc % nproc;
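        // Distribute iterations as evenly as possible: the first 'extras'
        // threads get small_chunk + 1 iterations, the rest get small_chunk.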
424         init = id * small_chunk + (id < extras ? id : extras);
425         limit = init + small_chunk - (id < extras ? 0 : 1);
426         pr->u.p.parm1 = (id == nproc - 1);
427       }
428     } else {
429       if (tc > 0) {
430         init = 0;
431         limit = tc - 1;
432         pr->u.p.parm1 = TRUE;
433       } else {
434         // zero trip count
435         pr->u.p.count = 1; /* means no more chunks to execute */
436         pr->u.p.parm1 = FALSE;
437         break;
438       }
439     }
440 #if USE_ITT_BUILD
441     // Calculate chunk for metadata report
442     if (itt_need_metadata_reporting)
443       if (cur_chunk)
444         *cur_chunk = limit - init + 1;
445 #endif
446     if (st == 1) {
447       pr->u.p.lb = lb + init;
448       pr->u.p.ub = lb + limit;
449     } else {
      // ub_tmp is the calculated upper bound; "ub" is the user-defined upper
      // bound
451       T ub_tmp = lb + limit * st;
452       pr->u.p.lb = lb + init * st;
453       // adjust upper bound to "ub" if needed, so that MS lastprivate will match
454       // it exactly
455       if (st > 0) {
456         pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
457       } else {
458         pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
459       }
460     }
461     if (pr->flags.ordered) {
462       pr->u.p.ordered_lower = init;
463       pr->u.p.ordered_upper = limit;
464     }
465     break;
466   } // case
467   case kmp_sch_static_balanced_chunked: {
468     // similar to balanced, but chunk adjusted to multiple of simd width
469     T nth = nproc;
470     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
471                    " -> falling-through to static_greedy\n",
472                    gtid));
473     schedule = kmp_sch_static_greedy;
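    // parm1 = ceil(tc / nth) rounded up to a multiple of chunk (the simd
    // width); the mask arithmetic below relies on chunk being a power of two,
    // which is the usual case for simd widths.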
474     if (nth > 1)
475       pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
476     else
477       pr->u.p.parm1 = tc;
478     break;
479   } // case
480   case kmp_sch_guided_simd:
481   case kmp_sch_guided_iterative_chunked: {
482     KD_TRACE(
483         100,
484         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
485          " case\n",
486          gtid));
487 
488     if (nproc > 1) {
489       if ((2L * chunk + 1) * nproc >= tc) {
490         /* chunk size too large, switch to dynamic */
491         schedule = kmp_sch_dynamic_chunked;
492         goto dynamic_init;
493       } else {
        // when remaining iters become less than parm2, switch to dynamic
495         pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
496         *(double *)&pr->u.p.parm3 =
497             guided_flt_param / (double)nproc; // may occupy parm3 and parm4
498       }
499     } else {
500       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
501                      "kmp_sch_static_greedy\n",
502                      gtid));
503       schedule = kmp_sch_static_greedy;
504       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
505       KD_TRACE(
506           100,
507           ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
508            gtid));
509       pr->u.p.parm1 = tc;
510     } // if
511   } // case
512   break;
513   case kmp_sch_guided_analytical_chunked: {
514     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
515                    "kmp_sch_guided_analytical_chunked case\n",
516                    gtid));
517 
518     if (nproc > 1) {
519       if ((2L * chunk + 1) * nproc >= tc) {
520         /* chunk size too large, switch to dynamic */
521         schedule = kmp_sch_dynamic_chunked;
522         goto dynamic_init;
523       } else {
524         /* commonly used term: (2 nproc - 1)/(2 nproc) */
525         DBL x;
526 
527 #if KMP_USE_X87CONTROL
528         /* Linux* OS already has 64-bit computation by default for long double,
529            and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
530            Windows* OS on IA-32 architecture, we need to set precision to 64-bit
531            instead of the default 53-bit. Even though long double doesn't work
532            on Windows* OS on Intel(R) 64, the resulting lack of precision is not
533            expected to impact the correctness of the algorithm, but this has not
534            been mathematically proven. */
535         // save original FPCW and set precision to 64-bit, as
536         // Windows* OS on IA-32 architecture defaults to 53-bit
537         unsigned int oldFpcw = _control87(0, 0);
538         _control87(_PC_64, _MCW_PC); // 0,0x30000
539 #endif
540         /* value used for comparison in solver for cross-over point */
541         long double target = ((long double)chunk * 2 + 1) * nproc / tc;
542 
543         /* crossover point--chunk indexes equal to or greater than
544            this point switch to dynamic-style scheduling */
545         UT cross;
546 
547         /* commonly used term: (2 nproc - 1)/(2 nproc) */
548         x = 1.0 - 0.5 / (double)nproc;
549 
550 #ifdef KMP_DEBUG
551         { // test natural alignment
552           struct _test_a {
553             char a;
554             union {
555               char b;
556               DBL d;
557             };
558           } t;
559           ptrdiff_t natural_alignment =
560               (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
561           //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
562           // long)natural_alignment );
563           KMP_DEBUG_ASSERT(
564               (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
565         }
566 #endif // KMP_DEBUG
567 
568         /* save the term in thread private dispatch structure */
569         *(DBL *)&pr->u.p.parm3 = x;
570 
571         /* solve for the crossover point to the nearest integer i for which C_i
572            <= chunk */
573         {
574           UT left, right, mid;
575           long double p;
576 
577           /* estimate initial upper and lower bound */
578 
579           /* doesn't matter what value right is as long as it is positive, but
580              it affects performance of the solver */
581           right = 229;
582           p = __kmp_pow<UT>(x, right);
583           if (p > target) {
584             do {
585               p *= p;
586               right <<= 1;
587             } while (p > target && right < (1 << 27));
588             /* lower bound is previous (failed) estimate of upper bound */
589             left = right >> 1;
590           } else {
591             left = 0;
592           }
593 
594           /* bisection root-finding method */
595           while (left + 1 < right) {
596             mid = (left + right) / 2;
597             if (__kmp_pow<UT>(x, mid) > target) {
598               left = mid;
599             } else {
600               right = mid;
601             }
602           } // while
603           cross = right;
604         }
605         /* assert sanity of computed crossover point */
606         KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
607                    __kmp_pow<UT>(x, cross) <= target);
608 
609         /* save the crossover point in thread private dispatch structure */
610         pr->u.p.parm2 = cross;
611 
612 // C75803
613 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
614 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
615 #else
616 #define GUIDED_ANALYTICAL_WORKAROUND (x)
617 #endif
618         /* dynamic-style scheduling offset */
619         pr->u.p.count = tc -
620                         __kmp_dispatch_guided_remaining(
621                             tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
622                         cross * chunk;
623 #if KMP_USE_X87CONTROL
624         // restore FPCW
625         _control87(oldFpcw, _MCW_PC);
626 #endif
627       } // if
628     } else {
629       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
630                      "kmp_sch_static_greedy\n",
631                      gtid));
632       schedule = kmp_sch_static_greedy;
633       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
634       pr->u.p.parm1 = tc;
635     } // if
636   } // case
637   break;
638   case kmp_sch_static_greedy:
639     KD_TRACE(
640         100,
641         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
642          gtid));
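    // Greedy: each thread takes a single contiguous block of up to
    // ceil(tc / nproc) iterations (or all tc iterations if there is only one
    // thread).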
643     pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
644     break;
645   case kmp_sch_static_chunked:
646   case kmp_sch_dynamic_chunked:
647   dynamic_init:
648     if (pr->u.p.parm1 <= 0)
649       pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
650     else if (pr->u.p.parm1 > tc)
651       pr->u.p.parm1 = tc;
652     // Store the total number of chunks to prevent integer overflow during
653     // bounds calculations in the get next chunk routine.
654     pr->u.p.parm2 = (tc / pr->u.p.parm1) + (tc % pr->u.p.parm1 ? 1 : 0);
655     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
656                    "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
657                    gtid));
658     break;
659   case kmp_sch_trapezoidal: {
660     /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
661 
662     T parm1, parm2, parm3, parm4;
663     KD_TRACE(100,
664              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
665               gtid));
666 
667     parm1 = chunk;
668 
669     /* F : size of the first cycle */
670     parm2 = (tc / (2 * nproc));
671 
672     if (parm2 < 1) {
673       parm2 = 1;
674     }
675 
676     /* L : size of the last cycle.  Make sure the last cycle is not larger
677        than the first cycle. */
678     if (parm1 < 1) {
679       parm1 = 1;
680     } else if (parm1 > parm2) {
681       parm1 = parm2;
682     }
683 
684     /* N : number of cycles */
685     parm3 = (parm2 + parm1);
686     parm3 = (2 * tc + parm3 - 1) / parm3;
687 
688     if (parm3 < 2) {
689       parm3 = 2;
690     }
691 
692     /* sigma : decreasing incr of the trapezoid */
693     parm4 = (parm3 - 1);
694     parm4 = (parm2 - parm1) / parm4;
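    // For illustration (arbitrary values): tc=1000, nproc=4, chunk=10 gives
    // parm2 = 1000/8 = 125, parm1 = 10, parm3 = (2000+134)/135 = 15 cycles,
    // parm4 = (125-10)/14 = 8, so chunk sizes decrease roughly as
    // 125, 117, 109, ... toward the minimum chunk.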
695 
696     // pointless check, because parm4 >= 0 always
697     // if ( parm4 < 0 ) {
698     //    parm4 = 0;
699     //}
700 
701     pr->u.p.parm1 = parm1;
702     pr->u.p.parm2 = parm2;
703     pr->u.p.parm3 = parm3;
704     pr->u.p.parm4 = parm4;
705   } // case
706   break;
707 
708   default: {
709     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
710                 KMP_HNT(GetNewerLibrary), // Hint
711                 __kmp_msg_null // Variadic argument list terminator
712     );
713   } break;
714   } // switch
715   pr->schedule = schedule;
716 }
717 
718 #if KMP_USE_HIER_SCHED
719 template <typename T>
720 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
721                                              typename traits_t<T>::signed_t st);
722 template <>
723 inline void
724 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
725                                             kmp_int32 ub, kmp_int32 st) {
726   __kmp_dispatch_init_hierarchy<kmp_int32>(
727       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
728       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
729 }
730 template <>
731 inline void
732 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
733                                              kmp_uint32 ub, kmp_int32 st) {
734   __kmp_dispatch_init_hierarchy<kmp_uint32>(
735       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
736       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
737 }
738 template <>
739 inline void
740 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
741                                             kmp_int64 ub, kmp_int64 st) {
742   __kmp_dispatch_init_hierarchy<kmp_int64>(
743       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
744       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
745 }
746 template <>
747 inline void
748 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
749                                              kmp_uint64 ub, kmp_int64 st) {
750   __kmp_dispatch_init_hierarchy<kmp_uint64>(
751       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
752       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
753 }
754 
755 // free all the hierarchy scheduling memory associated with the team
756 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
757   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
758   for (int i = 0; i < num_disp_buff; ++i) {
759     // type does not matter here so use kmp_int32
760     auto sh =
761         reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
762             &team->t.t_disp_buffer[i]);
763     if (sh->hier) {
764       sh->hier->deallocate();
765       __kmp_free(sh->hier);
766     }
767   }
768 }
769 #endif
770 
771 // UT - unsigned flavor of T, ST - signed flavor of T,
772 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
773 template <typename T>
774 static void
775 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
776                     T ub, typename traits_t<T>::signed_t st,
777                     typename traits_t<T>::signed_t chunk, int push_ws) {
778   typedef typename traits_t<T>::unsigned_t UT;
779 
780   int active;
781   kmp_info_t *th;
782   kmp_team_t *team;
783   kmp_uint32 my_buffer_index;
784   dispatch_private_info_template<T> *pr;
785   dispatch_shared_info_template<T> volatile *sh;
786 
787   KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
788                    sizeof(dispatch_private_info));
789   KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
790                    sizeof(dispatch_shared_info));
791   __kmp_assert_valid_gtid(gtid);
792 
793   if (!TCR_4(__kmp_init_parallel))
794     __kmp_parallel_initialize();
795 
796   __kmp_resume_if_soft_paused();
797 
798 #if INCLUDE_SSC_MARKS
799   SSC_MARK_DISPATCH_INIT();
800 #endif
801 #ifdef KMP_DEBUG
802   typedef typename traits_t<T>::signed_t ST;
803   {
804     char *buff;
805     // create format specifiers before the debug output
806     buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
807                             "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
808                             traits_t<ST>::spec, traits_t<T>::spec,
809                             traits_t<T>::spec, traits_t<ST>::spec);
810     KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
811     __kmp_str_free(&buff);
812   }
813 #endif
814   /* setup data */
815   th = __kmp_threads[gtid];
816   team = th->th.th_team;
817   active = !team->t.t_serialized;
818   th->th.th_ident = loc;
819 
  // Any half-decent optimizer will remove this test when the blocks are empty,
  // since the macros expand to nothing when statistics are disabled.
823   if (schedule == __kmp_static) {
824     KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
825   } else {
826     KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
827   }
828 
829 #if KMP_USE_HIER_SCHED
  // Initialize the scheduling hierarchy if requested in the OMP_SCHEDULE
  // environment variable. Hierarchical scheduling does not work with ordered,
  // so if ordered is detected, revert to threaded scheduling.
833   bool ordered;
834   enum sched_type my_sched = schedule;
835   my_buffer_index = th->th.th_dispatch->th_disp_index;
836   pr = reinterpret_cast<dispatch_private_info_template<T> *>(
837       &th->th.th_dispatch
838            ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
839   my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
840   if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
841     my_sched =
842         (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
843   ordered = (kmp_ord_lower & my_sched);
844   if (pr->flags.use_hier) {
845     if (ordered) {
846       KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected.  "
847                      "Disabling hierarchical scheduling.\n",
848                      gtid));
849       pr->flags.use_hier = FALSE;
850     }
851   }
852   if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
853     // Don't use hierarchical for ordered parallel loops and don't
854     // use the runtime hierarchy if one was specified in the program
855     if (!ordered && !pr->flags.use_hier)
856       __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
857   }
858 #endif // KMP_USE_HIER_SCHED
859 
860 #if USE_ITT_BUILD
861   kmp_uint64 cur_chunk = chunk;
862   int itt_need_metadata_reporting =
863       __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
864       KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
865       team->t.t_active_level == 1;
866 #endif
867   if (!active) {
868     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
869         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
870   } else {
871     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
872                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
873 
874     my_buffer_index = th->th.th_dispatch->th_disp_index++;
875 
    /* TODO: what happens when the number of threads changes? Does the buffer
       need resizing? */
877     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
878         &th->th.th_dispatch
879              ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
880     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
881         &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
882     KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
883                   my_buffer_index));
884   }
885 
886   __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
887 #if USE_ITT_BUILD
888                                 &cur_chunk,
889 #endif
890                                 chunk, (T)th->th.th_team_nproc,
891                                 (T)th->th.th_info.ds.ds_tid);
892   if (active) {
893     if (pr->flags.ordered == 0) {
894       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
895       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
896     } else {
897       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
898       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
899     }
900   }
901 
902   if (active) {
    /* This buffer becomes free to use once sh->buffer_index reaches
     * my_buffer_index (see the wait below) */
905 
906     KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
907                    "sh->buffer_index:%d\n",
908                    gtid, my_buffer_index, sh->buffer_index));
909     __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
910                            __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    // Note: KMP_WAIT() cannot be used here: buffer index and
    // my_buffer_index are *always* 32-bit integers.
913     KMP_MB(); /* is this necessary? */
914     KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
915                    "sh->buffer_index:%d\n",
916                    gtid, my_buffer_index, sh->buffer_index));
917 
918     th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
919     th->th.th_dispatch->th_dispatch_sh_current =
920         CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
921 #if USE_ITT_BUILD
922     if (pr->flags.ordered) {
923       __kmp_itt_ordered_init(gtid);
924     }
925     // Report loop metadata
926     if (itt_need_metadata_reporting) {
927       // Only report metadata by primary thread of active team at level 1
928       kmp_uint64 schedtype = 0;
929       switch (schedule) {
930       case kmp_sch_static_chunked:
931       case kmp_sch_static_balanced: // Chunk is calculated in the switch above
932         break;
933       case kmp_sch_static_greedy:
934         cur_chunk = pr->u.p.parm1;
935         break;
936       case kmp_sch_dynamic_chunked:
937         schedtype = 1;
938         break;
939       case kmp_sch_guided_iterative_chunked:
940       case kmp_sch_guided_analytical_chunked:
941       case kmp_sch_guided_simd:
942         schedtype = 2;
943         break;
944       default:
945         // Should we put this case under "static"?
946         // case kmp_sch_static_steal:
947         schedtype = 3;
948         break;
949       }
950       __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
951     }
952 #if KMP_USE_HIER_SCHED
953     if (pr->flags.use_hier) {
954       pr->u.p.count = 0;
955       pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
956     }
#endif // KMP_USE_HIER_SCHED
958 #endif /* USE_ITT_BUILD */
959   }
960 
961 #ifdef KMP_DEBUG
962   {
963     char *buff;
964     // create format specifiers before the debug output
965     buff = __kmp_str_format(
966         "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
967         "lb:%%%s ub:%%%s"
968         " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
969         " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
970         traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
971         traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
972         traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
973         traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
974     KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
975                   pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
976                   pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
977                   pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
978     __kmp_str_free(&buff);
979   }
980 #endif
981 #if (KMP_STATIC_STEAL_ENABLED)
  // After a loop with some other schedule kind has executed, it cannot be
  // guaranteed that all the parm3 variables will contain the same value.
  // Even if they did, there would still be bad cases, e.g. alternating
  // between 0 and 1 rather than using a program-lifetime increment, so a
  // dedicated variable is required. The 'static_steal_counter' is used.
987   if (pr->schedule == kmp_sch_static_steal) {
    // Other threads will inspect this variable when searching for a victim.
    // Incrementing it signals that, from this point on, other threads may
    // steal from this thread.
991     volatile T *p = &pr->u.p.static_steal_counter;
992     *p = *p + 1;
993   }
994 #endif // ( KMP_STATIC_STEAL_ENABLED )
995 
996 #if OMPT_SUPPORT && OMPT_OPTIONAL
997   if (ompt_enabled.ompt_callback_work) {
998     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
999     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1000     ompt_callbacks.ompt_callback(ompt_callback_work)(
1001         ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
1002         &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
1003   }
1004 #endif
1005   KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
1006 }
1007 
1008 /* For ordered loops, either __kmp_dispatch_finish() should be called after
1009  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1010  * every chunk of iterations.  If the ordered section(s) were not executed
1011  * for this iteration (or every iteration in this chunk), we need to set the
1012  * ordered iteration counters so that the next thread can proceed. */
1013 template <typename UT>
1014 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
1015   typedef typename traits_t<UT>::signed_t ST;
1016   __kmp_assert_valid_gtid(gtid);
1017   kmp_info_t *th = __kmp_threads[gtid];
1018 
1019   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1020   if (!th->th.th_team->t.t_serialized) {
1021 
1022     dispatch_private_info_template<UT> *pr =
1023         reinterpret_cast<dispatch_private_info_template<UT> *>(
1024             th->th.th_dispatch->th_dispatch_pr_current);
1025     dispatch_shared_info_template<UT> volatile *sh =
1026         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1027             th->th.th_dispatch->th_dispatch_sh_current);
1028     KMP_DEBUG_ASSERT(pr);
1029     KMP_DEBUG_ASSERT(sh);
1030     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1031                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1032 
1033     if (pr->ordered_bumped) {
1034       KD_TRACE(
1035           1000,
1036           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1037            gtid));
1038       pr->ordered_bumped = 0;
1039     } else {
1040       UT lower = pr->u.p.ordered_lower;
1041 
1042 #ifdef KMP_DEBUG
1043       {
1044         char *buff;
1045         // create format specifiers before the debug output
1046         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1047                                 "ordered_iteration:%%%s lower:%%%s\n",
1048                                 traits_t<UT>::spec, traits_t<UT>::spec);
1049         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1050         __kmp_str_free(&buff);
1051       }
1052 #endif
1053 
1054       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1055                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1056       KMP_MB(); /* is this necessary? */
1057 #ifdef KMP_DEBUG
1058       {
1059         char *buff;
1060         // create format specifiers before the debug output
1061         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1062                                 "ordered_iteration:%%%s lower:%%%s\n",
1063                                 traits_t<UT>::spec, traits_t<UT>::spec);
1064         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1065         __kmp_str_free(&buff);
1066       }
1067 #endif
1068 
1069       test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1070     } // if
1071   } // if
1072   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1073 }
1074 
1075 #ifdef KMP_GOMP_COMPAT
1076 
1077 template <typename UT>
1078 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1079   typedef typename traits_t<UT>::signed_t ST;
1080   __kmp_assert_valid_gtid(gtid);
1081   kmp_info_t *th = __kmp_threads[gtid];
1082 
1083   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1084   if (!th->th.th_team->t.t_serialized) {
1085     //        int cid;
1086     dispatch_private_info_template<UT> *pr =
1087         reinterpret_cast<dispatch_private_info_template<UT> *>(
1088             th->th.th_dispatch->th_dispatch_pr_current);
1089     dispatch_shared_info_template<UT> volatile *sh =
1090         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1091             th->th.th_dispatch->th_dispatch_sh_current);
1092     KMP_DEBUG_ASSERT(pr);
1093     KMP_DEBUG_ASSERT(sh);
1094     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1095                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1096 
1097     //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1098     UT lower = pr->u.p.ordered_lower;
1099     UT upper = pr->u.p.ordered_upper;
1100     UT inc = upper - lower + 1;
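    // inc is the number of ordered iterations in this chunk; below, the
    // iterations already signalled (ordered_bumped) are subtracted and the
    // shared ordered_iteration counter is advanced by the remainder.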
1101 
1102     if (pr->ordered_bumped == inc) {
1103       KD_TRACE(
1104           1000,
1105           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1106            gtid));
1107       pr->ordered_bumped = 0;
1108     } else {
1109       inc -= pr->ordered_bumped;
1110 
1111 #ifdef KMP_DEBUG
1112       {
1113         char *buff;
1114         // create format specifiers before the debug output
1115         buff = __kmp_str_format(
1116             "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1117             "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1118             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1119         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1120         __kmp_str_free(&buff);
1121       }
1122 #endif
1123 
1124       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1125                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1126 
1127       KMP_MB(); /* is this necessary? */
1128       KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1129                       "ordered_bumped to zero\n",
1130                       gtid));
1131       pr->ordered_bumped = 0;
// TODO: check whether inc should be unsigned or signed
1133 #ifdef KMP_DEBUG
1134       {
1135         char *buff;
1136         // create format specifiers before the debug output
1137         buff = __kmp_str_format(
1138             "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1139             "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1140             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1141             traits_t<UT>::spec);
1142         KD_TRACE(1000,
1143                  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1144         __kmp_str_free(&buff);
1145       }
1146 #endif
1147 
1148       test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1149     }
1150     //        }
1151   }
1152   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1153 }
1154 
1155 #endif /* KMP_GOMP_COMPAT */
1156 
1157 template <typename T>
1158 int __kmp_dispatch_next_algorithm(int gtid,
1159                                   dispatch_private_info_template<T> *pr,
1160                                   dispatch_shared_info_template<T> volatile *sh,
1161                                   kmp_int32 *p_last, T *p_lb, T *p_ub,
1162                                   typename traits_t<T>::signed_t *p_st, T nproc,
1163                                   T tid) {
1164   typedef typename traits_t<T>::unsigned_t UT;
1165   typedef typename traits_t<T>::signed_t ST;
1166   typedef typename traits_t<T>::floating_t DBL;
1167   int status = 0;
1168   bool last = false;
1169   T start;
1170   ST incr;
1171   UT limit, trip, init;
1172   kmp_info_t *th = __kmp_threads[gtid];
1173   kmp_team_t *team = th->th.th_team;
1174 
1175   KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1176                    &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1177   KMP_DEBUG_ASSERT(pr);
1178   KMP_DEBUG_ASSERT(sh);
1179   KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1180 #ifdef KMP_DEBUG
1181   {
1182     char *buff;
1183     // create format specifiers before the debug output
1184     buff =
1185         __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1186                          "sh:%%p nproc:%%%s tid:%%%s\n",
1187                          traits_t<T>::spec, traits_t<T>::spec);
1188     KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1189     __kmp_str_free(&buff);
1190   }
1191 #endif
1192 
1193   // zero trip count
1194   if (pr->u.p.tc == 0) {
1195     KD_TRACE(10,
1196              ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1197               "zero status:%d\n",
1198               gtid, status));
1199     return 0;
1200   }
1201 
1202   switch (pr->schedule) {
1203 #if (KMP_STATIC_STEAL_ENABLED)
1204   case kmp_sch_static_steal: {
1205     T chunk = pr->u.p.parm1;
1206 
1207     KD_TRACE(100,
1208              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1209               gtid));
1210 
1211     trip = pr->u.p.tc - 1;
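    // pr->u.p.count and pr->u.p.ub are this thread's next and one-past-last
    // chunk indices; a steal shrinks the victim's ub and gives the thief a new
    // [count, ub) range taken from the top of the victim's range.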
1212 
1213     if (traits_t<T>::type_size > 4) {
1214       // use lock for 8-byte and CAS for 4-byte induction
1215       // variable. TODO (optional): check and use 16-byte CAS
1216       kmp_lock_t *lck = pr->u.p.th_steal_lock;
1217       KMP_DEBUG_ASSERT(lck != NULL);
1218       if (pr->u.p.count < (UT)pr->u.p.ub) {
1219         __kmp_acquire_lock(lck, gtid);
1220         // try to get own chunk of iterations
1221         init = (pr->u.p.count)++;
1222         status = (init < (UT)pr->u.p.ub);
1223         __kmp_release_lock(lck, gtid);
1224       } else {
1225         status = 0; // no own chunks
1226       }
1227       if (!status) { // try to steal
1228         kmp_info_t **other_threads = team->t.t_threads;
1229         T while_limit = pr->u.p.parm3;
1230         T while_index = 0;
1231         T id = pr->u.p.static_steal_counter; // loop id
1232         int idx = (th->th.th_dispatch->th_disp_index - 1) %
1233                   __kmp_dispatch_num_buffers; // current loop index
1234         // note: victim thread can potentially execute another loop
1235         // TODO: algorithm of searching for a victim
1236         // should be cleaned up and measured
1237         while ((!status) && (while_limit != ++while_index)) {
1238           dispatch_private_info_template<T> *victim;
1239           T remaining;
1240           T victimIdx = pr->u.p.parm4;
1241           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
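          // Start from the remembered victim (parm4) and walk the ring of
          // threads until one that has entered this loop instance (matching
          // static_steal_counter) is found, or we come full circle.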
1242           victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1243               &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1244           KMP_DEBUG_ASSERT(victim);
1245           while ((victim == pr || id != victim->u.p.static_steal_counter) &&
1246                  oldVictimIdx != victimIdx) {
1247             victimIdx = (victimIdx + 1) % nproc;
1248             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1249                 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1250             KMP_DEBUG_ASSERT(victim);
1251           }
1252           if (victim == pr || id != victim->u.p.static_steal_counter) {
1253             continue; // try once more (nproc attempts in total)
1254             // no victim is ready yet to participate in stealing
1255             // because no victim passed kmp_init_dispatch yet
1256           }
1257           if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1258             pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1259             continue; // not enough chunks to steal, goto next victim
1260           }
1261 
1262           lck = victim->u.p.th_steal_lock;
1263           KMP_ASSERT(lck != NULL);
1264           __kmp_acquire_lock(lck, gtid);
1265           limit = victim->u.p.ub; // keep initial ub
1266           if (victim->u.p.count >= limit ||
1267               (remaining = limit - victim->u.p.count) < 2) {
1268             __kmp_release_lock(lck, gtid);
1269             pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1270             continue; // not enough chunks to steal
1271           }
1272           // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or
1273           // by 1
1274           if (remaining > 3) {
1275             // steal 1/4 of remaining
1276             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1277             init = (victim->u.p.ub -= (remaining >> 2));
1278           } else {
1279             // steal 1 chunk of 2 or 3 remaining
1280             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1281             init = (victim->u.p.ub -= 1);
1282           }
1283           __kmp_release_lock(lck, gtid);
1284 
1285           KMP_DEBUG_ASSERT(init + 1 <= limit);
1286           pr->u.p.parm4 = victimIdx; // remember victim to steal from
1287           status = 1;
1288           while_index = 0;
          // now record the stolen range in own count and ub; chunk 'init' is
          // returned from this call, so count starts at init + 1
1290           __kmp_acquire_lock(pr->u.p.th_steal_lock, gtid);
1291           pr->u.p.count = init + 1;
1292           pr->u.p.ub = limit;
1293           __kmp_release_lock(pr->u.p.th_steal_lock, gtid);
1294         } // while (search for victim)
1295       } // if (try to find victim and steal)
1296     } else {
1297       // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1298       typedef union {
1299         struct {
1300           UT count;
1301           T ub;
1302         } p;
1303         kmp_int64 b;
1304       } union_i4;
1305       // All operations on 'count' or 'ub' must be combined atomically
1306       // together.
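      // Packing (count, ub) into one 64-bit word means the owner's count
      // increment and a thief's ub decrement each go through a single 64-bit
      // CAS and cannot partially interleave.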
1307       {
1308         union_i4 vold, vnew;
1309         vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1310         vnew = vold;
1311         vnew.p.count++;
1312         while (!KMP_COMPARE_AND_STORE_ACQ64(
1313             (volatile kmp_int64 *)&pr->u.p.count,
1314             *VOLATILE_CAST(kmp_int64 *) & vold.b,
1315             *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1316           KMP_CPU_PAUSE();
1317           vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1318           vnew = vold;
1319           vnew.p.count++;
1320         }
1321         vnew = vold;
1322         init = vnew.p.count;
1323         status = (init < (UT)vnew.p.ub);
1324       }
1325 
1326       if (!status) {
1327         kmp_info_t **other_threads = team->t.t_threads;
1328         T while_limit = pr->u.p.parm3;
1329         T while_index = 0;
1330         T id = pr->u.p.static_steal_counter; // loop id
1331         int idx = (th->th.th_dispatch->th_disp_index - 1) %
1332                   __kmp_dispatch_num_buffers; // current loop index
1333         // note: victim thread can potentially execute another loop
1334         // TODO: algorithm of searching for a victim
1335         // should be cleaned up and measured
1336         while ((!status) && (while_limit != ++while_index)) {
1337           dispatch_private_info_template<T> *victim;
1338           union_i4 vold, vnew;
1339           T remaining;
1340           T victimIdx = pr->u.p.parm4;
1341           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1342           victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1343               &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1344           KMP_DEBUG_ASSERT(victim);
1345           while ((victim == pr || id != victim->u.p.static_steal_counter) &&
1346                  oldVictimIdx != victimIdx) {
1347             victimIdx = (victimIdx + 1) % nproc;
1348             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1349                 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1350             KMP_DEBUG_ASSERT(victim);
1351           }
1352           if (victim == pr || id != victim->u.p.static_steal_counter) {
1353             continue; // try once more (nproc attempts in total)
1354             // no victim is ready yet to participate in stealing
1355             // because no victim passed kmp_init_dispatch yet
1356           }
1357           pr->u.p.parm4 = victimIdx; // new victim found
1358           while (1) { // CAS loop if victim has enough chunks to steal
1359             vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1360             vnew = vold;
1361 
1362             KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1363             if (vnew.p.count >= (UT)vnew.p.ub ||
1364                 (remaining = vnew.p.ub - vnew.p.count) < 2) {
1365               pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1366               break; // not enough chunks to steal, goto next victim
1367             }
1368             if (remaining > 3) {
1369               // try to steal 1/4 of remaining
1370               vnew.p.ub -= remaining >> 2;
1371             } else {
1372               vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1373             }
1374             KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1375             // TODO: Should this be acquire or release?
1376             if (KMP_COMPARE_AND_STORE_ACQ64(
1377                     (volatile kmp_int64 *)&victim->u.p.count,
1378                     *VOLATILE_CAST(kmp_int64 *) & vold.b,
1379                     *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1380               // stealing succeeded
1381               KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1382                                         vold.p.ub - vnew.p.ub);
1383               status = 1;
1384               while_index = 0;
1385               // now update own count and ub
1386               init = vnew.p.ub;
1387               vold.p.count = init + 1;
1388 #if KMP_ARCH_X86
1389               KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1390 #else
1391               *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1392 #endif
1393               break;
1394             } // if (check CAS result)
            KMP_CPU_PAUSE(); // CAS failed, retry
1396           } // while (try to steal from particular victim)
1397         } // while (search for victim)
1398       } // if (try to find victim and steal)
1399     } // if (4-byte induction variable)
1400     if (!status) {
1401       *p_lb = 0;
1402       *p_ub = 0;
1403       if (p_st != NULL)
1404         *p_st = 0;
1405     } else {
1406       start = pr->u.p.parm2;
1407       init *= chunk;
1408       limit = chunk + init - 1;
1409       incr = pr->u.p.st;
1410       KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1411 
1412       KMP_DEBUG_ASSERT(init <= trip);
1413       if ((last = (limit >= trip)) != 0)
1414         limit = trip;
1415       if (p_st != NULL)
1416         *p_st = incr;
1417 
1418       if (incr == 1) {
1419         *p_lb = start + init;
1420         *p_ub = start + limit;
1421       } else {
1422         *p_lb = start + init * incr;
1423         *p_ub = start + limit * incr;
1424       }
1425 
1426       if (pr->flags.ordered) {
1427         pr->u.p.ordered_lower = init;
1428         pr->u.p.ordered_upper = limit;
1429       } // if
1430     } // if
1431     break;
1432   } // case
1433 #endif // ( KMP_STATIC_STEAL_ENABLED )
1434   case kmp_sch_static_balanced: {
1435     KD_TRACE(
1436         10,
1437         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1438          gtid));
1439     /* check if thread has any iteration to do */
1440     if ((status = !pr->u.p.count) != 0) {
1441       pr->u.p.count = 1;
1442       *p_lb = pr->u.p.lb;
1443       *p_ub = pr->u.p.ub;
1444       last = (pr->u.p.parm1 != 0);
1445       if (p_st != NULL)
1446         *p_st = pr->u.p.st;
1447     } else { /* no iterations to do */
1448       pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1449     }
1450   } // case
1451   break;
1452   case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1453                                  merged here */
1454   case kmp_sch_static_chunked: {
1455     T parm1;
1456 
1457     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1458                    "kmp_sch_static_[affinity|chunked] case\n",
1459                    gtid));
1460     parm1 = pr->u.p.parm1;
1461 
1462     trip = pr->u.p.tc - 1;
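    // Chunks are assigned round-robin: thread tid takes chunks tid,
    // tid + nproc, tid + 2*nproc, ... (count is advanced by nproc below).
    // E.g., nproc == 4, parm1 == 10: thread 1 starts its successive chunks at
    // iterations 10, 50, 90, ... of the 0..tc-1 space.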
1463     init = parm1 * (pr->u.p.count + tid);
1464 
1465     if ((status = (init <= trip)) != 0) {
1466       start = pr->u.p.lb;
1467       incr = pr->u.p.st;
1468       limit = parm1 + init - 1;
1469 
1470       if ((last = (limit >= trip)) != 0)
1471         limit = trip;
1472 
1473       if (p_st != NULL)
1474         *p_st = incr;
1475 
1476       pr->u.p.count += nproc;
1477 
1478       if (incr == 1) {
1479         *p_lb = start + init;
1480         *p_ub = start + limit;
1481       } else {
1482         *p_lb = start + init * incr;
1483         *p_ub = start + limit * incr;
1484       }
1485 
1486       if (pr->flags.ordered) {
1487         pr->u.p.ordered_lower = init;
1488         pr->u.p.ordered_upper = limit;
1489       } // if
1490     } // if
1491   } // case
1492   break;
1493 
1494   case kmp_sch_dynamic_chunked: {
1495     UT chunk_number;
1496     UT chunk_size = pr->u.p.parm1;
1497     UT nchunks = pr->u.p.parm2;
1498 
1499     KD_TRACE(
1500         100,
1501         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1502          gtid));
1503 
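    // Atomically claim the next chunk index from the shared counter; chunk k
    // covers iterations [k * chunk_size, min(trip, k * chunk_size +
    // chunk_size - 1)] of the canonical 0..tc-1 iteration space.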
1504     chunk_number = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1505     status = (chunk_number < nchunks);
1506     if (!status) {
1507       *p_lb = 0;
1508       *p_ub = 0;
1509       if (p_st != NULL)
1510         *p_st = 0;
1511     } else {
1512       init = chunk_size * chunk_number;
1513       trip = pr->u.p.tc - 1;
1514       start = pr->u.p.lb;
1515       incr = pr->u.p.st;
1516 
1517       if ((last = (trip - init < (UT)chunk_size)))
1518         limit = trip;
1519       else
1520         limit = chunk_size + init - 1;
1521 
1522       if (p_st != NULL)
1523         *p_st = incr;
1524 
1525       if (incr == 1) {
1526         *p_lb = start + init;
1527         *p_ub = start + limit;
1528       } else {
1529         *p_lb = start + init * incr;
1530         *p_ub = start + limit * incr;
1531       }
1532 
1533       if (pr->flags.ordered) {
1534         pr->u.p.ordered_lower = init;
1535         pr->u.p.ordered_upper = limit;
1536       } // if
1537     } // if
1538   } // case
1539   break;
1540 
1541   case kmp_sch_guided_iterative_chunked: {
1542     T chunkspec = pr->u.p.parm1;
1543     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1544                    "iterative case\n",
1545                    gtid));
1546     trip = pr->u.p.tc;
1547     // Start atomic part of calculations
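    // While at least parm2 iterations remain, each thread CAS-claims a chunk
    // proportional to the remaining work (remaining * parm3, i.e. remaining
    // divided by K*nproc); with fewer than parm2 left, it falls back to plain
    // dynamic chunks of chunkspec claimed via atomic add.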
1548     while (1) {
      ST remaining; // signed, because it can be < 0
1550       init = sh->u.s.iteration; // shared value
1551       remaining = trip - init;
1552       if (remaining <= 0) { // AC: need to compare with 0 first
1553         // nothing to do, don't try atomic op
1554         status = 0;
1555         break;
1556       }
1557       if ((T)remaining <
1558           pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1559         // use dynamic-style schedule
1560         // atomically increment iterations, get old value
1561         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1562                                  (ST)chunkspec);
1563         remaining = trip - init;
1564         if (remaining <= 0) {
          status = 0; // all iterations were taken by other threads
1566         } else {
1567           // got some iterations to work on
1568           status = 1;
1569           if ((T)remaining > chunkspec) {
1570             limit = init + chunkspec - 1;
1571           } else {
1572             last = true; // the last chunk
1573             limit = init + remaining - 1;
1574           } // if
1575         } // if
1576         break;
1577       } // if
1578       limit = init + (UT)((double)remaining *
1579                           *(double *)&pr->u.p.parm3); // divide by K*nproc
1580       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1581                                (ST)init, (ST)limit)) {
1582         // CAS was successful, chunk obtained
1583         status = 1;
1584         --limit;
1585         break;
1586       } // if
1587     } // while
1588     if (status != 0) {
1589       start = pr->u.p.lb;
1590       incr = pr->u.p.st;
1591       if (p_st != NULL)
1592         *p_st = incr;
1593       *p_lb = start + init * incr;
1594       *p_ub = start + limit * incr;
1595       if (pr->flags.ordered) {
1596         pr->u.p.ordered_lower = init;
1597         pr->u.p.ordered_upper = limit;
1598       } // if
1599     } else {
1600       *p_lb = 0;
1601       *p_ub = 0;
1602       if (p_st != NULL)
1603         *p_st = 0;
1604     } // if
1605   } // case
1606   break;
1607 
1608   case kmp_sch_guided_simd: {
    // same as the iterative case, but the current chunk is adjusted to be a
    // multiple of the given chunk
1611     T chunk = pr->u.p.parm1;
1612     KD_TRACE(100,
1613              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1614               gtid));
1615     trip = pr->u.p.tc;
1616     // Start atomic part of calculations
1617     while (1) {
      ST remaining; // signed, because it can be < 0
1619       init = sh->u.s.iteration; // shared value
1620       remaining = trip - init;
1621       if (remaining <= 0) { // AC: need to compare with 0 first
1622         status = 0; // nothing to do, don't try atomic op
1623         break;
1624       }
1625       KMP_DEBUG_ASSERT(init % chunk == 0);
1626       // compare with K*nproc*(chunk+1), K=2 by default
1627       if ((T)remaining < pr->u.p.parm2) {
1628         // use dynamic-style schedule
1629         // atomically increment iterations, get old value
1630         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1631                                  (ST)chunk);
1632         remaining = trip - init;
1633         if (remaining <= 0) {
          status = 0; // all iterations were taken by other threads
1635         } else {
1636           // got some iterations to work on
1637           status = 1;
1638           if ((T)remaining > chunk) {
1639             limit = init + chunk - 1;
1640           } else {
1641             last = true; // the last chunk
1642             limit = init + remaining - 1;
1643           } // if
1644         } // if
1645         break;
1646       } // if
1647       // divide by K*nproc
1648       UT span;
1649       __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3),
1650                          &span);
1651       UT rem = span % chunk;
1652       if (rem) // adjust so that span%chunk == 0
1653         span += chunk - rem;
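      // e.g., chunk == 8, span == 21: rem == 5, so span is rounded up to 24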
1654       limit = init + span;
1655       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1656                                (ST)init, (ST)limit)) {
1657         // CAS was successful, chunk obtained
1658         status = 1;
1659         --limit;
1660         break;
1661       } // if
1662     } // while
1663     if (status != 0) {
1664       start = pr->u.p.lb;
1665       incr = pr->u.p.st;
1666       if (p_st != NULL)
1667         *p_st = incr;
1668       *p_lb = start + init * incr;
1669       *p_ub = start + limit * incr;
1670       if (pr->flags.ordered) {
1671         pr->u.p.ordered_lower = init;
1672         pr->u.p.ordered_upper = limit;
1673       } // if
1674     } else {
1675       *p_lb = 0;
1676       *p_ub = 0;
1677       if (p_st != NULL)
1678         *p_st = 0;
1679     } // if
1680   } // case
1681   break;
1682 
1683   case kmp_sch_guided_analytical_chunked: {
1684     T chunkspec = pr->u.p.parm1;
1685     UT chunkIdx;
1686 #if KMP_USE_X87CONTROL
1687     /* for storing original FPCW value for Windows* OS on
1688        IA-32 architecture 8-byte version */
1689     unsigned int oldFpcw;
1690     unsigned int fpcwSet = 0;
1691 #endif
1692     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1693                    "kmp_sch_guided_analytical_chunked case\n",
1694                    gtid));
1695 
1696     trip = pr->u.p.tc;
1697 
1698     KMP_DEBUG_ASSERT(nproc > 1);
1699     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1700 
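    // Two regimes: once chunkIdx reaches parm2, the remaining tail is handed
    // out as plain dynamic chunks of chunkspec iterations; before that, chunk
    // boundaries are derived analytically via
    // __kmp_dispatch_guided_remaining() from the factor stored in parm3.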
1701     while (1) { /* this while loop is a safeguard against unexpected zero
1702                    chunk sizes */
1703       chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1704       if (chunkIdx >= (UT)pr->u.p.parm2) {
1705         --trip;
1706         /* use dynamic-style scheduling */
1707         init = chunkIdx * chunkspec + pr->u.p.count;
1708         /* need to verify init > 0 in case of overflow in the above
1709          * calculation */
1710         if ((status = (init > 0 && init <= trip)) != 0) {
1711           limit = init + chunkspec - 1;
1712 
1713           if ((last = (limit >= trip)) != 0)
1714             limit = trip;
1715         }
1716         break;
1717       } else {
1718 /* use exponential-style scheduling */
/* The following check works around the lack of long double precision on
   Windows* OS, which could otherwise cause init != 0 for chunkIdx == 0.
 */
1723 #if KMP_USE_X87CONTROL
1724         /* If we haven't already done so, save original
1725            FPCW and set precision to 64-bit, as Windows* OS
1726            on IA-32 architecture defaults to 53-bit */
1727         if (!fpcwSet) {
1728           oldFpcw = _control87(0, 0);
1729           _control87(_PC_64, _MCW_PC);
1730           fpcwSet = 0x30000;
1731         }
1732 #endif
1733         if (chunkIdx) {
1734           init = __kmp_dispatch_guided_remaining<T>(
1735               trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1736           KMP_DEBUG_ASSERT(init);
1737           init = trip - init;
1738         } else
1739           init = 0;
1740         limit = trip - __kmp_dispatch_guided_remaining<T>(
1741                            trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1742         KMP_ASSERT(init <= limit);
1743         if (init < limit) {
1744           KMP_DEBUG_ASSERT(limit <= trip);
1745           --limit;
1746           status = 1;
1747           break;
1748         } // if
1749       } // if
1750     } // while (1)
1751 #if KMP_USE_X87CONTROL
1752     /* restore FPCW if necessary
1753        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1754     */
1755     if (fpcwSet && (oldFpcw & fpcwSet))
1756       _control87(oldFpcw, _MCW_PC);
1757 #endif
1758     if (status != 0) {
1759       start = pr->u.p.lb;
1760       incr = pr->u.p.st;
1761       if (p_st != NULL)
1762         *p_st = incr;
1763       *p_lb = start + init * incr;
1764       *p_ub = start + limit * incr;
1765       if (pr->flags.ordered) {
1766         pr->u.p.ordered_lower = init;
1767         pr->u.p.ordered_upper = limit;
1768       }
1769     } else {
1770       *p_lb = 0;
1771       *p_ub = 0;
1772       if (p_st != NULL)
1773         *p_st = 0;
1774     }
1775   } // case
1776   break;
1777 
1778   case kmp_sch_trapezoidal: {
1779     UT index;
1780     T parm2 = pr->u.p.parm2;
1781     T parm3 = pr->u.p.parm3;
1782     T parm4 = pr->u.p.parm4;
1783     KD_TRACE(100,
1784              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1785               gtid));
1786 
1787     index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1788 
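    // Chunk sizes decrease linearly: chunk k spans parm2 - k * parm4
    // iterations and parm3 is the total number of chunks, so init/limit below
    // are prefix sums of the chunk sizes.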
1789     init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
1790     trip = pr->u.p.tc - 1;
1791 
1792     if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1793       *p_lb = 0;
1794       *p_ub = 0;
1795       if (p_st != NULL)
1796         *p_st = 0;
1797     } else {
1798       start = pr->u.p.lb;
1799       limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1800       incr = pr->u.p.st;
1801 
1802       if ((last = (limit >= trip)) != 0)
1803         limit = trip;
1804 
1805       if (p_st != NULL)
1806         *p_st = incr;
1807 
1808       if (incr == 1) {
1809         *p_lb = start + init;
1810         *p_ub = start + limit;
1811       } else {
1812         *p_lb = start + init * incr;
1813         *p_ub = start + limit * incr;
1814       }
1815 
1816       if (pr->flags.ordered) {
1817         pr->u.p.ordered_lower = init;
1818         pr->u.p.ordered_upper = limit;
1819       } // if
1820     } // if
1821   } // case
1822   break;
1823   default: {
    status = 0; // to avoid complaints about uninitialized variable use
1825     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1826                 KMP_HNT(GetNewerLibrary), // Hint
1827                 __kmp_msg_null // Variadic argument list terminator
1828     );
1829   } break;
1830   } // switch
1831   if (p_last)
1832     *p_last = last;
1833 #ifdef KMP_DEBUG
1834   if (pr->flags.ordered) {
1835     char *buff;
1836     // create format specifiers before the debug output
1837     buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1838                             "ordered_lower:%%%s ordered_upper:%%%s\n",
1839                             traits_t<UT>::spec, traits_t<UT>::spec);
1840     KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1841     __kmp_str_free(&buff);
1842   }
1843   {
1844     char *buff;
1845     // create format specifiers before the debug output
1846     buff = __kmp_str_format(
1847         "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1848         "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1849         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1850     KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1851     __kmp_str_free(&buff);
1852   }
1853 #endif
1854   return status;
1855 }
1856 
1857 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1858    work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1859    is not called. */
1860 #if OMPT_SUPPORT && OMPT_OPTIONAL
1861 #define OMPT_LOOP_END                                                          \
1862   if (status == 0) {                                                           \
1863     if (ompt_enabled.ompt_callback_work) {                                     \
1864       ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
1865       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
1866       ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
1867           ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
1868           &(task_info->task_data), 0, codeptr);                                \
1869     }                                                                          \
1870   }
1871 // TODO: implement count
1872 #else
1873 #define OMPT_LOOP_END // no-op
1874 #endif
1875 
1876 #if KMP_STATS_ENABLED
1877 #define KMP_STATS_LOOP_END                                                     \
1878   {                                                                            \
1879     kmp_int64 u, l, t, i;                                                      \
1880     l = (kmp_int64)(*p_lb);                                                    \
1881     u = (kmp_int64)(*p_ub);                                                    \
1882     i = (kmp_int64)(pr->u.p.st);                                               \
1883     if (status == 0) {                                                         \
1884       t = 0;                                                                   \
1885       KMP_POP_PARTITIONED_TIMER();                                             \
1886     } else if (i == 1) {                                                       \
1887       if (u >= l)                                                              \
1888         t = u - l + 1;                                                         \
1889       else                                                                     \
1890         t = 0;                                                                 \
1891     } else if (i < 0) {                                                        \
1892       if (l >= u)                                                              \
1893         t = (l - u) / (-i) + 1;                                                \
1894       else                                                                     \
1895         t = 0;                                                                 \
1896     } else {                                                                   \
1897       if (u >= l)                                                              \
1898         t = (u - l) / i + 1;                                                   \
1899       else                                                                     \
1900         t = 0;                                                                 \
1901     }                                                                          \
1902     KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
1903   }
1904 #else
1905 #define KMP_STATS_LOOP_END /* Nothing */
1906 #endif
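// KMP_STATS_LOOP_END examples: l = 0, u = 9, i = 1 gives t = 10 iterations;
// l = 10, u = 1, i = -2 gives t = (10 - 1) / 2 + 1 = 5 (10, 8, 6, 4, 2).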
1907 
1908 template <typename T>
1909 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1910                                T *p_lb, T *p_ub,
1911                                typename traits_t<T>::signed_t *p_st
1912 #if OMPT_SUPPORT && OMPT_OPTIONAL
1913                                ,
1914                                void *codeptr
1915 #endif
1916 ) {
1917 
1918   typedef typename traits_t<T>::unsigned_t UT;
1919   typedef typename traits_t<T>::signed_t ST;
  // This is potentially slightly misleading: schedule(runtime) will appear
  // here even if the actual runtime schedule is static. (Which points out a
  // disadvantage of schedule(runtime): even when static scheduling is used, it
  // costs more than a compile-time choice to use static scheduling would.)
1924   KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
1925 
1926   int status;
1927   dispatch_private_info_template<T> *pr;
1928   __kmp_assert_valid_gtid(gtid);
1929   kmp_info_t *th = __kmp_threads[gtid];
1930   kmp_team_t *team = th->th.th_team;
1931 
1932   KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1933   KD_TRACE(
1934       1000,
1935       ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
1936        gtid, p_lb, p_ub, p_st, p_last));
1937 
1938   if (team->t.t_serialized) {
1939     /* NOTE: serialize this dispatch because we are not at the active level */
1940     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1941         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1942     KMP_DEBUG_ASSERT(pr);
1943 
1944     if ((status = (pr->u.p.tc != 0)) == 0) {
1945       *p_lb = 0;
1946       *p_ub = 0;
1947       //            if ( p_last != NULL )
1948       //                *p_last = 0;
1949       if (p_st != NULL)
1950         *p_st = 0;
1951       if (__kmp_env_consistency_check) {
1952         if (pr->pushed_ws != ct_none) {
1953           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1954         }
1955       }
1956     } else if (pr->flags.nomerge) {
1957       kmp_int32 last;
1958       T start;
1959       UT limit, trip, init;
1960       ST incr;
1961       T chunk = pr->u.p.parm1;
1962 
1963       KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1964                      gtid));
1965 
1966       init = chunk * pr->u.p.count++;
1967       trip = pr->u.p.tc - 1;
1968 
1969       if ((status = (init <= trip)) == 0) {
1970         *p_lb = 0;
1971         *p_ub = 0;
1972         //                if ( p_last != NULL )
1973         //                    *p_last = 0;
1974         if (p_st != NULL)
1975           *p_st = 0;
1976         if (__kmp_env_consistency_check) {
1977           if (pr->pushed_ws != ct_none) {
1978             pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1979           }
1980         }
1981       } else {
1982         start = pr->u.p.lb;
1983         limit = chunk + init - 1;
1984         incr = pr->u.p.st;
1985 
1986         if ((last = (limit >= trip)) != 0) {
1987           limit = trip;
1988 #if KMP_OS_WINDOWS
1989           pr->u.p.last_upper = pr->u.p.ub;
1990 #endif /* KMP_OS_WINDOWS */
1991         }
1992         if (p_last != NULL)
1993           *p_last = last;
1994         if (p_st != NULL)
1995           *p_st = incr;
1996         if (incr == 1) {
1997           *p_lb = start + init;
1998           *p_ub = start + limit;
1999         } else {
2000           *p_lb = start + init * incr;
2001           *p_ub = start + limit * incr;
2002         }
2003 
2004         if (pr->flags.ordered) {
2005           pr->u.p.ordered_lower = init;
2006           pr->u.p.ordered_upper = limit;
2007 #ifdef KMP_DEBUG
2008           {
2009             char *buff;
2010             // create format specifiers before the debug output
2011             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2012                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
2013                                     traits_t<UT>::spec, traits_t<UT>::spec);
2014             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2015                             pr->u.p.ordered_upper));
2016             __kmp_str_free(&buff);
2017           }
2018 #endif
2019         } // if
2020       } // if
2021     } else {
2022       pr->u.p.tc = 0;
2023       *p_lb = pr->u.p.lb;
2024       *p_ub = pr->u.p.ub;
2025 #if KMP_OS_WINDOWS
2026       pr->u.p.last_upper = *p_ub;
2027 #endif /* KMP_OS_WINDOWS */
2028       if (p_last != NULL)
2029         *p_last = TRUE;
2030       if (p_st != NULL)
2031         *p_st = pr->u.p.st;
2032     } // if
2033 #ifdef KMP_DEBUG
2034     {
2035       char *buff;
2036       // create format specifiers before the debug output
2037       buff = __kmp_str_format(
2038           "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2039           "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
2040           traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2041       KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,
2042                     (p_last ? *p_last : 0), status));
2043       __kmp_str_free(&buff);
2044     }
2045 #endif
2046 #if INCLUDE_SSC_MARKS
2047     SSC_MARK_DISPATCH_NEXT();
2048 #endif
2049     OMPT_LOOP_END;
2050     KMP_STATS_LOOP_END;
2051     return status;
2052   } else {
2053     kmp_int32 last = 0;
2054     dispatch_shared_info_template<T> volatile *sh;
2055 
2056     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2057                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2058 
2059     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2060         th->th.th_dispatch->th_dispatch_pr_current);
2061     KMP_DEBUG_ASSERT(pr);
2062     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2063         th->th.th_dispatch->th_dispatch_sh_current);
2064     KMP_DEBUG_ASSERT(sh);
2065 
2066 #if KMP_USE_HIER_SCHED
2067     if (pr->flags.use_hier)
2068       status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2069     else
2070 #endif // KMP_USE_HIER_SCHED
2071       status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2072                                                 p_st, th->th.th_team_nproc,
2073                                                 th->th.th_info.ds.ds_tid);
2074     // status == 0: no more iterations to execute
2075     if (status == 0) {
2076       UT num_done;
2077 
2078       num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2079 #ifdef KMP_DEBUG
2080       {
2081         char *buff;
2082         // create format specifiers before the debug output
2083         buff = __kmp_str_format(
2084             "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2085             traits_t<UT>::spec);
2086         KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2087         __kmp_str_free(&buff);
2088       }
2089 #endif
2090 
2091 #if KMP_USE_HIER_SCHED
2092       pr->flags.use_hier = FALSE;
2093 #endif
2094       if ((ST)num_done == th->th.th_team_nproc - 1) {
2095 #if (KMP_STATIC_STEAL_ENABLED)
2096         if (pr->schedule == kmp_sch_static_steal &&
2097             traits_t<T>::type_size > 4) {
2098           int i;
2099           int idx = (th->th.th_dispatch->th_disp_index - 1) %
2100                     __kmp_dispatch_num_buffers; // current loop index
2101           kmp_info_t **other_threads = team->t.t_threads;
2102           // loop complete, safe to destroy locks used for stealing
2103           for (i = 0; i < th->th.th_team_nproc; ++i) {
2104             dispatch_private_info_template<T> *buf =
2105                 reinterpret_cast<dispatch_private_info_template<T> *>(
2106                     &other_threads[i]->th.th_dispatch->th_disp_buffer[idx]);
2107             kmp_lock_t *lck = buf->u.p.th_steal_lock;
2108             KMP_ASSERT(lck != NULL);
2109             __kmp_destroy_lock(lck);
2110             __kmp_free(lck);
2111             buf->u.p.th_steal_lock = NULL;
2112           }
2113         }
2114 #endif
2115         /* NOTE: release this buffer to be reused */
2116 
2117         KMP_MB(); /* Flush all pending memory write invalidates.  */
2118 
2119         sh->u.s.num_done = 0;
2120         sh->u.s.iteration = 0;
2121 
2122         /* TODO replace with general release procedure? */
2123         if (pr->flags.ordered) {
2124           sh->u.s.ordered_iteration = 0;
2125         }
2126 
2127         KMP_MB(); /* Flush all pending memory write invalidates.  */
2128 
2129         sh->buffer_index += __kmp_dispatch_num_buffers;
2130         KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2131                        gtid, sh->buffer_index));
2132 
2133         KMP_MB(); /* Flush all pending memory write invalidates.  */
2134 
2135       } // if
2136       if (__kmp_env_consistency_check) {
2137         if (pr->pushed_ws != ct_none) {
2138           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2139         }
2140       }
2141 
2142       th->th.th_dispatch->th_deo_fcn = NULL;
2143       th->th.th_dispatch->th_dxo_fcn = NULL;
2144       th->th.th_dispatch->th_dispatch_sh_current = NULL;
2145       th->th.th_dispatch->th_dispatch_pr_current = NULL;
2146     } // if (status == 0)
2147 #if KMP_OS_WINDOWS
2148     else if (last) {
2149       pr->u.p.last_upper = pr->u.p.ub;
2150     }
2151 #endif /* KMP_OS_WINDOWS */
2152     if (p_last != NULL && status != 0)
2153       *p_last = last;
2154   } // if
2155 
2156 #ifdef KMP_DEBUG
2157   {
2158     char *buff;
2159     // create format specifiers before the debug output
2160     buff = __kmp_str_format(
2161         "__kmp_dispatch_next: T#%%d normal case: "
2162         "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2163         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2164     KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2165                   (p_last ? *p_last : 0), status));
2166     __kmp_str_free(&buff);
2167   }
2168 #endif
2169 #if INCLUDE_SSC_MARKS
2170   SSC_MARK_DISPATCH_NEXT();
2171 #endif
2172   OMPT_LOOP_END;
2173   KMP_STATS_LOOP_END;
2174   return status;
2175 }
2176 
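// Compute the per-team bounds for the distribute part of a composite
// "distribute parallel for": the global [*plower, *pupper] range is split
// across the nteams teams of the league (evenly with extras when __kmp_static
// is kmp_sch_static_balanced, in equal ceiling-sized blocks otherwise), and
// *plastiter is set for the team that owns the last iteration.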
2177 template <typename T>
2178 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2179                                   kmp_int32 *plastiter, T *plower, T *pupper,
2180                                   typename traits_t<T>::signed_t incr) {
2181   typedef typename traits_t<T>::unsigned_t UT;
2182   kmp_uint32 team_id;
2183   kmp_uint32 nteams;
2184   UT trip_count;
2185   kmp_team_t *team;
2186   kmp_info_t *th;
2187 
2188   KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2189   KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2190 #ifdef KMP_DEBUG
2191   typedef typename traits_t<T>::signed_t ST;
2192   {
2193     char *buff;
2194     // create format specifiers before the debug output
2195     buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2196                             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2197                             traits_t<T>::spec, traits_t<T>::spec,
2198                             traits_t<ST>::spec, traits_t<T>::spec);
2199     KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2200     __kmp_str_free(&buff);
2201   }
2202 #endif
2203 
2204   if (__kmp_env_consistency_check) {
2205     if (incr == 0) {
2206       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2207                             loc);
2208     }
2209     if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2210       // The loop is illegal.
      // Some zero-trip loops are maintained by the compiler, e.g.:
2212       //   for(i=10;i<0;++i) // lower >= upper - run-time check
2213       //   for(i=0;i>10;--i) // lower <= upper - run-time check
2214       //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2215       //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2216       // Compiler does not check the following illegal loops:
2217       //   for(i=0;i<10;i+=incr) // where incr<0
2218       //   for(i=10;i>0;i-=incr) // where incr<0
2219       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2220     }
2221   }
2222   __kmp_assert_valid_gtid(gtid);
2223   th = __kmp_threads[gtid];
2224   team = th->th.th_team;
2225   KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2226   nteams = th->th.th_teams_size.nteams;
2227   team_id = team->t.t_master_tid;
2228   KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2229 
2230   // compute global trip count
2231   if (incr == 1) {
2232     trip_count = *pupper - *plower + 1;
2233   } else if (incr == -1) {
2234     trip_count = *plower - *pupper + 1;
2235   } else if (incr > 0) {
    // upper - lower can exceed the limit of the signed type
2237     trip_count = (UT)(*pupper - *plower) / incr + 1;
2238   } else {
2239     trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2240   }
2241 
2242   if (trip_count <= nteams) {
2243     KMP_DEBUG_ASSERT(
2244         __kmp_static == kmp_sch_static_greedy ||
2245         __kmp_static ==
2246             kmp_sch_static_balanced); // Unknown static scheduling type.
    // only some teams get a single iteration; the others get nothing
2248     if (team_id < trip_count) {
2249       *pupper = *plower = *plower + team_id * incr;
2250     } else {
2251       *plower = *pupper + incr; // zero-trip loop
2252     }
2253     if (plastiter != NULL)
2254       *plastiter = (team_id == trip_count - 1);
2255   } else {
2256     if (__kmp_static == kmp_sch_static_balanced) {
2257       UT chunk = trip_count / nteams;
2258       UT extras = trip_count % nteams;
2259       *plower +=
2260           incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2261       *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2262       if (plastiter != NULL)
2263         *plastiter = (team_id == nteams - 1);
2264     } else {
2265       T chunk_inc_count =
2266           (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2267       T upper = *pupper;
2268       KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2269       // Unknown static scheduling type.
2270       *plower += team_id * chunk_inc_count;
2271       *pupper = *plower + chunk_inc_count - incr;
2272       // Check/correct bounds if needed
2273       if (incr > 0) {
2274         if (*pupper < *plower)
2275           *pupper = traits_t<T>::max_value;
2276         if (plastiter != NULL)
2277           *plastiter = *plower <= upper && *pupper > upper - incr;
2278         if (*pupper > upper)
2279           *pupper = upper; // tracker C73258
2280       } else {
2281         if (*pupper > *plower)
2282           *pupper = traits_t<T>::min_value;
2283         if (plastiter != NULL)
2284           *plastiter = *plower >= upper && *pupper < upper - incr;
2285         if (*pupper < upper)
2286           *pupper = upper; // tracker C73258
2287       }
2288     }
2289   }
2290 }
2291 
2292 //-----------------------------------------------------------------------------
2293 // Dispatch routines
2294 //    Transfer call to template< type T >
2295 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2296 //                         T lb, T ub, ST st, ST chunk )
2297 extern "C" {
2298 
2299 /*!
2300 @ingroup WORK_SHARING
2301 @{
2302 @param loc Source location
2303 @param gtid Global thread id
2304 @param schedule Schedule type
2305 @param lb  Lower bound
2306 @param ub  Upper bound
2307 @param st  Step (or increment if you prefer)
2308 @param chunk The chunk size to block with
2309 
2310 This function prepares the runtime to start a dynamically scheduled for loop,
2311 saving the loop arguments.
2312 These functions are all identical apart from the types of the arguments.
2313 */
2314 
2315 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2316                             enum sched_type schedule, kmp_int32 lb,
2317                             kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2318   KMP_DEBUG_ASSERT(__kmp_init_serial);
2319 #if OMPT_SUPPORT && OMPT_OPTIONAL
2320   OMPT_STORE_RETURN_ADDRESS(gtid);
2321 #endif
2322   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2323 }
2324 /*!
2325 See @ref __kmpc_dispatch_init_4
2326 */
2327 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2328                              enum sched_type schedule, kmp_uint32 lb,
2329                              kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2330   KMP_DEBUG_ASSERT(__kmp_init_serial);
2331 #if OMPT_SUPPORT && OMPT_OPTIONAL
2332   OMPT_STORE_RETURN_ADDRESS(gtid);
2333 #endif
2334   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2335 }
2336 
2337 /*!
2338 See @ref __kmpc_dispatch_init_4
2339 */
2340 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2341                             enum sched_type schedule, kmp_int64 lb,
2342                             kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2343   KMP_DEBUG_ASSERT(__kmp_init_serial);
2344 #if OMPT_SUPPORT && OMPT_OPTIONAL
2345   OMPT_STORE_RETURN_ADDRESS(gtid);
2346 #endif
2347   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2348 }
2349 
2350 /*!
2351 See @ref __kmpc_dispatch_init_4
2352 */
2353 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2354                              enum sched_type schedule, kmp_uint64 lb,
2355                              kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2356   KMP_DEBUG_ASSERT(__kmp_init_serial);
2357 #if OMPT_SUPPORT && OMPT_OPTIONAL
2358   OMPT_STORE_RETURN_ADDRESS(gtid);
2359 #endif
2360   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2361 }
2362 
2363 /*!
2364 See @ref __kmpc_dispatch_init_4
2365 
The difference from the __kmpc_dispatch_init set of functions is that these
functions are called for the composite distribute parallel for construct.
Thus, before dispatching the regular iterations, we need to compute the
per-team iteration space.
2369 
2370 These functions are all identical apart from the types of the arguments.
2371 */
2372 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2373                                  enum sched_type schedule, kmp_int32 *p_last,
2374                                  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2375                                  kmp_int32 chunk) {
2376   KMP_DEBUG_ASSERT(__kmp_init_serial);
2377 #if OMPT_SUPPORT && OMPT_OPTIONAL
2378   OMPT_STORE_RETURN_ADDRESS(gtid);
2379 #endif
2380   __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2381   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2382 }
2383 
2384 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2385                                   enum sched_type schedule, kmp_int32 *p_last,
2386                                   kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2387                                   kmp_int32 chunk) {
2388   KMP_DEBUG_ASSERT(__kmp_init_serial);
2389 #if OMPT_SUPPORT && OMPT_OPTIONAL
2390   OMPT_STORE_RETURN_ADDRESS(gtid);
2391 #endif
2392   __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2393   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2394 }
2395 
2396 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2397                                  enum sched_type schedule, kmp_int32 *p_last,
2398                                  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2399                                  kmp_int64 chunk) {
2400   KMP_DEBUG_ASSERT(__kmp_init_serial);
2401 #if OMPT_SUPPORT && OMPT_OPTIONAL
2402   OMPT_STORE_RETURN_ADDRESS(gtid);
2403 #endif
2404   __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2405   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2406 }
2407 
2408 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2409                                   enum sched_type schedule, kmp_int32 *p_last,
2410                                   kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2411                                   kmp_int64 chunk) {
2412   KMP_DEBUG_ASSERT(__kmp_init_serial);
2413 #if OMPT_SUPPORT && OMPT_OPTIONAL
2414   OMPT_STORE_RETURN_ADDRESS(gtid);
2415 #endif
2416   __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2417   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2418 }
2419 
2420 /*!
2421 @param loc Source code location
2422 @param gtid Global thread id
2423 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2424 otherwise
2425 @param p_lb   Pointer to the lower bound for the next chunk of work
2426 @param p_ub   Pointer to the upper bound for the next chunk of work
2427 @param p_st   Pointer to the stride for the next chunk of work
2428 @return one if there is work to be done, zero otherwise
2429 
2430 Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then lb, ub and stride need not be modified.
2432 */
2433 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2434                            kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2435 #if OMPT_SUPPORT && OMPT_OPTIONAL
2436   OMPT_STORE_RETURN_ADDRESS(gtid);
2437 #endif
2438   return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2439 #if OMPT_SUPPORT && OMPT_OPTIONAL
2440                                         ,
2441                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2442 #endif
2443   );
2444 }
2445 
2446 /*!
2447 See @ref __kmpc_dispatch_next_4
2448 */
2449 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2450                             kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2451                             kmp_int32 *p_st) {
2452 #if OMPT_SUPPORT && OMPT_OPTIONAL
2453   OMPT_STORE_RETURN_ADDRESS(gtid);
2454 #endif
2455   return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2456 #if OMPT_SUPPORT && OMPT_OPTIONAL
2457                                          ,
2458                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2459 #endif
2460   );
2461 }
2462 
2463 /*!
2464 See @ref __kmpc_dispatch_next_4
2465 */
2466 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2467                            kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2468 #if OMPT_SUPPORT && OMPT_OPTIONAL
2469   OMPT_STORE_RETURN_ADDRESS(gtid);
2470 #endif
2471   return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2472 #if OMPT_SUPPORT && OMPT_OPTIONAL
2473                                         ,
2474                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2475 #endif
2476   );
2477 }
2478 
2479 /*!
2480 See @ref __kmpc_dispatch_next_4
2481 */
2482 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2483                             kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2484                             kmp_int64 *p_st) {
2485 #if OMPT_SUPPORT && OMPT_OPTIONAL
2486   OMPT_STORE_RETURN_ADDRESS(gtid);
2487 #endif
2488   return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2489 #if OMPT_SUPPORT && OMPT_OPTIONAL
2490                                          ,
2491                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2492 #endif
2493   );
2494 }
2495 
2496 /*!
2497 @param loc Source code location
2498 @param gtid Global thread id
2499 
2500 Mark the end of a dynamic loop.
2501 */
2502 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2503   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2504 }
2505 
2506 /*!
2507 See @ref __kmpc_dispatch_fini_4
2508 */
2509 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2510   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2511 }
2512 
2513 /*!
2514 See @ref __kmpc_dispatch_fini_4
2515 */
2516 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2517   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2518 }
2519 
2520 /*!
2521 See @ref __kmpc_dispatch_fini_4
2522 */
2523 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2524   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2525 }
2526 /*! @} */
2527 
2528 //-----------------------------------------------------------------------------
2529 // Non-template routines from kmp_dispatch.cpp used in other sources
2530 
2531 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2532   return value == checker;
2533 }
2534 
2535 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2536   return value != checker;
2537 }
2538 
2539 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2540   return value < checker;
2541 }
2542 
2543 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2544   return value >= checker;
2545 }
2546 
2547 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2548   return value <= checker;
2549 }
2550 
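// Example: __kmp_wait_4(&flag, 1, __kmp_eq_4, NULL) spins (yielding when
// oversubscribed) until flag becomes 1 and returns the observed value.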
2551 kmp_uint32
2552 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2553              kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2554              void *obj // Higher-level synchronization object, or NULL.
2555 ) {
2556   // note: we may not belong to a team at this point
2557   volatile kmp_uint32 *spin = spinner;
2558   kmp_uint32 check = checker;
2559   kmp_uint32 spins;
2560   kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2561   kmp_uint32 r;
2562 
2563   KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2564   KMP_INIT_YIELD(spins);
2565   // main wait spin loop
2566   while (!f(r = TCR_4(*spin), check)) {
2567     KMP_FSYNC_SPIN_PREPARE(obj);
2568     /* GEH - remove this since it was accidentally introduced when kmp_wait was
2569        split. It causes problems with infinite recursion because of exit lock */
2570     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2571         __kmp_abort_thread(); */
2572     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2573   }
2574   KMP_FSYNC_SPIN_ACQUIRED(obj);
2575   return r;
2576 }
2577 
2578 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
2579                       kmp_uint32 (*pred)(void *, kmp_uint32),
2580                       void *obj // Higher-level synchronization object, or NULL.
2581 ) {
2582   // note: we may not belong to a team at this point
2583   void *spin = spinner;
2584   kmp_uint32 check = checker;
2585   kmp_uint32 spins;
2586   kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2587 
2588   KMP_FSYNC_SPIN_INIT(obj, spin);
2589   KMP_INIT_YIELD(spins);
2590   // main wait spin loop
2591   while (!f(spin, check)) {
2592     KMP_FSYNC_SPIN_PREPARE(obj);
    /* if we have waited a bit, or are oversubscribed, yield */
2594     /* pause is in the following code */
2595     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2596   }
2597   KMP_FSYNC_SPIN_ACQUIRED(obj);
2598 }
2599 
2600 } // extern "C"
2601 
2602 #ifdef KMP_GOMP_COMPAT
2603 
2604 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2605                                enum sched_type schedule, kmp_int32 lb,
2606                                kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2607                                int push_ws) {
2608   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2609                                  push_ws);
2610 }
2611 
2612 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2613                                 enum sched_type schedule, kmp_uint32 lb,
2614                                 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2615                                 int push_ws) {
2616   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2617                                   push_ws);
2618 }
2619 
2620 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2621                                enum sched_type schedule, kmp_int64 lb,
2622                                kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2623                                int push_ws) {
2624   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2625                                  push_ws);
2626 }
2627 
2628 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2629                                 enum sched_type schedule, kmp_uint64 lb,
2630                                 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2631                                 int push_ws) {
2632   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2633                                   push_ws);
2634 }
2635 
2636 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2637   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2638 }
2639 
2640 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2641   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2642 }
2643 
2644 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2645   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2646 }
2647 
2648 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2649   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2650 }
2651 
2652 #endif /* KMP_GOMP_COMPAT */
2653 
2654 /* ------------------------------------------------------------------------ */
2655