1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 /* Dynamic scheduling initialization and dispatch.
14  *
 * NOTE: __kmp_nth is a constant inside of any dispatch loop; however,
 *       it may change values between parallel regions.  __kmp_max_nth
 *       is the largest value __kmp_nth may take, and 1 is the smallest.
18  */
19 
20 #include "kmp.h"
21 #include "kmp_error.h"
22 #include "kmp_i18n.h"
23 #include "kmp_itt.h"
24 #include "kmp_stats.h"
25 #include "kmp_str.h"
26 #if KMP_USE_X87CONTROL
27 #include <float.h>
28 #endif
29 #include "kmp_lock.h"
30 #include "kmp_dispatch.h"
31 #if KMP_USE_HIER_SCHED
32 #include "kmp_dispatch_hier.h"
33 #endif
34 
35 #if OMPT_SUPPORT
36 #include "ompt-specific.h"
37 #endif
38 
39 /* ------------------------------------------------------------------------ */
40 /* ------------------------------------------------------------------------ */
41 
42 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43   kmp_info_t *th;
44 
45   KMP_DEBUG_ASSERT(gtid_ref);
46 
47   if (__kmp_env_consistency_check) {
48     th = __kmp_threads[*gtid_ref];
49     if (th->th.th_root->r.r_active &&
50         (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51 #if KMP_USE_DYNAMIC_LOCK
52       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53 #else
54       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55 #endif
56     }
57   }
58 }
59 
60 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61   kmp_info_t *th;
62 
63   if (__kmp_env_consistency_check) {
64     th = __kmp_threads[*gtid_ref];
65     if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66       __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67     }
68   }
69 }
70 
71 // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
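// For illustration: kmp_sch_dynamic_chunked with the nonmonotonic modifier
// bit set maps to SCHEDULE_NONMONOTONIC, while plain kmp_sch_dynamic_chunked
// (no modifier bits) falls back to the monotonic default chosen below.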
72 static inline int __kmp_get_monotonicity(enum sched_type schedule,
73                                          bool use_hier = false) {
74   // Pick up the nonmonotonic/monotonic bits from the scheduling type
75   int monotonicity;
76   // default to monotonic
77   monotonicity = SCHEDULE_MONOTONIC;
78   if (SCHEDULE_HAS_NONMONOTONIC(schedule))
79     monotonicity = SCHEDULE_NONMONOTONIC;
80   else if (SCHEDULE_HAS_MONOTONIC(schedule))
81     monotonicity = SCHEDULE_MONOTONIC;
82   return monotonicity;
83 }
84 
85 // Initialize a dispatch_private_info_template<T> buffer for a particular
86 // type of schedule,chunk.  The loop description is found in lb (lower bound),
87 // ub (upper bound), and st (stride).  nproc is the number of threads relevant
88 // to the scheduling (often the number of threads in a team, but not always if
89 // hierarchical scheduling is used).  tid is the id of the thread calling
90 // the function within the group of nproc threads.  It will have a value
91 // between 0 and nproc - 1.  This is often just the thread id within a team, but
92 // is not necessarily the case when using hierarchical scheduling.
// loc is the source file location of the corresponding loop.
// gtid is the global thread id.
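// For illustration only (values are hypothetical): a loop compiled from
// '#pragma omp for schedule(dynamic, 4)' over iterations 0..99 in a team of
// 8 threads would reach this routine (via __kmp_dispatch_init) with lb=0,
// ub=99, st=1, chunk=4, nproc=8, and tid in 0..7.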
95 template <typename T>
96 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
97                                    dispatch_private_info_template<T> *pr,
98                                    enum sched_type schedule, T lb, T ub,
99                                    typename traits_t<T>::signed_t st,
100 #if USE_ITT_BUILD
101                                    kmp_uint64 *cur_chunk,
102 #endif
103                                    typename traits_t<T>::signed_t chunk,
104                                    T nproc, T tid) {
105   typedef typename traits_t<T>::unsigned_t UT;
106   typedef typename traits_t<T>::floating_t DBL;
107 
108   int active;
109   T tc;
110   kmp_info_t *th;
111   kmp_team_t *team;
112   int monotonicity;
113   bool use_hier;
114 
115 #ifdef KMP_DEBUG
116   typedef typename traits_t<T>::signed_t ST;
117   {
118     char *buff;
119     // create format specifiers before the debug output
120     buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
121                             "pr:%%p lb:%%%s ub:%%%s st:%%%s "
122                             "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
123                             traits_t<T>::spec, traits_t<T>::spec,
124                             traits_t<ST>::spec, traits_t<ST>::spec,
125                             traits_t<T>::spec, traits_t<T>::spec);
126     KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
127     __kmp_str_free(&buff);
128   }
129 #endif
130   /* setup data */
131   th = __kmp_threads[gtid];
132   team = th->th.th_team;
133   active = !team->t.t_serialized;
134 
135 #if USE_ITT_BUILD
136   int itt_need_metadata_reporting =
137       __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
138       KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
139       team->t.t_active_level == 1;
140 #endif
141 
142 #if KMP_USE_HIER_SCHED
143   use_hier = pr->flags.use_hier;
144 #else
145   use_hier = false;
146 #endif
147 
148   /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
149   monotonicity = __kmp_get_monotonicity(schedule, use_hier);
150   schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
151 
152   /* Pick up the nomerge/ordered bits from the scheduling type */
153   if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
154     pr->flags.nomerge = TRUE;
155     schedule =
156         (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
157   } else {
158     pr->flags.nomerge = FALSE;
159   }
160   pr->type_size = traits_t<T>::type_size; // remember the size of variables
161   if (kmp_ord_lower & schedule) {
162     pr->flags.ordered = TRUE;
163     schedule =
164         (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
165   } else {
166     pr->flags.ordered = FALSE;
167   }
168   // Ordered overrides nonmonotonic
169   if (pr->flags.ordered) {
170     monotonicity = SCHEDULE_MONOTONIC;
171   }
172 
173   if (schedule == kmp_sch_static) {
174     schedule = __kmp_static;
175   } else {
176     if (schedule == kmp_sch_runtime) {
177       // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
178       // not specified)
179       schedule = team->t.t_sched.r_sched_type;
180       monotonicity = __kmp_get_monotonicity(schedule, use_hier);
181       schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
182       // Detail the schedule if needed (global controls are differentiated
183       // appropriately)
184       if (schedule == kmp_sch_guided_chunked) {
185         schedule = __kmp_guided;
186       } else if (schedule == kmp_sch_static) {
187         schedule = __kmp_static;
188       }
189       // Use the chunk size specified by OMP_SCHEDULE (or default if not
190       // specified)
191       chunk = team->t.t_sched.chunk;
192 #if USE_ITT_BUILD
193       if (cur_chunk)
194         *cur_chunk = chunk;
195 #endif
196 #ifdef KMP_DEBUG
197       {
198         char *buff;
199         // create format specifiers before the debug output
200         buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
201                                 "schedule:%%d chunk:%%%s\n",
202                                 traits_t<ST>::spec);
203         KD_TRACE(10, (buff, gtid, schedule, chunk));
204         __kmp_str_free(&buff);
205       }
206 #endif
207     } else {
208       if (schedule == kmp_sch_guided_chunked) {
209         schedule = __kmp_guided;
210       }
211       if (chunk <= 0) {
212         chunk = KMP_DEFAULT_CHUNK;
213       }
214     }
215 
216     if (schedule == kmp_sch_auto) {
      // the mapping of auto to a concrete schedule (__kmp_auto) is determined
      // in __kmp_do_serial_initialize()
218       schedule = __kmp_auto;
219 #ifdef KMP_DEBUG
220       {
221         char *buff;
222         // create format specifiers before the debug output
223         buff = __kmp_str_format(
224             "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
225             "schedule:%%d chunk:%%%s\n",
226             traits_t<ST>::spec);
227         KD_TRACE(10, (buff, gtid, schedule, chunk));
228         __kmp_str_free(&buff);
229       }
230 #endif
231     }
232 #if KMP_STATIC_STEAL_ENABLED
233     // map nonmonotonic:dynamic to static steal
234     if (schedule == kmp_sch_dynamic_chunked) {
235       if (monotonicity == SCHEDULE_NONMONOTONIC)
236         schedule = kmp_sch_static_steal;
237     }
238 #endif
239     /* guided analytical not safe for too many threads */
240     if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
241       schedule = kmp_sch_guided_iterative_chunked;
242       KMP_WARNING(DispatchManyThreads);
243     }
244     if (schedule == kmp_sch_runtime_simd) {
245       // compiler provides simd_width in the chunk parameter
246       schedule = team->t.t_sched.r_sched_type;
247       monotonicity = __kmp_get_monotonicity(schedule, use_hier);
248       schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
249       // Detail the schedule if needed (global controls are differentiated
250       // appropriately)
251       if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
252           schedule == __kmp_static) {
253         schedule = kmp_sch_static_balanced_chunked;
254       } else {
255         if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
256           schedule = kmp_sch_guided_simd;
257         }
258         chunk = team->t.t_sched.chunk * chunk;
259       }
260 #if USE_ITT_BUILD
261       if (cur_chunk)
262         *cur_chunk = chunk;
263 #endif
264 #ifdef KMP_DEBUG
265       {
266         char *buff;
267         // create format specifiers before the debug output
268         buff = __kmp_str_format(
269             "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
270             " chunk:%%%s\n",
271             traits_t<ST>::spec);
272         KD_TRACE(10, (buff, gtid, schedule, chunk));
273         __kmp_str_free(&buff);
274       }
275 #endif
276     }
277     pr->u.p.parm1 = chunk;
278   }
279   KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
280               "unknown scheduling type");
281 
282   pr->u.p.count = 0;
283 
284   if (__kmp_env_consistency_check) {
285     if (st == 0) {
286       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
287                             (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
288     }
289   }
290   // compute trip count
291   if (st == 1) { // most common case
292     if (ub >= lb) {
293       tc = ub - lb + 1;
294     } else { // ub < lb
295       tc = 0; // zero-trip
296     }
297   } else if (st < 0) {
298     if (lb >= ub) {
299       // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
300       // where the division needs to be unsigned regardless of the result type
301       tc = (UT)(lb - ub) / (-st) + 1;
302     } else { // lb < ub
303       tc = 0; // zero-trip
304     }
305   } else { // st > 0
306     if (ub >= lb) {
307       // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
308       // where the division needs to be unsigned regardless of the result type
309       tc = (UT)(ub - lb) / st + 1;
310     } else { // ub < lb
311       tc = 0; // zero-trip
312     }
313   }
314 
315 #if KMP_STATS_ENABLED
316   if (KMP_MASTER_GTID(gtid)) {
317     KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
318   }
319 #endif
320 
321   pr->u.p.lb = lb;
322   pr->u.p.ub = ub;
323   pr->u.p.st = st;
324   pr->u.p.tc = tc;
325 
326 #if KMP_OS_WINDOWS
327   pr->u.p.last_upper = ub + st;
328 #endif /* KMP_OS_WINDOWS */
329 
330   /* NOTE: only the active parallel region(s) has active ordered sections */
331 
332   if (active) {
333     if (pr->flags.ordered) {
334       pr->ordered_bumped = 0;
335       pr->u.p.ordered_lower = 1;
336       pr->u.p.ordered_upper = 0;
337     }
338   }
339 
340   switch (schedule) {
341 #if (KMP_STATIC_STEAL_ENABLED)
342   case kmp_sch_static_steal: {
343     T ntc, init;
344 
345     KD_TRACE(100,
346              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
347               gtid));
348 
349     ntc = (tc % chunk ? 1 : 0) + tc / chunk;
350     if (nproc > 1 && ntc >= nproc) {
351       KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
352       T id = tid;
353       T small_chunk, extras;
354 
355       small_chunk = ntc / nproc;
356       extras = ntc % nproc;
357 
358       init = id * small_chunk + (id < extras ? id : extras);
359       pr->u.p.count = init;
360       pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
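      // Illustration of the partition above (example values): ntc=10 chunks
      // and nproc=4 give small_chunk=2, extras=2, so threads 0..3 own chunk
      // ranges [0,3), [3,6), [6,8), [8,10); the first 'extras' threads get
      // one extra chunk.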
361 
362       pr->u.p.parm2 = lb;
      // parm3 is the number of times to attempt stealing, which is
      // proportional to the number of chunks per thread, capped at nproc.
366       pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc);
367       pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
368       pr->u.p.st = st;
369       if (traits_t<T>::type_size > 4) {
370         // AC: TODO: check if 16-byte CAS available and use it to
371         // improve performance (probably wait for explicit request
372         // before spending time on this).
373         // For now use dynamically allocated per-thread lock,
374         // free memory in __kmp_dispatch_next when status==0.
375         KMP_DEBUG_ASSERT(pr->u.p.th_steal_lock == NULL);
376         pr->u.p.th_steal_lock =
377             (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
378         __kmp_init_lock(pr->u.p.th_steal_lock);
379       }
380       break;
381     } else {
382       /* too few chunks: switching to kmp_sch_dynamic_chunked */
383       schedule = kmp_sch_dynamic_chunked;
384       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
385                      "kmp_sch_dynamic_chunked\n",
386                       gtid));
387       if (pr->u.p.parm1 <= 0)
388         pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
389       break;
390     } // if
391   } // case
392 #endif
393   case kmp_sch_static_balanced: {
394     T init, limit;
395 
396     KD_TRACE(
397         100,
398         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
399          gtid));
400 
401     if (nproc > 1) {
402       T id = tid;
403 
404       if (tc < nproc) {
405         if (id < tc) {
406           init = id;
407           limit = id;
408           pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
409         } else {
410           pr->u.p.count = 1; /* means no more chunks to execute */
411           pr->u.p.parm1 = FALSE;
412           break;
413         }
414       } else {
415         T small_chunk = tc / nproc;
416         T extras = tc % nproc;
417         init = id * small_chunk + (id < extras ? id : extras);
418         limit = init + small_chunk - (id < extras ? 0 : 1);
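        // Illustration (example values): tc=10, nproc=4 give small_chunk=2,
        // extras=2, so threads 0..3 cover iterations 0..2, 3..5, 6..7 and
        // 8..9 respectively.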
419         pr->u.p.parm1 = (id == nproc - 1);
420       }
421     } else {
422       if (tc > 0) {
423         init = 0;
424         limit = tc - 1;
425         pr->u.p.parm1 = TRUE;
426       } else {
427         // zero trip count
428         pr->u.p.count = 1; /* means no more chunks to execute */
429         pr->u.p.parm1 = FALSE;
430         break;
431       }
432     }
433 #if USE_ITT_BUILD
434     // Calculate chunk for metadata report
435     if (itt_need_metadata_reporting)
436       if (cur_chunk)
437         *cur_chunk = limit - init + 1;
438 #endif
439     if (st == 1) {
440       pr->u.p.lb = lb + init;
441       pr->u.p.ub = lb + limit;
442     } else {
      // ub_tmp is the calculated upper bound; "ub" is the user-defined one
444       T ub_tmp = lb + limit * st;
445       pr->u.p.lb = lb + init * st;
446       // adjust upper bound to "ub" if needed, so that MS lastprivate will match
447       // it exactly
448       if (st > 0) {
449         pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
450       } else {
451         pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
452       }
453     }
454     if (pr->flags.ordered) {
455       pr->u.p.ordered_lower = init;
456       pr->u.p.ordered_upper = limit;
457     }
458     break;
459   } // case
460   case kmp_sch_static_balanced_chunked: {
461     // similar to balanced, but chunk adjusted to multiple of simd width
462     T nth = nproc;
463     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
464                    " -> falling-through to static_greedy\n",
465                    gtid));
466     schedule = kmp_sch_static_greedy;
467     if (nth > 1)
468       pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
469     else
470       pr->u.p.parm1 = tc;
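    // Illustration (example values): tc=100, nth=8, chunk=8 gives
    // (100 + 7) / 8 = 13 iterations per thread, rounded up by the mask to 16.
    // Note the '& ~(chunk - 1)' round-up assumes chunk is a power of two, as
    // a simd width typically is.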
471     break;
472   } // case
473   case kmp_sch_guided_simd:
474   case kmp_sch_guided_iterative_chunked: {
475     KD_TRACE(
476         100,
477         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
478          " case\n",
479          gtid));
480 
481     if (nproc > 1) {
482       if ((2L * chunk + 1) * nproc >= tc) {
483         /* chunk size too large, switch to dynamic */
484         schedule = kmp_sch_dynamic_chunked;
485       } else {
        // when remaining iters become less than parm2, switch to dynamic
487         pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
488         *(double *)&pr->u.p.parm3 =
489             guided_flt_param / nproc; // may occupy parm3 and parm4
490       }
491     } else {
492       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
493                      "kmp_sch_static_greedy\n",
494                      gtid));
495       schedule = kmp_sch_static_greedy;
496       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
497       KD_TRACE(
498           100,
499           ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
500            gtid));
501       pr->u.p.parm1 = tc;
502     } // if
503   } // case
504   break;
505   case kmp_sch_guided_analytical_chunked: {
506     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
507                    "kmp_sch_guided_analytical_chunked case\n",
508                    gtid));
509 
510     if (nproc > 1) {
511       if ((2L * chunk + 1) * nproc >= tc) {
512         /* chunk size too large, switch to dynamic */
513         schedule = kmp_sch_dynamic_chunked;
514       } else {
515         /* commonly used term: (2 nproc - 1)/(2 nproc) */
516         DBL x;
517 
518 #if KMP_USE_X87CONTROL
519         /* Linux* OS already has 64-bit computation by default for long double,
520            and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
521            Windows* OS on IA-32 architecture, we need to set precision to 64-bit
522            instead of the default 53-bit. Even though long double doesn't work
523            on Windows* OS on Intel(R) 64, the resulting lack of precision is not
524            expected to impact the correctness of the algorithm, but this has not
525            been mathematically proven. */
526         // save original FPCW and set precision to 64-bit, as
527         // Windows* OS on IA-32 architecture defaults to 53-bit
528         unsigned int oldFpcw = _control87(0, 0);
529         _control87(_PC_64, _MCW_PC); // 0,0x30000
530 #endif
531         /* value used for comparison in solver for cross-over point */
532         long double target = ((long double)chunk * 2 + 1) * nproc / tc;
533 
534         /* crossover point--chunk indexes equal to or greater than
535            this point switch to dynamic-style scheduling */
536         UT cross;
537 
538         /* commonly used term: (2 nproc - 1)/(2 nproc) */
539         x = (long double)1.0 - (long double)0.5 / nproc;
540 
541 #ifdef KMP_DEBUG
542         { // test natural alignment
543           struct _test_a {
544             char a;
545             union {
546               char b;
547               DBL d;
548             };
549           } t;
550           ptrdiff_t natural_alignment =
551               (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
552           //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
553           // long)natural_alignment );
554           KMP_DEBUG_ASSERT(
555               (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
556         }
557 #endif // KMP_DEBUG
558 
559         /* save the term in thread private dispatch structure */
560         *(DBL *)&pr->u.p.parm3 = x;
561 
562         /* solve for the crossover point to the nearest integer i for which C_i
563            <= chunk */
564         {
565           UT left, right, mid;
566           long double p;
567 
568           /* estimate initial upper and lower bound */
569 
570           /* doesn't matter what value right is as long as it is positive, but
571              it affects performance of the solver */
572           right = 229;
573           p = __kmp_pow<UT>(x, right);
574           if (p > target) {
575             do {
576               p *= p;
577               right <<= 1;
578             } while (p > target && right < (1 << 27));
579             /* lower bound is previous (failed) estimate of upper bound */
580             left = right >> 1;
581           } else {
582             left = 0;
583           }
584 
585           /* bisection root-finding method */
586           while (left + 1 < right) {
587             mid = (left + right) / 2;
588             if (__kmp_pow<UT>(x, mid) > target) {
589               left = mid;
590             } else {
591               right = mid;
592             }
593           } // while
594           cross = right;
595         }
596         /* assert sanity of computed crossover point */
597         KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
598                    __kmp_pow<UT>(x, cross) <= target);
599 
600         /* save the crossover point in thread private dispatch structure */
601         pr->u.p.parm2 = cross;
602 
603 // C75803
604 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
605 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
606 #else
607 #define GUIDED_ANALYTICAL_WORKAROUND (x)
608 #endif
609         /* dynamic-style scheduling offset */
610         pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
611                                  tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
612                         cross * chunk;
613 #if KMP_USE_X87CONTROL
614         // restore FPCW
615         _control87(oldFpcw, _MCW_PC);
616 #endif
617       } // if
618     } else {
619       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
620                      "kmp_sch_static_greedy\n",
621                      gtid));
622       schedule = kmp_sch_static_greedy;
623       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
624       pr->u.p.parm1 = tc;
625     } // if
626   } // case
627   break;
628   case kmp_sch_static_greedy:
629     KD_TRACE(
630         100,
631         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
632          gtid));
633     pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
634     break;
635   case kmp_sch_static_chunked:
636   case kmp_sch_dynamic_chunked:
637     if (pr->u.p.parm1 <= 0) {
638       pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
639     }
640     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
641                    "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
642                    gtid));
643     break;
644   case kmp_sch_trapezoidal: {
645     /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
646 
647     T parm1, parm2, parm3, parm4;
648     KD_TRACE(100,
649              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
650               gtid));
651 
652     parm1 = chunk;
653 
654     /* F : size of the first cycle */
655     parm2 = (tc / (2 * nproc));
656 
657     if (parm2 < 1) {
658       parm2 = 1;
659     }
660 
661     /* L : size of the last cycle.  Make sure the last cycle is not larger
662        than the first cycle. */
663     if (parm1 < 1) {
664       parm1 = 1;
665     } else if (parm1 > parm2) {
666       parm1 = parm2;
667     }
668 
669     /* N : number of cycles */
670     parm3 = (parm2 + parm1);
671     parm3 = (2 * tc + parm3 - 1) / parm3;
672 
673     if (parm3 < 2) {
674       parm3 = 2;
675     }
676 
677     /* sigma : decreasing incr of the trapezoid */
678     parm4 = (parm3 - 1);
679     parm4 = (parm2 - parm1) / parm4;
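    /* Worked example (illustrative values): tc=1000, nproc=4, chunk=10 gives
       parm2 = 1000/8 = 125 (first chunk), parm1 = 10 (minimum last chunk),
       parm3 = (2000 + 134) / 135 = 15 cycles, parm4 = (125 - 10) / 14 = 8,
       so successive chunk sizes are 125, 117, 109, ... decreasing by 8. */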
680 
681     // pointless check, because parm4 >= 0 always
682     // if ( parm4 < 0 ) {
683     //    parm4 = 0;
684     //}
685 
686     pr->u.p.parm1 = parm1;
687     pr->u.p.parm2 = parm2;
688     pr->u.p.parm3 = parm3;
689     pr->u.p.parm4 = parm4;
690   } // case
691   break;
692 
693   default: {
694     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
695                 KMP_HNT(GetNewerLibrary), // Hint
696                 __kmp_msg_null // Variadic argument list terminator
697                 );
698   } break;
699   } // switch
700   pr->schedule = schedule;
701 }
702 
703 #if KMP_USE_HIER_SCHED
704 template <typename T>
705 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
706                                              typename traits_t<T>::signed_t st);
707 template <>
708 inline void
709 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
710                                             kmp_int32 ub, kmp_int32 st) {
711   __kmp_dispatch_init_hierarchy<kmp_int32>(
712       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
713       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
714 }
715 template <>
716 inline void
717 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
718                                              kmp_uint32 ub, kmp_int32 st) {
719   __kmp_dispatch_init_hierarchy<kmp_uint32>(
720       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
721       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
722 }
723 template <>
724 inline void
725 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
726                                             kmp_int64 ub, kmp_int64 st) {
727   __kmp_dispatch_init_hierarchy<kmp_int64>(
728       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
729       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
730 }
731 template <>
732 inline void
733 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
734                                              kmp_uint64 ub, kmp_int64 st) {
735   __kmp_dispatch_init_hierarchy<kmp_uint64>(
736       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
737       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
738 }
739 
740 // free all the hierarchy scheduling memory associated with the team
741 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
742   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
743   for (int i = 0; i < num_disp_buff; ++i) {
744     // type does not matter here so use kmp_int32
745     auto sh =
746         reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
747             &team->t.t_disp_buffer[i]);
748     if (sh->hier) {
749       sh->hier->deallocate();
750       __kmp_free(sh->hier);
751     }
752   }
753 }
754 #endif
755 
756 // UT - unsigned flavor of T, ST - signed flavor of T,
757 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
758 template <typename T>
759 static void
760 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
761                     T ub, typename traits_t<T>::signed_t st,
762                     typename traits_t<T>::signed_t chunk, int push_ws) {
763   typedef typename traits_t<T>::unsigned_t UT;
764 
765   int active;
766   kmp_info_t *th;
767   kmp_team_t *team;
768   kmp_uint32 my_buffer_index;
769   dispatch_private_info_template<T> *pr;
770   dispatch_shared_info_template<T> volatile *sh;
771 
772   KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
773                    sizeof(dispatch_private_info));
774   KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
775                    sizeof(dispatch_shared_info));
776   __kmp_assert_valid_gtid(gtid);
777 
778   if (!TCR_4(__kmp_init_parallel))
779     __kmp_parallel_initialize();
780 
781   __kmp_resume_if_soft_paused();
782 
783 #if INCLUDE_SSC_MARKS
784   SSC_MARK_DISPATCH_INIT();
785 #endif
786 #ifdef KMP_DEBUG
787   typedef typename traits_t<T>::signed_t ST;
788   {
789     char *buff;
790     // create format specifiers before the debug output
791     buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
792                             "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
793                             traits_t<ST>::spec, traits_t<T>::spec,
794                             traits_t<T>::spec, traits_t<ST>::spec);
795     KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
796     __kmp_str_free(&buff);
797   }
798 #endif
799   /* setup data */
800   th = __kmp_threads[gtid];
801   team = th->th.th_team;
802   active = !team->t.t_serialized;
803   th->th.th_ident = loc;
804 
  // Any half-decent optimizer will remove this test when the blocks are empty
  // since the macros expand to nothing when statistics are disabled.
808   if (schedule == __kmp_static) {
809     KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
810   } else {
811     KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
812   }
813 
814 #if KMP_USE_HIER_SCHED
  // Initialize the scheduling hierarchy if requested in the OMP_SCHEDULE
  // environment variable. Hierarchical scheduling does not work with ordered,
  // so if ordered is detected, revert to standard threaded scheduling.
818   bool ordered;
819   enum sched_type my_sched = schedule;
820   my_buffer_index = th->th.th_dispatch->th_disp_index;
821   pr = reinterpret_cast<dispatch_private_info_template<T> *>(
822       &th->th.th_dispatch
823            ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
824   my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
825   if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
826     my_sched =
827         (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
828   ordered = (kmp_ord_lower & my_sched);
829   if (pr->flags.use_hier) {
830     if (ordered) {
831       KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected.  "
832                      "Disabling hierarchical scheduling.\n",
833                      gtid));
834       pr->flags.use_hier = FALSE;
835     }
836   }
837   if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
838     // Don't use hierarchical for ordered parallel loops and don't
839     // use the runtime hierarchy if one was specified in the program
840     if (!ordered && !pr->flags.use_hier)
841       __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
842   }
843 #endif // KMP_USE_HIER_SCHED
844 
845 #if USE_ITT_BUILD
846   kmp_uint64 cur_chunk = chunk;
847   int itt_need_metadata_reporting =
848       __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
849       KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
850       team->t.t_active_level == 1;
851 #endif
852   if (!active) {
853     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
854         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
855   } else {
856     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
857                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
858 
859     my_buffer_index = th->th.th_dispatch->th_disp_index++;
860 
    /* What happens when the number of threads changes? The buffer may need to
       be resized. */
862     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
863         &th->th.th_dispatch
864              ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
865     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
866         &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
867     KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
868                   my_buffer_index));
869   }
870 
871   __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
872 #if USE_ITT_BUILD
873                                 &cur_chunk,
874 #endif
875                                 chunk, (T)th->th.th_team_nproc,
876                                 (T)th->th.th_info.ds.ds_tid);
877   if (active) {
878     if (pr->flags.ordered == 0) {
879       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
880       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
881     } else {
882       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
883       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
884     }
885   }
886 
887   if (active) {
    /* This buffer is free to use once the shared buffer_index has caught up
     * with my_buffer_index; wait for that below. */
890 
891     KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
892                    "sh->buffer_index:%d\n",
893                    gtid, my_buffer_index, sh->buffer_index));
894     __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
895                            __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    // Note: KMP_WAIT() cannot be used here: buffer index and
    // my_buffer_index are *always* 32-bit integers.
898     KMP_MB(); /* is this necessary? */
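    // Dispatch buffers are reused round-robin: th_disp_index increments once
    // per loop and is taken modulo __kmp_dispatch_num_buffers above, so the
    // wait only blocks if a previous loop that used this slot has not yet
    // released it by advancing sh->buffer_index.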
899     KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
900                    "sh->buffer_index:%d\n",
901                    gtid, my_buffer_index, sh->buffer_index));
902 
903     th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
904     th->th.th_dispatch->th_dispatch_sh_current =
905         CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
906 #if USE_ITT_BUILD
907     if (pr->flags.ordered) {
908       __kmp_itt_ordered_init(gtid);
909     }
910     // Report loop metadata
911     if (itt_need_metadata_reporting) {
912       // Only report metadata by master of active team at level 1
913       kmp_uint64 schedtype = 0;
914       switch (schedule) {
915       case kmp_sch_static_chunked:
916       case kmp_sch_static_balanced: // Chunk is calculated in the switch above
917         break;
918       case kmp_sch_static_greedy:
919         cur_chunk = pr->u.p.parm1;
920         break;
921       case kmp_sch_dynamic_chunked:
922         schedtype = 1;
923         break;
924       case kmp_sch_guided_iterative_chunked:
925       case kmp_sch_guided_analytical_chunked:
926       case kmp_sch_guided_simd:
927         schedtype = 2;
928         break;
929       default:
930         // Should we put this case under "static"?
931         // case kmp_sch_static_steal:
932         schedtype = 3;
933         break;
934       }
935       __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
936     }
937 #if KMP_USE_HIER_SCHED
938     if (pr->flags.use_hier) {
939       pr->u.p.count = 0;
940       pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
941     }
#endif // KMP_USE_HIER_SCHED
943 #endif /* USE_ITT_BUILD */
944   }
945 
946 #ifdef KMP_DEBUG
947   {
948     char *buff;
949     // create format specifiers before the debug output
950     buff = __kmp_str_format(
951         "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
952         "lb:%%%s ub:%%%s"
953         " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
954         " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
955         traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
956         traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
957         traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
958         traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
959     KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
960                   pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
961                   pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
962                   pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
963     __kmp_str_free(&buff);
964   }
965 #endif
966 #if (KMP_STATIC_STEAL_ENABLED)
  // After a loop with some other schedule kind has executed, there is no
  // guarantee that all the parm3 variables contain the same value. Even if
  // they did, a bad case remains (e.g. toggling between 0 and 1 rather than a
  // program-lifetime increment), so a dedicated variable is required; the
  // 'static_steal_counter' is used for that.
972   if (pr->schedule == kmp_sch_static_steal) {
973     // Other threads will inspect this variable when searching for a victim.
974     // This is a flag showing that other threads may steal from this thread
975     // since then.
976     volatile T *p = &pr->u.p.static_steal_counter;
977     *p = *p + 1;
978   }
979 #endif // ( KMP_STATIC_STEAL_ENABLED )
980 
981 #if OMPT_SUPPORT && OMPT_OPTIONAL
982   if (ompt_enabled.ompt_callback_work) {
983     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
984     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
985     ompt_callbacks.ompt_callback(ompt_callback_work)(
986         ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
987         &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
988   }
989 #endif
990   KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
991 }
992 
993 /* For ordered loops, either __kmp_dispatch_finish() should be called after
994  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
995  * every chunk of iterations.  If the ordered section(s) were not executed
996  * for this iteration (or every iteration in this chunk), we need to set the
997  * ordered iteration counters so that the next thread can proceed. */
998 template <typename UT>
999 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
1000   typedef typename traits_t<UT>::signed_t ST;
1001   __kmp_assert_valid_gtid(gtid);
1002   kmp_info_t *th = __kmp_threads[gtid];
1003 
1004   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1005   if (!th->th.th_team->t.t_serialized) {
1006 
1007     dispatch_private_info_template<UT> *pr =
1008         reinterpret_cast<dispatch_private_info_template<UT> *>(
1009             th->th.th_dispatch->th_dispatch_pr_current);
1010     dispatch_shared_info_template<UT> volatile *sh =
1011         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1012             th->th.th_dispatch->th_dispatch_sh_current);
1013     KMP_DEBUG_ASSERT(pr);
1014     KMP_DEBUG_ASSERT(sh);
1015     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1016                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1017 
1018     if (pr->ordered_bumped) {
1019       KD_TRACE(
1020           1000,
1021           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1022            gtid));
1023       pr->ordered_bumped = 0;
1024     } else {
1025       UT lower = pr->u.p.ordered_lower;
1026 
1027 #ifdef KMP_DEBUG
1028       {
1029         char *buff;
1030         // create format specifiers before the debug output
1031         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1032                                 "ordered_iteration:%%%s lower:%%%s\n",
1033                                 traits_t<UT>::spec, traits_t<UT>::spec);
1034         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1035         __kmp_str_free(&buff);
1036       }
1037 #endif
1038 
1039       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1040                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1041       KMP_MB(); /* is this necessary? */
1042 #ifdef KMP_DEBUG
1043       {
1044         char *buff;
1045         // create format specifiers before the debug output
1046         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1047                                 "ordered_iteration:%%%s lower:%%%s\n",
1048                                 traits_t<UT>::spec, traits_t<UT>::spec);
1049         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1050         __kmp_str_free(&buff);
1051       }
1052 #endif
1053 
1054       test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1055     } // if
1056   } // if
1057   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1058 }
1059 
1060 #ifdef KMP_GOMP_COMPAT
1061 
1062 template <typename UT>
1063 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1064   typedef typename traits_t<UT>::signed_t ST;
1065   __kmp_assert_valid_gtid(gtid);
1066   kmp_info_t *th = __kmp_threads[gtid];
1067 
1068   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1069   if (!th->th.th_team->t.t_serialized) {
1070     //        int cid;
1071     dispatch_private_info_template<UT> *pr =
1072         reinterpret_cast<dispatch_private_info_template<UT> *>(
1073             th->th.th_dispatch->th_dispatch_pr_current);
1074     dispatch_shared_info_template<UT> volatile *sh =
1075         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1076             th->th.th_dispatch->th_dispatch_sh_current);
1077     KMP_DEBUG_ASSERT(pr);
1078     KMP_DEBUG_ASSERT(sh);
1079     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1080                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1081 
1082     //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1083     UT lower = pr->u.p.ordered_lower;
1084     UT upper = pr->u.p.ordered_upper;
1085     UT inc = upper - lower + 1;
1086 
1087     if (pr->ordered_bumped == inc) {
1088       KD_TRACE(
1089           1000,
1090           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1091            gtid));
1092       pr->ordered_bumped = 0;
1093     } else {
1094       inc -= pr->ordered_bumped;
1095 
1096 #ifdef KMP_DEBUG
1097       {
1098         char *buff;
1099         // create format specifiers before the debug output
1100         buff = __kmp_str_format(
1101             "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1102             "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1103             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1104         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1105         __kmp_str_free(&buff);
1106       }
1107 #endif
1108 
1109       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1110                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1111 
1112       KMP_MB(); /* is this necessary? */
1113       KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1114                       "ordered_bumped to zero\n",
1115                       gtid));
1116       pr->ordered_bumped = 0;
1117 //!!!!! TODO check if the inc should be unsigned, or signed???
1118 #ifdef KMP_DEBUG
1119       {
1120         char *buff;
1121         // create format specifiers before the debug output
1122         buff = __kmp_str_format(
1123             "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1124             "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1125             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1126             traits_t<UT>::spec);
1127         KD_TRACE(1000,
1128                  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1129         __kmp_str_free(&buff);
1130       }
1131 #endif
1132 
1133       test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1134     }
1135     //        }
1136   }
1137   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1138 }
1139 
1140 #endif /* KMP_GOMP_COMPAT */
1141 
1142 template <typename T>
1143 int __kmp_dispatch_next_algorithm(int gtid,
1144                                   dispatch_private_info_template<T> *pr,
1145                                   dispatch_shared_info_template<T> volatile *sh,
1146                                   kmp_int32 *p_last, T *p_lb, T *p_ub,
1147                                   typename traits_t<T>::signed_t *p_st, T nproc,
1148                                   T tid) {
1149   typedef typename traits_t<T>::unsigned_t UT;
1150   typedef typename traits_t<T>::signed_t ST;
1151   typedef typename traits_t<T>::floating_t DBL;
1152   int status = 0;
1153   kmp_int32 last = 0;
1154   T start;
1155   ST incr;
1156   UT limit, trip, init;
1157   kmp_info_t *th = __kmp_threads[gtid];
1158   kmp_team_t *team = th->th.th_team;
1159 
1160   KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1161                    &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1162   KMP_DEBUG_ASSERT(pr);
1163   KMP_DEBUG_ASSERT(sh);
1164   KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1165 #ifdef KMP_DEBUG
1166   {
1167     char *buff;
1168     // create format specifiers before the debug output
1169     buff =
1170         __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1171                          "sh:%%p nproc:%%%s tid:%%%s\n",
1172                          traits_t<T>::spec, traits_t<T>::spec);
1173     KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1174     __kmp_str_free(&buff);
1175   }
1176 #endif
1177 
1178   // zero trip count
1179   if (pr->u.p.tc == 0) {
1180     KD_TRACE(10,
1181              ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1182               "zero status:%d\n",
1183               gtid, status));
1184     return 0;
1185   }
1186 
1187   switch (pr->schedule) {
1188 #if (KMP_STATIC_STEAL_ENABLED)
1189   case kmp_sch_static_steal: {
1190     T chunk = pr->u.p.parm1;
1191 
1192     KD_TRACE(100,
1193              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1194               gtid));
1195 
1196     trip = pr->u.p.tc - 1;
1197 
1198     if (traits_t<T>::type_size > 4) {
1199       // use lock for 8-byte and CAS for 4-byte induction
1200       // variable. TODO (optional): check and use 16-byte CAS
1201       kmp_lock_t *lck = pr->u.p.th_steal_lock;
1202       KMP_DEBUG_ASSERT(lck != NULL);
1203       if (pr->u.p.count < (UT)pr->u.p.ub) {
1204         __kmp_acquire_lock(lck, gtid);
1205         // try to get own chunk of iterations
1206         init = (pr->u.p.count)++;
1207         status = (init < (UT)pr->u.p.ub);
1208         __kmp_release_lock(lck, gtid);
1209       } else {
1210         status = 0; // no own chunks
1211       }
1212       if (!status) { // try to steal
1213         kmp_info_t **other_threads = team->t.t_threads;
1214         int while_limit = pr->u.p.parm3;
1215         int while_index = 0;
1216         T id = pr->u.p.static_steal_counter; // loop id
1217         int idx = (th->th.th_dispatch->th_disp_index - 1) %
1218                   __kmp_dispatch_num_buffers; // current loop index
1219         // note: victim thread can potentially execute another loop
1220         // TODO: algorithm of searching for a victim
1221         // should be cleaned up and measured
1222         while ((!status) && (while_limit != ++while_index)) {
1223           dispatch_private_info_template<T> *victim;
1224           T remaining;
1225           T victimIdx = pr->u.p.parm4;
1226           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1227           victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1228               &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1229           KMP_DEBUG_ASSERT(victim);
1230           while ((victim == pr || id != victim->u.p.static_steal_counter) &&
1231                  oldVictimIdx != victimIdx) {
1232             victimIdx = (victimIdx + 1) % nproc;
1233             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1234                 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1235             KMP_DEBUG_ASSERT(victim);
1236           }
          if (victim == pr || id != victim->u.p.static_steal_counter) {
            // no victim is ready yet to participate in stealing because no
            // victim has passed __kmp_dispatch_init yet;
            // try once more (nproc attempts in total)
            continue;
          }
1242           if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1243             pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1244             continue; // not enough chunks to steal, goto next victim
1245           }
1246 
1247           lck = victim->u.p.th_steal_lock;
1248           KMP_ASSERT(lck != NULL);
1249           __kmp_acquire_lock(lck, gtid);
1250           limit = victim->u.p.ub; // keep initial ub
1251           if (victim->u.p.count >= limit ||
1252               (remaining = limit - victim->u.p.count) < 2) {
1253             __kmp_release_lock(lck, gtid);
1254             pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1255             continue; // not enough chunks to steal
1256           }
1257           // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or
1258           // by 1
1259           if (remaining > 3) {
1260             // steal 1/4 of remaining
1261             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1262             init = (victim->u.p.ub -= (remaining >> 2));
1263           } else {
1264             // steal 1 chunk of 2 or 3 remaining
1265             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1266             init = (victim->u.p.ub -= 1);
1267           }
1268           __kmp_release_lock(lck, gtid);
1269 
1270           KMP_DEBUG_ASSERT(init + 1 <= limit);
1271           pr->u.p.parm4 = victimIdx; // remember victim to steal from
1272           status = 1;
1273           while_index = 0;
          // now update own count and ub with the stolen range, excluding
          // chunk 'init' which this thread will execute immediately
1275           __kmp_acquire_lock(pr->u.p.th_steal_lock, gtid);
1276           pr->u.p.count = init + 1;
1277           pr->u.p.ub = limit;
1278           __kmp_release_lock(pr->u.p.th_steal_lock, gtid);
1279         } // while (search for victim)
1280       } // if (try to find victim and steal)
1281     } else {
1282       // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1283       typedef union {
1284         struct {
1285           UT count;
1286           T ub;
1287         } p;
1288         kmp_int64 b;
1289       } union_i4;
1290       // All operations on 'count' or 'ub' must be combined atomically
1291       // together.
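      // (Otherwise the owner incrementing 'count' could race with a thief
      // lowering 'ub' and the same chunk could be claimed twice.)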
1292       {
1293         union_i4 vold, vnew;
1294         vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1295         vnew = vold;
1296         vnew.p.count++;
1297         while (!KMP_COMPARE_AND_STORE_ACQ64(
1298             (volatile kmp_int64 *)&pr->u.p.count,
1299             *VOLATILE_CAST(kmp_int64 *) & vold.b,
1300             *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1301           KMP_CPU_PAUSE();
1302           vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1303           vnew = vold;
1304           vnew.p.count++;
1305         }
1306         vnew = vold;
1307         init = vnew.p.count;
1308         status = (init < (UT)vnew.p.ub);
1309       }
1310 
1311       if (!status) {
1312         kmp_info_t **other_threads = team->t.t_threads;
1313         int while_limit = pr->u.p.parm3;
1314         int while_index = 0;
1315         T id = pr->u.p.static_steal_counter; // loop id
1316         int idx = (th->th.th_dispatch->th_disp_index - 1) %
1317                   __kmp_dispatch_num_buffers; // current loop index
1318         // note: victim thread can potentially execute another loop
1319         // TODO: algorithm of searching for a victim
1320         // should be cleaned up and measured
1321         while ((!status) && (while_limit != ++while_index)) {
1322           dispatch_private_info_template<T> *victim;
1323           union_i4 vold, vnew;
1324           kmp_int32 remaining;
1325           T victimIdx = pr->u.p.parm4;
1326           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1327           victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1328               &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1329           KMP_DEBUG_ASSERT(victim);
1330           while ((victim == pr || id != victim->u.p.static_steal_counter) &&
1331                  oldVictimIdx != victimIdx) {
1332             victimIdx = (victimIdx + 1) % nproc;
1333             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1334                 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1335             KMP_DEBUG_ASSERT(victim);
1336           }
          if (victim == pr || id != victim->u.p.static_steal_counter) {
            // no victim is ready yet to participate in stealing because no
            // victim has passed __kmp_dispatch_init yet;
            // try once more (nproc attempts in total)
            continue;
          }
1342           pr->u.p.parm4 = victimIdx; // new victim found
1343           while (1) { // CAS loop if victim has enough chunks to steal
1344             vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1345             vnew = vold;
1346 
1347             KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1348             if (vnew.p.count >= (UT)vnew.p.ub ||
1349                 (remaining = vnew.p.ub - vnew.p.count) < 2) {
1350               pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1351               break; // not enough chunks to steal, goto next victim
1352             }
1353             if (remaining > 3) {
1354               vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
1355             } else {
1356               vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1357             }
1358             KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1359             // TODO: Should this be acquire or release?
1360             if (KMP_COMPARE_AND_STORE_ACQ64(
1361                     (volatile kmp_int64 *)&victim->u.p.count,
1362                     *VOLATILE_CAST(kmp_int64 *) & vold.b,
1363                     *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1364               // stealing succeeded
1365               KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1366                                         vold.p.ub - vnew.p.ub);
1367               status = 1;
1368               while_index = 0;
1369               // now update own count and ub
1370               init = vnew.p.ub;
1371               vold.p.count = init + 1;
1372 #if KMP_ARCH_X86
1373               KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1374 #else
1375               *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1376 #endif
1377               break;
1378             } // if (check CAS result)
            KMP_CPU_PAUSE(); // CAS failed, retry
1380           } // while (try to steal from particular victim)
1381         } // while (search for victim)
1382       } // if (try to find victim and steal)
1383     } // if (4-byte induction variable)
1384     if (!status) {
1385       *p_lb = 0;
1386       *p_ub = 0;
1387       if (p_st != NULL)
1388         *p_st = 0;
1389     } else {
1390       start = pr->u.p.parm2;
1391       init *= chunk;
1392       limit = chunk + init - 1;
1393       incr = pr->u.p.st;
1394       KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1395 
1396       KMP_DEBUG_ASSERT(init <= trip);
1397       if ((last = (limit >= trip)) != 0)
1398         limit = trip;
1399       if (p_st != NULL)
1400         *p_st = incr;
1401 
1402       if (incr == 1) {
1403         *p_lb = start + init;
1404         *p_ub = start + limit;
1405       } else {
1406         *p_lb = start + init * incr;
1407         *p_ub = start + limit * incr;
1408       }
1409 
1410       if (pr->flags.ordered) {
1411         pr->u.p.ordered_lower = init;
1412         pr->u.p.ordered_upper = limit;
1413       } // if
1414     } // if
1415     break;
1416   } // case
1417 #endif // ( KMP_STATIC_STEAL_ENABLED )
1418   case kmp_sch_static_balanced: {
1419     KD_TRACE(
1420         10,
1421         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1422          gtid));
1423     /* check if thread has any iteration to do */
1424     if ((status = !pr->u.p.count) != 0) {
1425       pr->u.p.count = 1;
1426       *p_lb = pr->u.p.lb;
1427       *p_ub = pr->u.p.ub;
1428       last = pr->u.p.parm1;
1429       if (p_st != NULL)
1430         *p_st = pr->u.p.st;
1431     } else { /* no iterations to do */
1432       pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1433     }
1434   } // case
1435   break;
1436   case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1437                                  merged here */
1438   case kmp_sch_static_chunked: {
1439     T parm1;
1440 
1441     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1442                    "kmp_sch_static_[affinity|chunked] case\n",
1443                    gtid));
1444     parm1 = pr->u.p.parm1;
1445 
1446     trip = pr->u.p.tc - 1;
1447     init = parm1 * (pr->u.p.count + tid);
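    // Chunks of size parm1 are dealt round-robin: thread tid executes chunks
    // tid, tid + nproc, tid + 2 * nproc, ...; count records how many rounds
    // this thread has already taken (it is incremented by nproc below).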
1448 
1449     if ((status = (init <= trip)) != 0) {
1450       start = pr->u.p.lb;
1451       incr = pr->u.p.st;
1452       limit = parm1 + init - 1;
1453 
1454       if ((last = (limit >= trip)) != 0)
1455         limit = trip;
1456 
1457       if (p_st != NULL)
1458         *p_st = incr;
1459 
1460       pr->u.p.count += nproc;
1461 
1462       if (incr == 1) {
1463         *p_lb = start + init;
1464         *p_ub = start + limit;
1465       } else {
1466         *p_lb = start + init * incr;
1467         *p_ub = start + limit * incr;
1468       }
1469 
1470       if (pr->flags.ordered) {
1471         pr->u.p.ordered_lower = init;
1472         pr->u.p.ordered_upper = limit;
1473       } // if
1474     } // if
1475   } // case
1476   break;
1477 
1478   case kmp_sch_dynamic_chunked: {
1479     T chunk = pr->u.p.parm1;
1480 
1481     KD_TRACE(
1482         100,
1483         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1484          gtid));
1485 
1486     init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
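    // test_then_inc_acq atomically claims the next chunk index from the shared
    // iteration counter; init is the first iteration of that chunk.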
1487     trip = pr->u.p.tc - 1;
1488 
1489     if ((status = (init <= trip)) == 0) {
1490       *p_lb = 0;
1491       *p_ub = 0;
1492       if (p_st != NULL)
1493         *p_st = 0;
1494     } else {
1495       start = pr->u.p.lb;
1496       limit = chunk + init - 1;
1497       incr = pr->u.p.st;
1498 
1499       if ((last = (limit >= trip)) != 0)
1500         limit = trip;
1501 
1502       if (p_st != NULL)
1503         *p_st = incr;
1504 
1505       if (incr == 1) {
1506         *p_lb = start + init;
1507         *p_ub = start + limit;
1508       } else {
1509         *p_lb = start + init * incr;
1510         *p_ub = start + limit * incr;
1511       }
1512 
1513       if (pr->flags.ordered) {
1514         pr->u.p.ordered_lower = init;
1515         pr->u.p.ordered_upper = limit;
1516       } // if
1517     } // if
1518   } // case
1519   break;
1520 
1521   case kmp_sch_guided_iterative_chunked: {
1522     T chunkspec = pr->u.p.parm1;
1523     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1524                    "iterative case\n",
1525                    gtid));
1526     trip = pr->u.p.tc;
1527     // Start atomic part of calculations
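    // Guided scheme in outline: while many iterations remain, each grab tries
    // to claim roughly remaining / (K * nproc) iterations via CAS on the
    // shared counter (parm3 carries the 1/(K*nproc) factor as a double, cf.
    // the "divide by K*nproc" use below); once fewer than parm2 iterations
    // remain, it falls back to plain dynamic chunks of size chunkspec.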
1528     while (1) {
      ST remaining; // signed, because it can be < 0
1530       init = sh->u.s.iteration; // shared value
1531       remaining = trip - init;
1532       if (remaining <= 0) { // AC: need to compare with 0 first
1533         // nothing to do, don't try atomic op
1534         status = 0;
1535         break;
1536       }
1537       if ((T)remaining <
1538           pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1539         // use dynamic-style schedule
1540         // atomically increment iterations, get old value
1541         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1542                                  (ST)chunkspec);
1543         remaining = trip - init;
1544         if (remaining <= 0) {
          status = 0; // all iterations already taken by other threads
1546         } else {
1547           // got some iterations to work on
1548           status = 1;
1549           if ((T)remaining > chunkspec) {
1550             limit = init + chunkspec - 1;
1551           } else {
1552             last = 1; // the last chunk
1553             limit = init + remaining - 1;
1554           } // if
1555         } // if
1556         break;
1557       } // if
1558       limit = init +
1559               (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
1560       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1561                                (ST)init, (ST)limit)) {
1562         // CAS was successful, chunk obtained
1563         status = 1;
1564         --limit;
1565         break;
1566       } // if
1567     } // while
1568     if (status != 0) {
1569       start = pr->u.p.lb;
1570       incr = pr->u.p.st;
1571       if (p_st != NULL)
1572         *p_st = incr;
1573       *p_lb = start + init * incr;
1574       *p_ub = start + limit * incr;
1575       if (pr->flags.ordered) {
1576         pr->u.p.ordered_lower = init;
1577         pr->u.p.ordered_upper = limit;
1578       } // if
1579     } else {
1580       *p_lb = 0;
1581       *p_ub = 0;
1582       if (p_st != NULL)
1583         *p_st = 0;
1584     } // if
1585   } // case
1586   break;
1587 
1588   case kmp_sch_guided_simd: {
    // same as the iterative case, but the current chunk is adjusted to be a
    // multiple of the given chunk
1591     T chunk = pr->u.p.parm1;
1592     KD_TRACE(100,
1593              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1594               gtid));
1595     trip = pr->u.p.tc;
1596     // Start atomic part of calculations
1597     while (1) {
      ST remaining; // signed, because it can be < 0
1599       init = sh->u.s.iteration; // shared value
1600       remaining = trip - init;
1601       if (remaining <= 0) { // AC: need to compare with 0 first
1602         status = 0; // nothing to do, don't try atomic op
1603         break;
1604       }
1605       KMP_DEBUG_ASSERT(init % chunk == 0);
1606       // compare with K*nproc*(chunk+1), K=2 by default
1607       if ((T)remaining < pr->u.p.parm2) {
1608         // use dynamic-style schedule
1609         // atomically increment iterations, get old value
1610         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1611                                  (ST)chunk);
1612         remaining = trip - init;
1613         if (remaining <= 0) {
          status = 0; // all iterations already taken by other threads
1615         } else {
1616           // got some iterations to work on
1617           status = 1;
1618           if ((T)remaining > chunk) {
1619             limit = init + chunk - 1;
1620           } else {
1621             last = 1; // the last chunk
1622             limit = init + remaining - 1;
1623           } // if
1624         } // if
1625         break;
1626       } // if
1627       // divide by K*nproc
1628       UT span = remaining * (*(double *)&pr->u.p.parm3);
1629       UT rem = span % chunk;
1630       if (rem) // adjust so that span%chunk == 0
1631         span += chunk - rem;
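      // Rounding span up to a multiple of chunk keeps every grab aligned to
      // the SIMD chunk size, which the assertion at the top of this loop
      // (init % chunk == 0) relies on.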
1632       limit = init + span;
1633       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1634                                (ST)init, (ST)limit)) {
1635         // CAS was successful, chunk obtained
1636         status = 1;
1637         --limit;
1638         break;
1639       } // if
1640     } // while
1641     if (status != 0) {
1642       start = pr->u.p.lb;
1643       incr = pr->u.p.st;
1644       if (p_st != NULL)
1645         *p_st = incr;
1646       *p_lb = start + init * incr;
1647       *p_ub = start + limit * incr;
1648       if (pr->flags.ordered) {
1649         pr->u.p.ordered_lower = init;
1650         pr->u.p.ordered_upper = limit;
1651       } // if
1652     } else {
1653       *p_lb = 0;
1654       *p_ub = 0;
1655       if (p_st != NULL)
1656         *p_st = 0;
1657     } // if
1658   } // case
1659   break;
1660 
1661   case kmp_sch_guided_analytical_chunked: {
1662     T chunkspec = pr->u.p.parm1;
1663     UT chunkIdx;
1664 #if KMP_USE_X87CONTROL
1665     /* for storing original FPCW value for Windows* OS on
1666        IA-32 architecture 8-byte version */
1667     unsigned int oldFpcw;
1668     unsigned int fpcwSet = 0;
1669 #endif
1670     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1671                    "kmp_sch_guided_analytical_chunked case\n",
1672                    gtid));
1673 
1674     trip = pr->u.p.tc;
1675 
1676     KMP_DEBUG_ASSERT(nproc > 1);
1677     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1678 
1679     while (1) { /* this while loop is a safeguard against unexpected zero
1680                    chunk sizes */
1681       chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
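      // parm2 is the cross-over chunk index computed by the init code: chunks
      // below it follow the exponentially shrinking (analytically computed)
      // sizes, chunks at or above it fall back to fixed chunks of chunkspec.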
1682       if (chunkIdx >= (UT)pr->u.p.parm2) {
1683         --trip;
1684         /* use dynamic-style scheduling */
1685         init = chunkIdx * chunkspec + pr->u.p.count;
1686         /* need to verify init > 0 in case of overflow in the above
1687          * calculation */
1688         if ((status = (init > 0 && init <= trip)) != 0) {
1689           limit = init + chunkspec - 1;
1690 
1691           if ((last = (limit >= trip)) != 0)
1692             limit = trip;
1693         }
1694         break;
1695       } else {
1696 /* use exponential-style scheduling */
/* The following check works around the lack of long double precision on
   Windows* OS, which may cause init != 0 for chunkIdx == 0. */
1701 #if KMP_USE_X87CONTROL
1702         /* If we haven't already done so, save original
1703            FPCW and set precision to 64-bit, as Windows* OS
1704            on IA-32 architecture defaults to 53-bit */
1705         if (!fpcwSet) {
1706           oldFpcw = _control87(0, 0);
1707           _control87(_PC_64, _MCW_PC);
1708           fpcwSet = 0x30000;
1709         }
1710 #endif
1711         if (chunkIdx) {
1712           init = __kmp_dispatch_guided_remaining<T>(
1713               trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1714           KMP_DEBUG_ASSERT(init);
1715           init = trip - init;
1716         } else
1717           init = 0;
1718         limit = trip - __kmp_dispatch_guided_remaining<T>(
1719                            trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1720         KMP_ASSERT(init <= limit);
1721         if (init < limit) {
1722           KMP_DEBUG_ASSERT(limit <= trip);
1723           --limit;
1724           status = 1;
1725           break;
1726         } // if
1727       } // if
1728     } // while (1)
1729 #if KMP_USE_X87CONTROL
1730     /* restore FPCW if necessary
1731        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1732     */
1733     if (fpcwSet && (oldFpcw & fpcwSet))
1734       _control87(oldFpcw, _MCW_PC);
1735 #endif
1736     if (status != 0) {
1737       start = pr->u.p.lb;
1738       incr = pr->u.p.st;
1739       if (p_st != NULL)
1740         *p_st = incr;
1741       *p_lb = start + init * incr;
1742       *p_ub = start + limit * incr;
1743       if (pr->flags.ordered) {
1744         pr->u.p.ordered_lower = init;
1745         pr->u.p.ordered_upper = limit;
1746       }
1747     } else {
1748       *p_lb = 0;
1749       *p_ub = 0;
1750       if (p_st != NULL)
1751         *p_st = 0;
1752     }
1753   } // case
1754   break;
1755 
1756   case kmp_sch_trapezoidal: {
1757     UT index;
1758     T parm2 = pr->u.p.parm2;
1759     T parm3 = pr->u.p.parm3;
1760     T parm4 = pr->u.p.parm4;
1761     KD_TRACE(100,
1762              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1763               gtid));
1764 
1765     index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1766 
1767     init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
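    // init is the closed-form partial sum of an arithmetic series of chunk
    // sizes: chunk i has size parm2 - i * parm4 (parm2 = first chunk size,
    // parm4 = per-chunk decrement), so the first `index` chunks cover
    // index * (2 * parm2 - (index - 1) * parm4) / 2 iterations; parm3 bounds
    // the total number of chunks.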
1768     trip = pr->u.p.tc - 1;
1769 
1770     if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1771       *p_lb = 0;
1772       *p_ub = 0;
1773       if (p_st != NULL)
1774         *p_st = 0;
1775     } else {
1776       start = pr->u.p.lb;
1777       limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1778       incr = pr->u.p.st;
1779 
1780       if ((last = (limit >= trip)) != 0)
1781         limit = trip;
1782 
1783       if (p_st != NULL)
1784         *p_st = incr;
1785 
1786       if (incr == 1) {
1787         *p_lb = start + init;
1788         *p_ub = start + limit;
1789       } else {
1790         *p_lb = start + init * incr;
1791         *p_ub = start + limit * incr;
1792       }
1793 
1794       if (pr->flags.ordered) {
1795         pr->u.p.ordered_lower = init;
1796         pr->u.p.ordered_upper = limit;
1797       } // if
1798     } // if
1799   } // case
1800   break;
1801   default: {
1802     status = 0; // to avoid complaints on uninitialized variable use
1803     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1804                 KMP_HNT(GetNewerLibrary), // Hint
1805                 __kmp_msg_null // Variadic argument list terminator
1806                 );
1807   } break;
1808   } // switch
1809   if (p_last)
1810     *p_last = last;
1811 #ifdef KMP_DEBUG
1812   if (pr->flags.ordered) {
1813     char *buff;
1814     // create format specifiers before the debug output
1815     buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1816                             "ordered_lower:%%%s ordered_upper:%%%s\n",
1817                             traits_t<UT>::spec, traits_t<UT>::spec);
1818     KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1819     __kmp_str_free(&buff);
1820   }
1821   {
1822     char *buff;
1823     // create format specifiers before the debug output
1824     buff = __kmp_str_format(
1825         "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1826         "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1827         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1828     KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1829     __kmp_str_free(&buff);
1830   }
1831 #endif
1832   return status;
1833 }
1834 
1835 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1836    work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1837    is not called. */
1838 #if OMPT_SUPPORT && OMPT_OPTIONAL
1839 #define OMPT_LOOP_END                                                          \
1840   if (status == 0) {                                                           \
1841     if (ompt_enabled.ompt_callback_work) {                                     \
1842       ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
1843       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
1844       ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
1845           ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
1846           &(task_info->task_data), 0, codeptr);                                \
1847     }                                                                          \
1848   }
1849 // TODO: implement count
1850 #else
1851 #define OMPT_LOOP_END // no-op
1852 #endif
1853 
1854 #if KMP_STATS_ENABLED
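// Record the size of the chunk just handed out: with stride i, the chunk
// [l, u] contains (u - l) / i + 1 iterations (0 for an empty range or when
// status == 0, i.e. no chunk was returned).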
1855 #define KMP_STATS_LOOP_END                                                     \
1856   {                                                                            \
1857     kmp_int64 u, l, t, i;                                                      \
1858     l = (kmp_int64)(*p_lb);                                                    \
1859     u = (kmp_int64)(*p_ub);                                                    \
1860     i = (kmp_int64)(pr->u.p.st);                                               \
1861     if (status == 0) {                                                         \
1862       t = 0;                                                                   \
1863       KMP_POP_PARTITIONED_TIMER();                                             \
1864     } else if (i == 1) {                                                       \
1865       if (u >= l)                                                              \
1866         t = u - l + 1;                                                         \
1867       else                                                                     \
1868         t = 0;                                                                 \
1869     } else if (i < 0) {                                                        \
1870       if (l >= u)                                                              \
1871         t = (l - u) / (-i) + 1;                                                \
1872       else                                                                     \
1873         t = 0;                                                                 \
1874     } else {                                                                   \
1875       if (u >= l)                                                              \
1876         t = (u - l) / i + 1;                                                   \
1877       else                                                                     \
1878         t = 0;                                                                 \
1879     }                                                                          \
1880     KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
1881   }
1882 #else
1883 #define KMP_STATS_LOOP_END /* Nothing */
1884 #endif
1885 
1886 template <typename T>
1887 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1888                                T *p_lb, T *p_ub,
1889                                typename traits_t<T>::signed_t *p_st
1890 #if OMPT_SUPPORT && OMPT_OPTIONAL
1891                                ,
1892                                void *codeptr
1893 #endif
1894                                ) {
1895 
1896   typedef typename traits_t<T>::unsigned_t UT;
1897   typedef typename traits_t<T>::signed_t ST;
  // This is potentially slightly misleading: schedule(runtime) will appear
  // here even if the actual runtime schedule is static. (Which points out a
  // disadvantage of schedule(runtime): even when static scheduling is used, it
  // costs more than a compile-time choice to use static scheduling would.)
1902   KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
1903 
1904   int status;
1905   dispatch_private_info_template<T> *pr;
1906   __kmp_assert_valid_gtid(gtid);
1907   kmp_info_t *th = __kmp_threads[gtid];
1908   kmp_team_t *team = th->th.th_team;
1909 
1910   KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1911   KD_TRACE(
1912       1000,
1913       ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
1914        gtid, p_lb, p_ub, p_st, p_last));
1915 
1916   if (team->t.t_serialized) {
1917     /* NOTE: serialize this dispatch because we are not at the active level */
1918     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1919         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1920     KMP_DEBUG_ASSERT(pr);
1921 
1922     if ((status = (pr->u.p.tc != 0)) == 0) {
1923       *p_lb = 0;
1924       *p_ub = 0;
1925       //            if ( p_last != NULL )
1926       //                *p_last = 0;
1927       if (p_st != NULL)
1928         *p_st = 0;
1929       if (__kmp_env_consistency_check) {
1930         if (pr->pushed_ws != ct_none) {
1931           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1932         }
1933       }
1934     } else if (pr->flags.nomerge) {
1935       kmp_int32 last;
1936       T start;
1937       UT limit, trip, init;
1938       ST incr;
1939       T chunk = pr->u.p.parm1;
1940 
1941       KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1942                      gtid));
1943 
1944       init = chunk * pr->u.p.count++;
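      // The team is serialized, so this thread is the only consumer of this
      // buffer and the plain (non-atomic) count++ above is sufficient.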
1945       trip = pr->u.p.tc - 1;
1946 
1947       if ((status = (init <= trip)) == 0) {
1948         *p_lb = 0;
1949         *p_ub = 0;
1950         //                if ( p_last != NULL )
1951         //                    *p_last = 0;
1952         if (p_st != NULL)
1953           *p_st = 0;
1954         if (__kmp_env_consistency_check) {
1955           if (pr->pushed_ws != ct_none) {
1956             pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1957           }
1958         }
1959       } else {
1960         start = pr->u.p.lb;
1961         limit = chunk + init - 1;
1962         incr = pr->u.p.st;
1963 
1964         if ((last = (limit >= trip)) != 0) {
1965           limit = trip;
1966 #if KMP_OS_WINDOWS
1967           pr->u.p.last_upper = pr->u.p.ub;
1968 #endif /* KMP_OS_WINDOWS */
1969         }
1970         if (p_last != NULL)
1971           *p_last = last;
1972         if (p_st != NULL)
1973           *p_st = incr;
1974         if (incr == 1) {
1975           *p_lb = start + init;
1976           *p_ub = start + limit;
1977         } else {
1978           *p_lb = start + init * incr;
1979           *p_ub = start + limit * incr;
1980         }
1981 
1982         if (pr->flags.ordered) {
1983           pr->u.p.ordered_lower = init;
1984           pr->u.p.ordered_upper = limit;
1985 #ifdef KMP_DEBUG
1986           {
1987             char *buff;
1988             // create format specifiers before the debug output
1989             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1990                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
1991                                     traits_t<UT>::spec, traits_t<UT>::spec);
1992             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1993                             pr->u.p.ordered_upper));
1994             __kmp_str_free(&buff);
1995           }
1996 #endif
1997         } // if
1998       } // if
1999     } else {
2000       pr->u.p.tc = 0;
2001       *p_lb = pr->u.p.lb;
2002       *p_ub = pr->u.p.ub;
2003 #if KMP_OS_WINDOWS
2004       pr->u.p.last_upper = *p_ub;
2005 #endif /* KMP_OS_WINDOWS */
2006       if (p_last != NULL)
2007         *p_last = TRUE;
2008       if (p_st != NULL)
2009         *p_st = pr->u.p.st;
2010     } // if
2011 #ifdef KMP_DEBUG
2012     {
2013       char *buff;
2014       // create format specifiers before the debug output
2015       buff = __kmp_str_format(
2016           "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2017           "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
2018           traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2019       KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
2020       __kmp_str_free(&buff);
2021     }
2022 #endif
2023 #if INCLUDE_SSC_MARKS
2024     SSC_MARK_DISPATCH_NEXT();
2025 #endif
2026     OMPT_LOOP_END;
2027     KMP_STATS_LOOP_END;
2028     return status;
2029   } else {
2030     kmp_int32 last = 0;
2031     dispatch_shared_info_template<T> volatile *sh;
2032 
2033     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2034                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2035 
2036     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2037         th->th.th_dispatch->th_dispatch_pr_current);
2038     KMP_DEBUG_ASSERT(pr);
2039     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2040         th->th.th_dispatch->th_dispatch_sh_current);
2041     KMP_DEBUG_ASSERT(sh);
2042 
2043 #if KMP_USE_HIER_SCHED
2044     if (pr->flags.use_hier)
2045       status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2046     else
2047 #endif // KMP_USE_HIER_SCHED
2048       status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2049                                                 p_st, th->th.th_team_nproc,
2050                                                 th->th.th_info.ds.ds_tid);
2051     // status == 0: no more iterations to execute
2052     if (status == 0) {
2053       UT num_done;
2054 
2055       num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2056 #ifdef KMP_DEBUG
2057       {
2058         char *buff;
2059         // create format specifiers before the debug output
2060         buff = __kmp_str_format(
2061             "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2062             traits_t<UT>::spec);
2063         KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2064         __kmp_str_free(&buff);
2065       }
2066 #endif
2067 
2068 #if KMP_USE_HIER_SCHED
2069       pr->flags.use_hier = FALSE;
2070 #endif
2071       if ((ST)num_done == th->th.th_team_nproc - 1) {
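        // num_done held nproc - 1 before this thread's increment, so this is
        // the last thread of the team to finish the loop; it is responsible
        // for resetting the shared buffer before releasing it for reuse.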
2072 #if (KMP_STATIC_STEAL_ENABLED)
2073         if (pr->schedule == kmp_sch_static_steal &&
2074             traits_t<T>::type_size > 4) {
2075           int i;
2076           int idx = (th->th.th_dispatch->th_disp_index - 1) %
2077                     __kmp_dispatch_num_buffers; // current loop index
2078           kmp_info_t **other_threads = team->t.t_threads;
2079           // loop complete, safe to destroy locks used for stealing
2080           for (i = 0; i < th->th.th_team_nproc; ++i) {
2081             dispatch_private_info_template<T> *buf =
2082                 reinterpret_cast<dispatch_private_info_template<T> *>(
2083                     &other_threads[i]->th.th_dispatch->th_disp_buffer[idx]);
2084             kmp_lock_t *lck = buf->u.p.th_steal_lock;
2085             KMP_ASSERT(lck != NULL);
2086             __kmp_destroy_lock(lck);
2087             __kmp_free(lck);
2088             buf->u.p.th_steal_lock = NULL;
2089           }
2090         }
2091 #endif
2092         /* NOTE: release this buffer to be reused */
2093 
2094         KMP_MB(); /* Flush all pending memory write invalidates.  */
2095 
2096         sh->u.s.num_done = 0;
2097         sh->u.s.iteration = 0;
2098 
2099         /* TODO replace with general release procedure? */
2100         if (pr->flags.ordered) {
2101           sh->u.s.ordered_iteration = 0;
2102         }
2103 
2104         KMP_MB(); /* Flush all pending memory write invalidates.  */
2105 
2106         sh->buffer_index += __kmp_dispatch_num_buffers;
2107         KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2108                        gtid, sh->buffer_index));
2109 
2110         KMP_MB(); /* Flush all pending memory write invalidates.  */
2111 
2112       } // if
2113       if (__kmp_env_consistency_check) {
2114         if (pr->pushed_ws != ct_none) {
2115           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2116         }
2117       }
2118 
2119       th->th.th_dispatch->th_deo_fcn = NULL;
2120       th->th.th_dispatch->th_dxo_fcn = NULL;
2121       th->th.th_dispatch->th_dispatch_sh_current = NULL;
2122       th->th.th_dispatch->th_dispatch_pr_current = NULL;
2123     } // if (status == 0)
2124 #if KMP_OS_WINDOWS
2125     else if (last) {
2126       pr->u.p.last_upper = pr->u.p.ub;
2127     }
2128 #endif /* KMP_OS_WINDOWS */
2129     if (p_last != NULL && status != 0)
2130       *p_last = last;
2131   } // if
2132 
2133 #ifdef KMP_DEBUG
2134   {
2135     char *buff;
2136     // create format specifiers before the debug output
2137     buff = __kmp_str_format(
2138         "__kmp_dispatch_next: T#%%d normal case: "
2139         "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2140         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2141     KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2142                   (p_last ? *p_last : 0), status));
2143     __kmp_str_free(&buff);
2144   }
2145 #endif
2146 #if INCLUDE_SSC_MARKS
2147   SSC_MARK_DISPATCH_NEXT();
2148 #endif
2149   OMPT_LOOP_END;
2150   KMP_STATS_LOOP_END;
2151   return status;
2152 }
2153 
2154 template <typename T>
2155 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2156                                   kmp_int32 *plastiter, T *plower, T *pupper,
2157                                   typename traits_t<T>::signed_t incr) {
2158   typedef typename traits_t<T>::unsigned_t UT;
2159   kmp_uint32 team_id;
2160   kmp_uint32 nteams;
2161   UT trip_count;
2162   kmp_team_t *team;
2163   kmp_info_t *th;
2164 
2165   KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2166   KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2167 #ifdef KMP_DEBUG
2168   typedef typename traits_t<T>::signed_t ST;
2169   {
2170     char *buff;
2171     // create format specifiers before the debug output
2172     buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2173                             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2174                             traits_t<T>::spec, traits_t<T>::spec,
2175                             traits_t<ST>::spec, traits_t<T>::spec);
2176     KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2177     __kmp_str_free(&buff);
2178   }
2179 #endif
2180 
2181   if (__kmp_env_consistency_check) {
2182     if (incr == 0) {
2183       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2184                             loc);
2185     }
2186     if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2187       // The loop is illegal.
      // Some zero-trip loops are handled by the compiler, e.g.:
2189       //   for(i=10;i<0;++i) // lower >= upper - run-time check
2190       //   for(i=0;i>10;--i) // lower <= upper - run-time check
2191       //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2192       //   for(i=10;i<0;--i) // incr < 0       - compile-time check
      // The compiler does not check the following illegal loops:
2194       //   for(i=0;i<10;i+=incr) // where incr<0
2195       //   for(i=10;i>0;i-=incr) // where incr<0
2196       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2197     }
2198   }
2199   __kmp_assert_valid_gtid(gtid);
2200   th = __kmp_threads[gtid];
2201   team = th->th.th_team;
2202   KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2203   nteams = th->th.th_teams_size.nteams;
2204   team_id = team->t.t_master_tid;
2205   KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2206 
2207   // compute global trip count
2208   if (incr == 1) {
2209     trip_count = *pupper - *plower + 1;
2210   } else if (incr == -1) {
2211     trip_count = *plower - *pupper + 1;
2212   } else if (incr > 0) {
    // upper - lower can exceed the limit of the signed type
2214     trip_count = (UT)(*pupper - *plower) / incr + 1;
2215   } else {
2216     trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2217   }
2218 
2219   if (trip_count <= nteams) {
2220     KMP_DEBUG_ASSERT(
2221         __kmp_static == kmp_sch_static_greedy ||
2222         __kmp_static ==
2223             kmp_sch_static_balanced); // Unknown static scheduling type.
    // only some teams get a single iteration, the others get nothing
2225     if (team_id < trip_count) {
2226       *pupper = *plower = *plower + team_id * incr;
2227     } else {
2228       *plower = *pupper + incr; // zero-trip loop
2229     }
2230     if (plastiter != NULL)
2231       *plastiter = (team_id == trip_count - 1);
2232   } else {
2233     if (__kmp_static == kmp_sch_static_balanced) {
2234       UT chunk = trip_count / nteams;
2235       UT extras = trip_count % nteams;
2236       *plower +=
2237           incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2238       *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
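      // Teams with team_id < extras receive chunk + 1 iterations; the
      // remaining teams receive chunk iterations.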
2239       if (plastiter != NULL)
2240         *plastiter = (team_id == nteams - 1);
2241     } else {
2242       T chunk_inc_count =
2243           (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2244       T upper = *pupper;
2245       KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2246       // Unknown static scheduling type.
2247       *plower += team_id * chunk_inc_count;
2248       *pupper = *plower + chunk_inc_count - incr;
2249       // Check/correct bounds if needed
2250       if (incr > 0) {
2251         if (*pupper < *plower)
2252           *pupper = traits_t<T>::max_value;
2253         if (plastiter != NULL)
2254           *plastiter = *plower <= upper && *pupper > upper - incr;
2255         if (*pupper > upper)
2256           *pupper = upper; // tracker C73258
2257       } else {
2258         if (*pupper > *plower)
2259           *pupper = traits_t<T>::min_value;
2260         if (plastiter != NULL)
2261           *plastiter = *plower >= upper && *pupper < upper - incr;
2262         if (*pupper < upper)
2263           *pupper = upper; // tracker C73258
2264       }
2265     }
2266   }
2267 }
2268 
2269 //-----------------------------------------------------------------------------
2270 // Dispatch routines
2271 //    Transfer call to template< type T >
2272 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2273 //                         T lb, T ub, ST st, ST chunk )
2274 extern "C" {
2275 
2276 /*!
2277 @ingroup WORK_SHARING
2278 @{
2279 @param loc Source location
2280 @param gtid Global thread id
2281 @param schedule Schedule type
2282 @param lb  Lower bound
2283 @param ub  Upper bound
2284 @param st  Step (or increment if you prefer)
2285 @param chunk The chunk size to block with
2286 
2287 This function prepares the runtime to start a dynamically scheduled for loop,
2288 saving the loop arguments.
2289 These functions are all identical apart from the types of the arguments.
2290 */
2291 
2292 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2293                             enum sched_type schedule, kmp_int32 lb,
2294                             kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2295   KMP_DEBUG_ASSERT(__kmp_init_serial);
2296 #if OMPT_SUPPORT && OMPT_OPTIONAL
2297   OMPT_STORE_RETURN_ADDRESS(gtid);
2298 #endif
2299   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2300 }
2301 /*!
2302 See @ref __kmpc_dispatch_init_4
2303 */
2304 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2305                              enum sched_type schedule, kmp_uint32 lb,
2306                              kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2307   KMP_DEBUG_ASSERT(__kmp_init_serial);
2308 #if OMPT_SUPPORT && OMPT_OPTIONAL
2309   OMPT_STORE_RETURN_ADDRESS(gtid);
2310 #endif
2311   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2312 }
2313 
2314 /*!
2315 See @ref __kmpc_dispatch_init_4
2316 */
2317 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2318                             enum sched_type schedule, kmp_int64 lb,
2319                             kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2320   KMP_DEBUG_ASSERT(__kmp_init_serial);
2321 #if OMPT_SUPPORT && OMPT_OPTIONAL
2322   OMPT_STORE_RETURN_ADDRESS(gtid);
2323 #endif
2324   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2325 }
2326 
2327 /*!
2328 See @ref __kmpc_dispatch_init_4
2329 */
2330 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2331                              enum sched_type schedule, kmp_uint64 lb,
2332                              kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2333   KMP_DEBUG_ASSERT(__kmp_init_serial);
2334 #if OMPT_SUPPORT && OMPT_OPTIONAL
2335   OMPT_STORE_RETURN_ADDRESS(gtid);
2336 #endif
2337   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2338 }
2339 
2340 /*!
2341 See @ref __kmpc_dispatch_init_4
2342 
These functions differ from the __kmpc_dispatch_init set in that they are
called for the composite distribute parallel for construct, so the per-team
iteration space must be computed before the regular iterations are
dispatched.

These functions are all identical apart from the types of the arguments.
2348 */
2349 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2350                                  enum sched_type schedule, kmp_int32 *p_last,
2351                                  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2352                                  kmp_int32 chunk) {
2353   KMP_DEBUG_ASSERT(__kmp_init_serial);
2354 #if OMPT_SUPPORT && OMPT_OPTIONAL
2355   OMPT_STORE_RETURN_ADDRESS(gtid);
2356 #endif
2357   __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2358   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2359 }
2360 
2361 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2362                                   enum sched_type schedule, kmp_int32 *p_last,
2363                                   kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2364                                   kmp_int32 chunk) {
2365   KMP_DEBUG_ASSERT(__kmp_init_serial);
2366 #if OMPT_SUPPORT && OMPT_OPTIONAL
2367   OMPT_STORE_RETURN_ADDRESS(gtid);
2368 #endif
2369   __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2370   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2371 }
2372 
2373 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2374                                  enum sched_type schedule, kmp_int32 *p_last,
2375                                  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2376                                  kmp_int64 chunk) {
2377   KMP_DEBUG_ASSERT(__kmp_init_serial);
2378 #if OMPT_SUPPORT && OMPT_OPTIONAL
2379   OMPT_STORE_RETURN_ADDRESS(gtid);
2380 #endif
2381   __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2382   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2383 }
2384 
2385 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2386                                   enum sched_type schedule, kmp_int32 *p_last,
2387                                   kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2388                                   kmp_int64 chunk) {
2389   KMP_DEBUG_ASSERT(__kmp_init_serial);
2390 #if OMPT_SUPPORT && OMPT_OPTIONAL
2391   OMPT_STORE_RETURN_ADDRESS(gtid);
2392 #endif
2393   __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2394   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2395 }
2396 
2397 /*!
2398 @param loc Source code location
2399 @param gtid Global thread id
2400 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2401 otherwise
2402 @param p_lb   Pointer to the lower bound for the next chunk of work
2403 @param p_ub   Pointer to the upper bound for the next chunk of work
2404 @param p_st   Pointer to the stride for the next chunk of work
2405 @return one if there is work to be done, zero otherwise
2406 
2407 Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then lb, ub and stride need not be modified.
2409 */
2410 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2411                            kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2412 #if OMPT_SUPPORT && OMPT_OPTIONAL
2413   OMPT_STORE_RETURN_ADDRESS(gtid);
2414 #endif
2415   return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2416 #if OMPT_SUPPORT && OMPT_OPTIONAL
2417                                         ,
2418                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2419 #endif
2420                                             );
2421 }
2422 
2423 /*!
2424 See @ref __kmpc_dispatch_next_4
2425 */
2426 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2427                             kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2428                             kmp_int32 *p_st) {
2429 #if OMPT_SUPPORT && OMPT_OPTIONAL
2430   OMPT_STORE_RETURN_ADDRESS(gtid);
2431 #endif
2432   return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2433 #if OMPT_SUPPORT && OMPT_OPTIONAL
2434                                          ,
2435                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2436 #endif
2437                                              );
2438 }
2439 
2440 /*!
2441 See @ref __kmpc_dispatch_next_4
2442 */
2443 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2444                            kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2445 #if OMPT_SUPPORT && OMPT_OPTIONAL
2446   OMPT_STORE_RETURN_ADDRESS(gtid);
2447 #endif
2448   return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2449 #if OMPT_SUPPORT && OMPT_OPTIONAL
2450                                         ,
2451                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2452 #endif
2453                                             );
2454 }
2455 
2456 /*!
2457 See @ref __kmpc_dispatch_next_4
2458 */
2459 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2460                             kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2461                             kmp_int64 *p_st) {
2462 #if OMPT_SUPPORT && OMPT_OPTIONAL
2463   OMPT_STORE_RETURN_ADDRESS(gtid);
2464 #endif
2465   return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2466 #if OMPT_SUPPORT && OMPT_OPTIONAL
2467                                          ,
2468                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2469 #endif
2470                                              );
2471 }
2472 
2473 /*!
2474 @param loc Source code location
2475 @param gtid Global thread id
2476 
2477 Mark the end of a dynamic loop.
2478 */
2479 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2480   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2481 }
2482 
2483 /*!
2484 See @ref __kmpc_dispatch_fini_4
2485 */
2486 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2487   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2488 }
2489 
2490 /*!
2491 See @ref __kmpc_dispatch_fini_4
2492 */
2493 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2494   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2495 }
2496 
2497 /*!
2498 See @ref __kmpc_dispatch_fini_4
2499 */
2500 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2501   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2502 }
2503 /*! @} */
2504 
2505 //-----------------------------------------------------------------------------
2506 // Non-template routines from kmp_dispatch.cpp used in other sources
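// Simple comparison predicates, suitable (among other uses) as the pred
// argument of __kmp_wait_4() below.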
2507 
2508 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2509   return value == checker;
2510 }
2511 
2512 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2513   return value != checker;
2514 }
2515 
2516 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2517   return value < checker;
2518 }
2519 
2520 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2521   return value >= checker;
2522 }
2523 
2524 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2525   return value <= checker;
2526 }
2527 
2528 kmp_uint32
2529 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2530              kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2531              void *obj // Higher-level synchronization object, or NULL.
2532              ) {
2533   // note: we may not belong to a team at this point
2534   volatile kmp_uint32 *spin = spinner;
2535   kmp_uint32 check = checker;
2536   kmp_uint32 spins;
2537   kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2538   kmp_uint32 r;
2539 
2540   KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2541   KMP_INIT_YIELD(spins);
2542   // main wait spin loop
2543   while (!f(r = TCR_4(*spin), check)) {
2544     KMP_FSYNC_SPIN_PREPARE(obj);
2545     /* GEH - remove this since it was accidentally introduced when kmp_wait was
2546        split. It causes problems with infinite recursion because of exit lock */
2547     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2548         __kmp_abort_thread(); */
2549     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2550   }
2551   KMP_FSYNC_SPIN_ACQUIRED(obj);
2552   return r;
2553 }
2554 
2555 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
2556                       kmp_uint32 (*pred)(void *, kmp_uint32),
2557                       void *obj // Higher-level synchronization object, or NULL.
2558                       ) {
2559   // note: we may not belong to a team at this point
2560   void *spin = spinner;
2561   kmp_uint32 check = checker;
2562   kmp_uint32 spins;
2563   kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2564 
2565   KMP_FSYNC_SPIN_INIT(obj, spin);
2566   KMP_INIT_YIELD(spins);
2567   // main wait spin loop
2568   while (!f(spin, check)) {
2569     KMP_FSYNC_SPIN_PREPARE(obj);
    /* if we have waited a bit, or are oversubscribed, yield */
2571     /* pause is in the following code */
2572     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2573   }
2574   KMP_FSYNC_SPIN_ACQUIRED(obj);
2575 }
2576 
2577 } // extern "C"
2578 
2579 #ifdef KMP_GOMP_COMPAT
2580 
2581 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2582                                enum sched_type schedule, kmp_int32 lb,
2583                                kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2584                                int push_ws) {
2585   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2586                                  push_ws);
2587 }
2588 
2589 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2590                                 enum sched_type schedule, kmp_uint32 lb,
2591                                 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2592                                 int push_ws) {
2593   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2594                                   push_ws);
2595 }
2596 
2597 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2598                                enum sched_type schedule, kmp_int64 lb,
2599                                kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2600                                int push_ws) {
2601   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2602                                  push_ws);
2603 }
2604 
2605 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2606                                 enum sched_type schedule, kmp_uint64 lb,
2607                                 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2608                                 int push_ws) {
2609   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2610                                   push_ws);
2611 }
2612 
2613 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2614   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2615 }
2616 
2617 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2618   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2619 }
2620 
2621 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2622   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2623 }
2624 
2625 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2626   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2627 }
2628 
2629 #endif /* KMP_GOMP_COMPAT */
2630 
2631 /* ------------------------------------------------------------------------ */
2632