1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 //                     The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 /* Dynamic scheduling initialization and dispatch.
15  *
 * NOTE: __kmp_nth is constant inside any dispatch loop; however, it may
 *       change between parallel regions.  __kmp_max_nth is the largest
 *       value __kmp_nth may take, and 1 is the smallest.
19  */
20 
21 // Need to raise Win version from XP to Vista here for support of
22 // InterlockedExchange64
23 #if defined(_WIN32_WINNT) && defined(_M_IX86)
24 #undef _WIN32_WINNT
25 #define _WIN32_WINNT 0x0502
26 #endif
27 
28 #include "kmp.h"
29 #include "kmp_error.h"
30 #include "kmp_i18n.h"
31 #include "kmp_itt.h"
32 #include "kmp_stats.h"
33 #include "kmp_str.h"
34 #if KMP_OS_WINDOWS && KMP_ARCH_X86
35 #include <float.h>
36 #endif
37 #include "kmp_lock.h"
38 #include "kmp_dispatch.h"
39 #if KMP_USE_HIER_SCHED
40 #include "kmp_dispatch_hier.h"
41 #endif
42 
43 #if OMPT_SUPPORT
44 #include "ompt-specific.h"
45 #endif
46 
47 /* ------------------------------------------------------------------------ */
48 /* ------------------------------------------------------------------------ */
49 
50 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
51   kmp_info_t *th;
52 
53   KMP_DEBUG_ASSERT(gtid_ref);
54 
55   if (__kmp_env_consistency_check) {
56     th = __kmp_threads[*gtid_ref];
57     if (th->th.th_root->r.r_active &&
58         (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
59 #if KMP_USE_DYNAMIC_LOCK
60       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
61 #else
62       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
63 #endif
64     }
65   }
66 }
67 
68 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
69   kmp_info_t *th;
70 
71   if (__kmp_env_consistency_check) {
72     th = __kmp_threads[*gtid_ref];
73     if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
74       __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
75     }
76   }
77 }
78 
// Initialize a dispatch_private_info_template<T> buffer for a particular
// type of schedule and chunk.  The loop description is found in lb (lower
// bound), ub (upper bound), and st (stride).  nproc is the number of threads
// relevant to the scheduling (often the number of threads in a team, but not
// always if hierarchical scheduling is used).  tid is the id of the thread
// calling the function within the group of nproc threads.  It will have a
// value between 0 and nproc - 1.  This is often just the thread id within a
// team, but is not necessarily the case when using hierarchical scheduling.
// loc is the source file location of the corresponding loop.
// gtid is the global thread id.
89 template <typename T>
90 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
91                                    dispatch_private_info_template<T> *pr,
92                                    enum sched_type schedule, T lb, T ub,
93                                    typename traits_t<T>::signed_t st,
94 #if USE_ITT_BUILD
95                                    kmp_uint64 *cur_chunk,
96 #endif
97                                    typename traits_t<T>::signed_t chunk,
98                                    T nproc, T tid) {
99   typedef typename traits_t<T>::unsigned_t UT;
100   typedef typename traits_t<T>::signed_t ST;
101   typedef typename traits_t<T>::floating_t DBL;
102 
103   int active;
104   T tc;
105   kmp_info_t *th;
106   kmp_team_t *team;
107 
108 #ifdef KMP_DEBUG
109   {
110     char *buff;
111     // create format specifiers before the debug output
112     buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
113                             "pr:%%p lb:%%%s ub:%%%s st:%%%s "
114                             "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
115                             traits_t<T>::spec, traits_t<T>::spec,
116                             traits_t<ST>::spec, traits_t<ST>::spec,
117                             traits_t<T>::spec, traits_t<T>::spec);
118     KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
119     __kmp_str_free(&buff);
120   }
121 #endif
122   /* setup data */
123   th = __kmp_threads[gtid];
124   team = th->th.th_team;
125   active = !team->t.t_serialized;
126 
127 #if USE_ITT_BUILD
128   int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
129                                     __kmp_forkjoin_frames_mode == 3 &&
130                                     KMP_MASTER_GTID(gtid) &&
131 #if OMP_40_ENABLED
132                                     th->th.th_teams_microtask == NULL &&
133 #endif
134                                     team->t.t_active_level == 1;
135 #endif
136 #if (KMP_STATIC_STEAL_ENABLED)
137   if (SCHEDULE_HAS_NONMONOTONIC(schedule))
138     // AC: we now have only one implementation of stealing, so use it
139     schedule = kmp_sch_static_steal;
140   else
141 #endif
142     schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
143 
144   /* Pick up the nomerge/ordered bits from the scheduling type */
145   if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
146     pr->flags.nomerge = TRUE;
147     schedule =
148         (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
149   } else {
150     pr->flags.nomerge = FALSE;
151   }
152   pr->type_size = traits_t<T>::type_size; // remember the size of variables
153   if (kmp_ord_lower & schedule) {
154     pr->flags.ordered = TRUE;
155     schedule =
156         (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
157   } else {
158     pr->flags.ordered = FALSE;
159   }
160 
161   if (schedule == kmp_sch_static) {
162     schedule = __kmp_static;
163   } else {
164     if (schedule == kmp_sch_runtime) {
165       // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
166       // not specified)
167       schedule = team->t.t_sched.r_sched_type;
      // Map the generic schedule onto the concrete one selected by the global
      // controls (__kmp_guided, __kmp_static)
170       if (schedule == kmp_sch_guided_chunked) {
171         schedule = __kmp_guided;
172       } else if (schedule == kmp_sch_static) {
173         schedule = __kmp_static;
174       }
175       // Use the chunk size specified by OMP_SCHEDULE (or default if not
176       // specified)
177       chunk = team->t.t_sched.chunk;
178 #if USE_ITT_BUILD
179       if (cur_chunk)
180         *cur_chunk = chunk;
181 #endif
182 #ifdef KMP_DEBUG
183       {
184         char *buff;
185         // create format specifiers before the debug output
186         buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
187                                 "schedule:%%d chunk:%%%s\n",
188                                 traits_t<ST>::spec);
189         KD_TRACE(10, (buff, gtid, schedule, chunk));
190         __kmp_str_free(&buff);
191       }
192 #endif
193     } else {
194       if (schedule == kmp_sch_guided_chunked) {
195         schedule = __kmp_guided;
196       }
197       if (chunk <= 0) {
198         chunk = KMP_DEFAULT_CHUNK;
199       }
200     }
201 
202     if (schedule == kmp_sch_auto) {
      // the mapping of kmp_sch_auto onto a concrete schedule (__kmp_auto) is
      // set up in __kmp_do_serial_initialize()
204       schedule = __kmp_auto;
205 #ifdef KMP_DEBUG
206       {
207         char *buff;
208         // create format specifiers before the debug output
209         buff = __kmp_str_format(
210             "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
211             "schedule:%%d chunk:%%%s\n",
212             traits_t<ST>::spec);
213         KD_TRACE(10, (buff, gtid, schedule, chunk));
214         __kmp_str_free(&buff);
215       }
216 #endif
217     }
218 
    /* guided analytical is not safe for very large thread counts */
220     if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
221       schedule = kmp_sch_guided_iterative_chunked;
222       KMP_WARNING(DispatchManyThreads);
223     }
224 #if OMP_45_ENABLED
225     if (schedule == kmp_sch_runtime_simd) {
226       // compiler provides simd_width in the chunk parameter
227       schedule = team->t.t_sched.r_sched_type;
      // Map the generic schedule onto the simd-specific one selected by the
      // global controls
230       if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
231           schedule == __kmp_static) {
232         schedule = kmp_sch_static_balanced_chunked;
233       } else {
234         if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
235           schedule = kmp_sch_guided_simd;
236         }
237         chunk = team->t.t_sched.chunk * chunk;
238       }
239 #if USE_ITT_BUILD
240       if (cur_chunk)
241         *cur_chunk = chunk;
242 #endif
243 #ifdef KMP_DEBUG
244       {
245         char *buff;
246         // create format specifiers before the debug output
247         buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d"
248                                 " chunk:%%%s\n",
249                                 traits_t<ST>::spec);
250         KD_TRACE(10, (buff, gtid, schedule, chunk));
251         __kmp_str_free(&buff);
252       }
253 #endif
254     }
255 #endif // OMP_45_ENABLED
256     pr->u.p.parm1 = chunk;
257   }
258   KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
259               "unknown scheduling type");
260 
261   pr->u.p.count = 0;
262 
263   if (__kmp_env_consistency_check) {
264     if (st == 0) {
265       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
266                             (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
267     }
268   }
269   // compute trip count
270   if (st == 1) { // most common case
271     if (ub >= lb) {
272       tc = ub - lb + 1;
273     } else { // ub < lb
274       tc = 0; // zero-trip
275     }
276   } else if (st < 0) {
277     if (lb >= ub) {
278       // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
279       // where the division needs to be unsigned regardless of the result type
280       tc = (UT)(lb - ub) / (-st) + 1;
281     } else { // lb < ub
282       tc = 0; // zero-trip
283     }
284   } else { // st > 0
285     if (ub >= lb) {
286       // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
287       // where the division needs to be unsigned regardless of the result type
288       tc = (UT)(ub - lb) / st + 1;
289     } else { // ub < lb
290       tc = 0; // zero-trip
291     }
292   }
293 
294   pr->u.p.lb = lb;
295   pr->u.p.ub = ub;
296   pr->u.p.st = st;
297   pr->u.p.tc = tc;
298 
299 #if KMP_OS_WINDOWS
300   pr->u.p.last_upper = ub + st;
301 #endif /* KMP_OS_WINDOWS */
302 
  /* NOTE: only active parallel regions have active ordered sections */
304 
305   if (active) {
306     if (pr->flags.ordered) {
307       pr->ordered_bumped = 0;
308       pr->u.p.ordered_lower = 1;
309       pr->u.p.ordered_upper = 0;
310     }
311   }
312 
313   switch (schedule) {
314 #if (KMP_STATIC_STEAL_ENABLED)
315   case kmp_sch_static_steal: {
316     T ntc, init;
317 
318     KD_TRACE(100,
319              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
320               gtid));
321 
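    // ntc = ceil(tc / chunk) is the number of chunk-sized pieces.  Below they
    // are split as evenly as possible among the nproc threads: each gets
    // small_chunk pieces and the first 'extras' threads get one extra.
    // 'count' is this thread's first chunk index and 'ub' is one past its
    // last (stealing adjusts both later).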
322     ntc = (tc % chunk ? 1 : 0) + tc / chunk;
323     if (nproc > 1 && ntc >= nproc) {
324       KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
325       T id = tid;
326       T small_chunk, extras;
327 
328       small_chunk = ntc / nproc;
329       extras = ntc % nproc;
330 
331       init = id * small_chunk + (id < extras ? id : extras);
332       pr->u.p.count = init;
333       pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
334 
335       pr->u.p.parm2 = lb;
336       // pr->pfields.parm3 = 0; // it's not used in static_steal
337       pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
338       pr->u.p.st = st;
339       if (traits_t<T>::type_size > 4) {
340         // AC: TODO: check if 16-byte CAS available and use it to
341         // improve performance (probably wait for explicit request
342         // before spending time on this).
343         // For now use dynamically allocated per-thread lock,
344         // free memory in __kmp_dispatch_next when status==0.
345         KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
346         th->th.th_dispatch->th_steal_lock =
347             (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
348         __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
349       }
350       break;
351     } else {
352       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
353                      "kmp_sch_static_balanced\n",
354                      gtid));
355       schedule = kmp_sch_static_balanced;
356       /* too few iterations: fall-through to kmp_sch_static_balanced */
357     } // if
358     /* FALL-THROUGH to static balanced */
359   } // case
360 #endif
361   case kmp_sch_static_balanced: {
362     T init, limit;
363 
364     KD_TRACE(
365         100,
366         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
367          gtid));
368 
369     if (nproc > 1) {
370       T id = tid;
371 
372       if (tc < nproc) {
373         if (id < tc) {
374           init = id;
375           limit = id;
376           pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
377         } else {
378           pr->u.p.count = 1; /* means no more chunks to execute */
379           pr->u.p.parm1 = FALSE;
380           break;
381         }
382       } else {
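        // tc >= nproc: distribute the iterations as evenly as possible; each
        // thread gets small_chunk iterations and the first 'extras' threads
        // get one extra (init and limit are inclusive iteration indices).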
383         T small_chunk = tc / nproc;
384         T extras = tc % nproc;
385         init = id * small_chunk + (id < extras ? id : extras);
386         limit = init + small_chunk - (id < extras ? 0 : 1);
387         pr->u.p.parm1 = (id == nproc - 1);
388       }
389     } else {
390       if (tc > 0) {
391         init = 0;
392         limit = tc - 1;
393         pr->u.p.parm1 = TRUE;
394       } else {
395         // zero trip count
396         pr->u.p.count = 1; /* means no more chunks to execute */
397         pr->u.p.parm1 = FALSE;
398         break;
399       }
400     }
401 #if USE_ITT_BUILD
402     // Calculate chunk for metadata report
403     if (itt_need_metadata_reporting)
404       if (cur_chunk)
405         *cur_chunk = limit - init + 1;
406 #endif
407     if (st == 1) {
408       pr->u.p.lb = lb + init;
409       pr->u.p.ub = lb + limit;
410     } else {
      // ub_tmp is the calculated upper bound; "ub" is the user-defined one
412       T ub_tmp = lb + limit * st;
413       pr->u.p.lb = lb + init * st;
414       // adjust upper bound to "ub" if needed, so that MS lastprivate will match
415       // it exactly
416       if (st > 0) {
417         pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
418       } else {
419         pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
420       }
421     }
422     if (pr->flags.ordered) {
423       pr->u.p.ordered_lower = init;
424       pr->u.p.ordered_upper = limit;
425     }
426     break;
427   } // case
428 #if OMP_45_ENABLED
429   case kmp_sch_static_balanced_chunked: {
430     // similar to balanced, but chunk adjusted to multiple of simd width
431     T nth = nproc;
432     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
433                    " -> falling-through to static_greedy\n",
434                    gtid));
435     schedule = kmp_sch_static_greedy;
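    // Give each thread ceil(tc / nth) iterations, rounded up to a multiple of
    // chunk (the simd width); the bit-mask rounding assumes chunk is a power
    // of two.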
436     if (nth > 1)
437       pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
438     else
439       pr->u.p.parm1 = tc;
440     break;
441   } // case
442   case kmp_sch_guided_simd:
443 #endif // OMP_45_ENABLED
444   case kmp_sch_guided_iterative_chunked: {
445     KD_TRACE(
446         100,
447         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
448          " case\n",
449          gtid));
450 
451     if (nproc > 1) {
452       if ((2L * chunk + 1) * nproc >= tc) {
453         /* chunk size too large, switch to dynamic */
454         schedule = kmp_sch_dynamic_chunked;
455       } else {
456         // when remaining iters become less than parm2 - switch to dynamic
457         pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
458         *(double *)&pr->u.p.parm3 =
459             guided_flt_param / nproc; // may occupy parm3 and parm4
460       }
461     } else {
462       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
463                      "kmp_sch_static_greedy\n",
464                      gtid));
465       schedule = kmp_sch_static_greedy;
466       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
467       KD_TRACE(
468           100,
469           ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
470            gtid));
471       pr->u.p.parm1 = tc;
472     } // if
473   } // case
474   break;
475   case kmp_sch_guided_analytical_chunked: {
476     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
477                    "kmp_sch_guided_analytical_chunked case\n",
478                    gtid));
479 
480     if (nproc > 1) {
481       if ((2L * chunk + 1) * nproc >= tc) {
482         /* chunk size too large, switch to dynamic */
483         schedule = kmp_sch_dynamic_chunked;
484       } else {
485         /* commonly used term: (2 nproc - 1)/(2 nproc) */
486         DBL x;
487 
488 #if KMP_OS_WINDOWS && KMP_ARCH_X86
489         /* Linux* OS already has 64-bit computation by default for long double,
490            and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
491            Windows* OS on IA-32 architecture, we need to set precision to 64-bit
492            instead of the default 53-bit. Even though long double doesn't work
493            on Windows* OS on Intel(R) 64, the resulting lack of precision is not
494            expected to impact the correctness of the algorithm, but this has not
495            been mathematically proven. */
496         // save original FPCW and set precision to 64-bit, as
497         // Windows* OS on IA-32 architecture defaults to 53-bit
498         unsigned int oldFpcw = _control87(0, 0);
499         _control87(_PC_64, _MCW_PC); // 0,0x30000
500 #endif
501         /* value used for comparison in solver for cross-over point */
502         long double target = ((long double)chunk * 2 + 1) * nproc / tc;
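        // Note: x^i <= target is algebraically the same as
        // x^i * tc / (2 * nproc) <= chunk + 1/2, i.e. (roughly) the point
        // where the guided chunk size has decayed to the specified chunk.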
503 
504         /* crossover point--chunk indexes equal to or greater than
505            this point switch to dynamic-style scheduling */
506         UT cross;
507 
508         /* commonly used term: (2 nproc - 1)/(2 nproc) */
509         x = (long double)1.0 - (long double)0.5 / nproc;
510 
511 #ifdef KMP_DEBUG
512         { // test natural alignment
513           struct _test_a {
514             char a;
515             union {
516               char b;
517               DBL d;
518             };
519           } t;
520           ptrdiff_t natural_alignment =
521               (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
522           //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
523           // long)natural_alignment );
524           KMP_DEBUG_ASSERT(
525               (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
526         }
527 #endif // KMP_DEBUG
528 
529         /* save the term in thread private dispatch structure */
530         *(DBL *)&pr->u.p.parm3 = x;
531 
532         /* solve for the crossover point to the nearest integer i for which C_i
533            <= chunk */
534         {
535           UT left, right, mid;
536           long double p;
537 
538           /* estimate initial upper and lower bound */
539 
540           /* doesn't matter what value right is as long as it is positive, but
541              it affects performance of the solver */
542           right = 229;
543           p = __kmp_pow<UT>(x, right);
544           if (p > target) {
545             do {
546               p *= p;
547               right <<= 1;
548             } while (p > target && right < (1 << 27));
549             /* lower bound is previous (failed) estimate of upper bound */
550             left = right >> 1;
551           } else {
552             left = 0;
553           }
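          /* The search above establishes x^left > target >= x^right (x^0 = 1
             exceeds target because (2*chunk+1)*nproc < tc on this path), up to
             the 1<<27 cap on the exponent.  The bisection below then converges
             to the smallest exponent 'cross' with x^cross <= target, which is
             what the assertion after it checks. */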
554 
555           /* bisection root-finding method */
556           while (left + 1 < right) {
557             mid = (left + right) / 2;
558             if (__kmp_pow<UT>(x, mid) > target) {
559               left = mid;
560             } else {
561               right = mid;
562             }
563           } // while
564           cross = right;
565         }
566         /* assert sanity of computed crossover point */
567         KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
568                    __kmp_pow<UT>(x, cross) <= target);
569 
570         /* save the crossover point in thread private dispatch structure */
571         pr->u.p.parm2 = cross;
572 
573 // C75803
574 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
575 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
576 #else
577 #define GUIDED_ANALYTICAL_WORKAROUND (x)
578 #endif
579         /* dynamic-style scheduling offset */
580         pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
581                                  tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
582                         cross * chunk;
583 #if KMP_OS_WINDOWS && KMP_ARCH_X86
584         // restore FPCW
585         _control87(oldFpcw, _MCW_PC);
586 #endif
587       } // if
588     } else {
589       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
590                      "kmp_sch_static_greedy\n",
591                      gtid));
592       schedule = kmp_sch_static_greedy;
593       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
594       pr->u.p.parm1 = tc;
595     } // if
596   } // case
597   break;
598   case kmp_sch_static_greedy:
599     KD_TRACE(
600         100,
601         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
602          gtid));
603     pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
604     break;
605   case kmp_sch_static_chunked:
606   case kmp_sch_dynamic_chunked:
607     if (pr->u.p.parm1 <= 0) {
608       pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
609     }
610     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
611                    "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
612                    gtid));
613     break;
614   case kmp_sch_trapezoidal: {
615     /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
616 
617     T parm1, parm2, parm3, parm4;
618     KD_TRACE(100,
619              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
620               gtid));
621 
622     parm1 = chunk;
623 
624     /* F : size of the first cycle */
625     parm2 = (tc / (2 * nproc));
626 
627     if (parm2 < 1) {
628       parm2 = 1;
629     }
630 
631     /* L : size of the last cycle.  Make sure the last cycle is not larger
632        than the first cycle. */
633     if (parm1 < 1) {
634       parm1 = 1;
635     } else if (parm1 > parm2) {
636       parm1 = parm2;
637     }
638 
639     /* N : number of cycles */
640     parm3 = (parm2 + parm1);
641     parm3 = (2 * tc + parm3 - 1) / parm3;
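    // i.e. N = ceil(2 * tc / (F + L)): N cycles whose sizes average
    // (F + L) / 2 are enough to cover all tc iterations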
642 
643     if (parm3 < 2) {
644       parm3 = 2;
645     }
646 
    /* sigma : the per-cycle decrement of the trapezoid's chunk size */
648     parm4 = (parm3 - 1);
649     parm4 = (parm2 - parm1) / parm4;
650 
651     // pointless check, because parm4 >= 0 always
652     // if ( parm4 < 0 ) {
653     //    parm4 = 0;
654     //}
655 
656     pr->u.p.parm1 = parm1;
657     pr->u.p.parm2 = parm2;
658     pr->u.p.parm3 = parm3;
659     pr->u.p.parm4 = parm4;
660   } // case
661   break;
662 
663   default: {
664     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
665                 KMP_HNT(GetNewerLibrary), // Hint
666                 __kmp_msg_null // Variadic argument list terminator
667                 );
668   } break;
669   } // switch
670   pr->schedule = schedule;
671 }
672 
673 #if KMP_USE_HIER_SCHED
674 template <typename T>
675 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
676                                              typename traits_t<T>::signed_t st);
677 template <>
678 inline void
679 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
680                                             kmp_int32 ub, kmp_int32 st) {
681   __kmp_dispatch_init_hierarchy<kmp_int32>(
682       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
683       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
684 }
685 template <>
686 inline void
687 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
688                                              kmp_uint32 ub, kmp_int32 st) {
689   __kmp_dispatch_init_hierarchy<kmp_uint32>(
690       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
691       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
692 }
693 template <>
694 inline void
695 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
696                                             kmp_int64 ub, kmp_int64 st) {
697   __kmp_dispatch_init_hierarchy<kmp_int64>(
698       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
699       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
700 }
701 template <>
702 inline void
703 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
704                                              kmp_uint64 ub, kmp_int64 st) {
705   __kmp_dispatch_init_hierarchy<kmp_uint64>(
706       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
707       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
708 }
709 
710 // free all the hierarchy scheduling memory associated with the team
711 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
712   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
713   for (int i = 0; i < num_disp_buff; ++i) {
714     // type does not matter here so use kmp_int32
715     auto sh =
716         reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
717             &team->t.t_disp_buffer[i]);
718     if (sh->hier) {
719       sh->hier->deallocate();
720       __kmp_free(sh->hier);
721     }
722   }
723 }
724 #endif
725 
726 // UT - unsigned flavor of T, ST - signed flavor of T,
727 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
728 template <typename T>
729 static void
730 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
731                     T ub, typename traits_t<T>::signed_t st,
732                     typename traits_t<T>::signed_t chunk, int push_ws) {
733   typedef typename traits_t<T>::unsigned_t UT;
734   typedef typename traits_t<T>::signed_t ST;
735   typedef typename traits_t<T>::floating_t DBL;
736 
737   int active;
738   kmp_info_t *th;
739   kmp_team_t *team;
740   kmp_uint32 my_buffer_index;
741   dispatch_private_info_template<T> *pr;
742   dispatch_shared_info_template<T> volatile *sh;
743 
744   KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
745                    sizeof(dispatch_private_info));
746   KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
747                    sizeof(dispatch_shared_info));
748 
749   if (!TCR_4(__kmp_init_parallel))
750     __kmp_parallel_initialize();
751 
752 #if INCLUDE_SSC_MARKS
753   SSC_MARK_DISPATCH_INIT();
754 #endif
755 #ifdef KMP_DEBUG
756   {
757     char *buff;
758     // create format specifiers before the debug output
759     buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
760                             "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
761                             traits_t<ST>::spec, traits_t<T>::spec,
762                             traits_t<T>::spec, traits_t<ST>::spec);
763     KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
764     __kmp_str_free(&buff);
765   }
766 #endif
767   /* setup data */
768   th = __kmp_threads[gtid];
769   team = th->th.th_team;
770   active = !team->t.t_serialized;
771   th->th.th_ident = loc;
772 
  // Any half-decent optimizer will remove this test when the blocks are empty
  // since the macros expand to nothing when statistics are disabled.
776   if (schedule == __kmp_static) {
777     KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
778   } else {
779     KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
780   }
781 
782 #if KMP_USE_HIER_SCHED
  // Initialize the scheduling hierarchy if requested via the OMP_SCHEDULE
  // environment variable.  Hierarchical scheduling does not work with ordered
  // loops, so if ordered is detected, revert to the standard threaded
  // scheduling.
786   bool ordered;
787   enum sched_type my_sched = schedule;
788   my_buffer_index = th->th.th_dispatch->th_disp_index;
789   pr = reinterpret_cast<dispatch_private_info_template<T> *>(
790       &th->th.th_dispatch
791            ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
792   my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
793   if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
794     my_sched =
795         (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
796   ordered = (kmp_ord_lower & my_sched);
797   if (pr->flags.use_hier) {
798     if (ordered) {
799       KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected.  "
800                      "Disabling hierarchical scheduling.\n",
801                      gtid));
802       pr->flags.use_hier = FALSE;
803     }
804   }
805   if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
806     // Don't use hierarchical for ordered parallel loops and don't
807     // use the runtime hierarchy if one was specified in the program
808     if (!ordered && !pr->flags.use_hier)
809       __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
810   }
811 #endif // KMP_USE_HIER_SCHED
812 
813 #if USE_ITT_BUILD
814   kmp_uint64 cur_chunk = chunk;
815   int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
816                                     __kmp_forkjoin_frames_mode == 3 &&
817                                     KMP_MASTER_GTID(gtid) &&
818 #if OMP_40_ENABLED
819                                     th->th.th_teams_microtask == NULL &&
820 #endif
821                                     team->t.t_active_level == 1;
822 #endif
823   if (!active) {
824     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
825         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
826   } else {
827     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
828                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
829 
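    // th_disp_index counts dispatched loops; the private and shared dispatch
    // buffers are selected round-robin (modulo __kmp_dispatch_num_buffers),
    // and the wait on sh->buffer_index below keeps this thread from grabbing
    // a shared buffer that may still be in use.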
830     my_buffer_index = th->th.th_dispatch->th_disp_index++;
831 
    /* What happens when the number of threads changes? Does the buffer need
       to be resized? */
833     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
834         &th->th.th_dispatch
835              ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
836     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
837         &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
838     KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
839                   my_buffer_index));
840   }
841 
842   __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
843 #if USE_ITT_BUILD
844                                 &cur_chunk,
845 #endif
846                                 chunk, (T)th->th.th_team_nproc,
847                                 (T)th->th.th_info.ds.ds_tid);
848   if (active) {
849     if (pr->flags.ordered == 0) {
850       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
851       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
852     } else {
853       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
854       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
855     }
856   }
857 
858   if (active) {
    /* Wait below until the shared buffer's index reaches my_buffer_index,
     * i.e., until this buffer is free for us to use. */
861 
862     KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
863                    "sh->buffer_index:%d\n",
864                    gtid, my_buffer_index, sh->buffer_index));
865     __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index,
866                                  __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    // Note: KMP_WAIT_YIELD() cannot be used here because the buffer index and
    // my_buffer_index are *always* 32-bit integers.
869     KMP_MB(); /* is this necessary? */
870     KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
871                    "sh->buffer_index:%d\n",
872                    gtid, my_buffer_index, sh->buffer_index));
873 
874     th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
875     th->th.th_dispatch->th_dispatch_sh_current =
876         CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
877 #if USE_ITT_BUILD
878     if (pr->flags.ordered) {
879       __kmp_itt_ordered_init(gtid);
880     }
881     // Report loop metadata
882     if (itt_need_metadata_reporting) {
883       // Only report metadata by master of active team at level 1
884       kmp_uint64 schedtype = 0;
885       switch (schedule) {
886       case kmp_sch_static_chunked:
887       case kmp_sch_static_balanced: // Chunk is calculated in the switch above
888         break;
889       case kmp_sch_static_greedy:
890         cur_chunk = pr->u.p.parm1;
891         break;
892       case kmp_sch_dynamic_chunked:
893         schedtype = 1;
894         break;
895       case kmp_sch_guided_iterative_chunked:
896       case kmp_sch_guided_analytical_chunked:
897 #if OMP_45_ENABLED
898       case kmp_sch_guided_simd:
899 #endif
900         schedtype = 2;
901         break;
902       default:
903         // Should we put this case under "static"?
904         // case kmp_sch_static_steal:
905         schedtype = 3;
906         break;
907       }
908       __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
909     }
910 #if KMP_USE_HIER_SCHED
911     if (pr->flags.use_hier) {
912       pr->u.p.count = 0;
913       pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
914     }
#endif // KMP_USE_HIER_SCHED
916 #endif /* USE_ITT_BUILD */
917   }
918 
919 #ifdef KMP_DEBUG
920   {
921     char *buff;
922     // create format specifiers before the debug output
923     buff = __kmp_str_format(
924         "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
925         "lb:%%%s ub:%%%s"
926         " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
927         " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
928         traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
929         traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
930         traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
931         traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
932     KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
933                   pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
934                   pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
935                   pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
936     __kmp_str_free(&buff);
937   }
938 #endif
939 #if (KMP_STATIC_STEAL_ENABLED)
  // It cannot be guaranteed that after execution of a loop with some other
  // schedule kind all the parm3 variables will contain the same value. Even if
  // they did, a bad case could still arise, e.g. flipping between 0 and 1
  // instead of a program-lifetime increment. So a dedicated variable is
  // required; 'static_steal_counter' serves that purpose.
945   if (schedule == kmp_sch_static_steal) {
    // Other threads will inspect this variable when searching for a victim.
    // It acts as a flag indicating that, from this point on, other threads
    // may steal from this thread.
949     volatile T *p = &pr->u.p.static_steal_counter;
950     *p = *p + 1;
951   }
952 #endif // ( KMP_STATIC_STEAL_ENABLED )
953 
954 #if OMPT_SUPPORT && OMPT_OPTIONAL
955   if (ompt_enabled.ompt_callback_work) {
956     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
957     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
958     ompt_callbacks.ompt_callback(ompt_callback_work)(
959         ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
960         &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
961   }
962 #endif
963   KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
964 }
965 
966 /* For ordered loops, either __kmp_dispatch_finish() should be called after
967  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
968  * every chunk of iterations.  If the ordered section(s) were not executed
969  * for this iteration (or every iteration in this chunk), we need to set the
970  * ordered iteration counters so that the next thread can proceed. */
971 template <typename UT>
972 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
973   typedef typename traits_t<UT>::signed_t ST;
974   kmp_info_t *th = __kmp_threads[gtid];
975 
976   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
977   if (!th->th.th_team->t.t_serialized) {
978 
979     dispatch_private_info_template<UT> *pr =
980         reinterpret_cast<dispatch_private_info_template<UT> *>(
981             th->th.th_dispatch->th_dispatch_pr_current);
982     dispatch_shared_info_template<UT> volatile *sh =
983         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
984             th->th.th_dispatch->th_dispatch_sh_current);
985     KMP_DEBUG_ASSERT(pr);
986     KMP_DEBUG_ASSERT(sh);
987     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
988                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
989 
990     if (pr->ordered_bumped) {
991       KD_TRACE(
992           1000,
993           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
994            gtid));
995       pr->ordered_bumped = 0;
996     } else {
997       UT lower = pr->u.p.ordered_lower;
998 
999 #ifdef KMP_DEBUG
1000       {
1001         char *buff;
1002         // create format specifiers before the debug output
1003         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1004                                 "ordered_iteration:%%%s lower:%%%s\n",
1005                                 traits_t<UT>::spec, traits_t<UT>::spec);
1006         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1007         __kmp_str_free(&buff);
1008       }
1009 #endif
1010 
1011       __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1012                            __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1013       KMP_MB(); /* is this necessary? */
1014 #ifdef KMP_DEBUG
1015       {
1016         char *buff;
1017         // create format specifiers before the debug output
1018         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1019                                 "ordered_iteration:%%%s lower:%%%s\n",
1020                                 traits_t<UT>::spec, traits_t<UT>::spec);
1021         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1022         __kmp_str_free(&buff);
1023       }
1024 #endif
1025 
1026       test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1027     } // if
1028   } // if
1029   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1030 }
1031 
1032 #ifdef KMP_GOMP_COMPAT
1033 
1034 template <typename UT>
1035 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1036   typedef typename traits_t<UT>::signed_t ST;
1037   kmp_info_t *th = __kmp_threads[gtid];
1038 
1039   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1040   if (!th->th.th_team->t.t_serialized) {
1041     //        int cid;
1042     dispatch_private_info_template<UT> *pr =
1043         reinterpret_cast<dispatch_private_info_template<UT> *>(
1044             th->th.th_dispatch->th_dispatch_pr_current);
1045     dispatch_shared_info_template<UT> volatile *sh =
1046         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1047             th->th.th_dispatch->th_dispatch_sh_current);
1048     KMP_DEBUG_ASSERT(pr);
1049     KMP_DEBUG_ASSERT(sh);
1050     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1051                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1052 
1053     //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1054     UT lower = pr->u.p.ordered_lower;
1055     UT upper = pr->u.p.ordered_upper;
1056     UT inc = upper - lower + 1;
1057 
1058     if (pr->ordered_bumped == inc) {
1059       KD_TRACE(
1060           1000,
1061           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1062            gtid));
1063       pr->ordered_bumped = 0;
1064     } else {
1065       inc -= pr->ordered_bumped;
1066 
1067 #ifdef KMP_DEBUG
1068       {
1069         char *buff;
1070         // create format specifiers before the debug output
1071         buff = __kmp_str_format(
1072             "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1073             "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1074             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1075         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1076         __kmp_str_free(&buff);
1077       }
1078 #endif
1079 
1080       __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1081                            __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1082 
1083       KMP_MB(); /* is this necessary? */
1084       KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1085                       "ordered_bumped to zero\n",
1086                       gtid));
1087       pr->ordered_bumped = 0;
// TODO: check whether 'inc' should be unsigned or signed
1089 #ifdef KMP_DEBUG
1090       {
1091         char *buff;
1092         // create format specifiers before the debug output
1093         buff = __kmp_str_format(
1094             "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1095             "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1096             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1097             traits_t<UT>::spec);
1098         KD_TRACE(1000,
1099                  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1100         __kmp_str_free(&buff);
1101       }
1102 #endif
1103 
1104       test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1105     }
1106     //        }
1107   }
1108   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1109 }
1110 
1111 #endif /* KMP_GOMP_COMPAT */
1112 
1113 template <typename T>
1114 int __kmp_dispatch_next_algorithm(int gtid,
1115                                   dispatch_private_info_template<T> *pr,
1116                                   dispatch_shared_info_template<T> volatile *sh,
1117                                   kmp_int32 *p_last, T *p_lb, T *p_ub,
1118                                   typename traits_t<T>::signed_t *p_st, T nproc,
1119                                   T tid) {
1120   typedef typename traits_t<T>::unsigned_t UT;
1121   typedef typename traits_t<T>::signed_t ST;
1122   typedef typename traits_t<T>::floating_t DBL;
1123   int status = 0;
1124   kmp_int32 last = 0;
1125   T start;
1126   ST incr;
1127   UT limit, trip, init;
1128   kmp_info_t *th = __kmp_threads[gtid];
1129   kmp_team_t *team = th->th.th_team;
1130 
1131   KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1132                    &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1133   KMP_DEBUG_ASSERT(pr);
1134   KMP_DEBUG_ASSERT(sh);
1135   KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1136 #ifdef KMP_DEBUG
1137   {
1138     char *buff;
1139     // create format specifiers before the debug output
1140     buff =
1141         __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1142                          "sh:%%p nproc:%%%s tid:%%%s\n",
1143                          traits_t<T>::spec, traits_t<T>::spec);
1144     KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1145     __kmp_str_free(&buff);
1146   }
1147 #endif
1148 
1149   // zero trip count
1150   if (pr->u.p.tc == 0) {
1151     KD_TRACE(10,
1152              ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1153               "zero status:%d\n",
1154               gtid, status));
1155     return 0;
1156   }
1157 
1158   switch (pr->schedule) {
1159 #if (KMP_STATIC_STEAL_ENABLED)
1160   case kmp_sch_static_steal: {
1161     T chunk = pr->u.p.parm1;
1162 
1163     KD_TRACE(100,
1164              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1165               gtid));
1166 
1167     trip = pr->u.p.tc - 1;
1168 
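    // For static_steal, pr->u.p.count is this thread's next chunk index and
    // pr->u.p.ub is one past its last chunk (both set at init time); claiming
    // a chunk advances count, while being stolen from lowers ub.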
1169     if (traits_t<T>::type_size > 4) {
1170       // use lock for 8-byte and CAS for 4-byte induction
1171       // variable. TODO (optional): check and use 16-byte CAS
1172       kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
1173       KMP_DEBUG_ASSERT(lck != NULL);
1174       if (pr->u.p.count < (UT)pr->u.p.ub) {
1175         __kmp_acquire_lock(lck, gtid);
1176         // try to get own chunk of iterations
1177         init = (pr->u.p.count)++;
1178         status = (init < (UT)pr->u.p.ub);
1179         __kmp_release_lock(lck, gtid);
1180       } else {
1181         status = 0; // no own chunks
1182       }
1183       if (!status) { // try to steal
1184         kmp_info_t **other_threads = team->t.t_threads;
1185         int while_limit = nproc; // nproc attempts to find a victim
1186         int while_index = 0;
        // TODO: the victim-search algorithm should be cleaned up and measured
1189         while ((!status) && (while_limit != ++while_index)) {
1190           T remaining;
1191           T victimIdx = pr->u.p.parm4;
1192           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1193           dispatch_private_info_template<T> *victim =
1194               reinterpret_cast<dispatch_private_info_template<T> *>(
1195                   other_threads[victimIdx]
1196                       ->th.th_dispatch->th_dispatch_pr_current);
1197           while ((victim == NULL || victim == pr ||
1198                   (*(volatile T *)&victim->u.p.static_steal_counter !=
1199                    *(volatile T *)&pr->u.p.static_steal_counter)) &&
1200                  oldVictimIdx != victimIdx) {
1201             victimIdx = (victimIdx + 1) % nproc;
1202             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1203                 other_threads[victimIdx]
1204                     ->th.th_dispatch->th_dispatch_pr_current);
1205           }
1206           if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1207                           *(volatile T *)&pr->u.p.static_steal_counter)) {
1208             continue; // try once more (nproc attempts in total)
1209             // no victim is ready yet to participate in stealing
1210             // because all victims are still in kmp_init_dispatch
1211           }
1212           if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1213             pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1214             continue; // not enough chunks to steal, goto next victim
1215           }
1216 
1217           lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1218           KMP_ASSERT(lck != NULL);
1219           __kmp_acquire_lock(lck, gtid);
1220           limit = victim->u.p.ub; // keep initial ub
1221           if (victim->u.p.count >= limit ||
1222               (remaining = limit - victim->u.p.count) < 2) {
1223             __kmp_release_lock(lck, gtid);
1224             pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1225             continue; // not enough chunks to steal
1226           }
          // stealing succeeded, reduce victim's ub by 1/4 of the remaining
          // chunks or by 1
1229           if (remaining > 3) {
1230             // steal 1/4 of remaining
1231             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1232             init = (victim->u.p.ub -= (remaining >> 2));
1233           } else {
1234             // steal 1 chunk of 2 or 3 remaining
1235             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1236             init = (victim->u.p.ub -= 1);
1237           }
1238           __kmp_release_lock(lck, gtid);
1239 
1240           KMP_DEBUG_ASSERT(init + 1 <= limit);
1241           pr->u.p.parm4 = victimIdx; // remember victim to steal from
1242           status = 1;
1243           while_index = 0;
          // now update own count and ub with the stolen range, excluding the
          // init chunk which this thread executes right away
1245           __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1246           pr->u.p.count = init + 1;
1247           pr->u.p.ub = limit;
1248           __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1249         } // while (search for victim)
1250       } // if (try to find victim and steal)
1251     } else {
1252       // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1253       typedef union {
1254         struct {
1255           UT count;
1256           T ub;
1257         } p;
1258         kmp_int64 b;
1259       } union_i4;
1260       // All operations on 'count' or 'ub' must be combined atomically
1261       // together.
1262       {
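        // Claim the next of this thread's own chunks by atomically bumping
        // the packed 'count' field with a 64-bit CAS; 'init' receives the
        // pre-increment chunk index.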
1263         union_i4 vold, vnew;
1264         vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1265         vnew = vold;
1266         vnew.p.count++;
1267         while (!KMP_COMPARE_AND_STORE_ACQ64(
1268             (volatile kmp_int64 *)&pr->u.p.count,
1269             *VOLATILE_CAST(kmp_int64 *) & vold.b,
1270             *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1271           KMP_CPU_PAUSE();
1272           vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1273           vnew = vold;
1274           vnew.p.count++;
1275         }
1276         vnew = vold;
1277         init = vnew.p.count;
1278         status = (init < (UT)vnew.p.ub);
1279       }
1280 
1281       if (!status) {
1282         kmp_info_t **other_threads = team->t.t_threads;
1283         int while_limit = nproc; // nproc attempts to find a victim
1284         int while_index = 0;
1285 
        // TODO: the victim-search algorithm should be cleaned up and measured
1288         while ((!status) && (while_limit != ++while_index)) {
1289           union_i4 vold, vnew;
1290           kmp_int32 remaining;
1291           T victimIdx = pr->u.p.parm4;
1292           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1293           dispatch_private_info_template<T> *victim =
1294               reinterpret_cast<dispatch_private_info_template<T> *>(
1295                   other_threads[victimIdx]
1296                       ->th.th_dispatch->th_dispatch_pr_current);
1297           while ((victim == NULL || victim == pr ||
1298                   (*(volatile T *)&victim->u.p.static_steal_counter !=
1299                    *(volatile T *)&pr->u.p.static_steal_counter)) &&
1300                  oldVictimIdx != victimIdx) {
1301             victimIdx = (victimIdx + 1) % nproc;
1302             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1303                 other_threads[victimIdx]
1304                     ->th.th_dispatch->th_dispatch_pr_current);
1305           }
1306           if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1307                           *(volatile T *)&pr->u.p.static_steal_counter)) {
1308             continue; // try once more (nproc attempts in total)
1309             // no victim is ready yet to participate in stealing
1310             // because all victims are still in kmp_init_dispatch
1311           }
1312           pr->u.p.parm4 = victimIdx; // new victim found
1313           while (1) { // CAS loop if victim has enough chunks to steal
1314             vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1315             vnew = vold;
1316 
1317             KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1318             if (vnew.p.count >= (UT)vnew.p.ub ||
1319                 (remaining = vnew.p.ub - vnew.p.count) < 2) {
1320               pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1321               break; // not enough chunks to steal, goto next victim
1322             }
1323             if (remaining > 3) {
1324               vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
1325             } else {
1326               vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1327             }
1328             KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1329             // TODO: Should this be acquire or release?
1330             if (KMP_COMPARE_AND_STORE_ACQ64(
1331                     (volatile kmp_int64 *)&victim->u.p.count,
1332                     *VOLATILE_CAST(kmp_int64 *) & vold.b,
1333                     *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
              // stealing succeeded
1335               KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1336                                         vold.p.ub - vnew.p.ub);
1337               status = 1;
1338               while_index = 0;
1339               // now update own count and ub
1340               init = vnew.p.ub;
1341               vold.p.count = init + 1;
1342 #if KMP_ARCH_X86
1343               KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1344 #else
1345               *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1346 #endif
1347               break;
1348             } // if (check CAS result)
            KMP_CPU_PAUSE(); // CAS failed, repeat the attempt
1350           } // while (try to steal from particular victim)
1351         } // while (search for victim)
1352       } // if (try to find victim and steal)
1353     } // if (4-byte induction variable)
1354     if (!status) {
1355       *p_lb = 0;
1356       *p_ub = 0;
1357       if (p_st != NULL)
1358         *p_st = 0;
1359     } else {
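      // A chunk with index 'init' was claimed or stolen; turn it into the
      // iteration range [init * chunk, init * chunk + chunk - 1], clip it to
      // the trip count, and map it onto the user's loop below using the saved
      // lower bound (parm2) and stride.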
1360       start = pr->u.p.parm2;
1361       init *= chunk;
1362       limit = chunk + init - 1;
1363       incr = pr->u.p.st;
1364       KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1365 
1366       KMP_DEBUG_ASSERT(init <= trip);
1367       if ((last = (limit >= trip)) != 0)
1368         limit = trip;
1369       if (p_st != NULL)
1370         *p_st = incr;
1371 
1372       if (incr == 1) {
1373         *p_lb = start + init;
1374         *p_ub = start + limit;
1375       } else {
1376         *p_lb = start + init * incr;
1377         *p_ub = start + limit * incr;
1378       }
1379 
1380       if (pr->flags.ordered) {
1381         pr->u.p.ordered_lower = init;
1382         pr->u.p.ordered_upper = limit;
1383       } // if
1384     } // if
1385     break;
1386   } // case
1387 #endif // ( KMP_STATIC_STEAL_ENABLED )
1388   case kmp_sch_static_balanced: {
1389     KD_TRACE(
1390         10,
1391         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1392          gtid));
1393     /* check if thread has any iteration to do */
1394     if ((status = !pr->u.p.count) != 0) {
1395       pr->u.p.count = 1;
1396       *p_lb = pr->u.p.lb;
1397       *p_ub = pr->u.p.ub;
1398       last = pr->u.p.parm1;
1399       if (p_st != NULL)
1400         *p_st = pr->u.p.st;
1401     } else { /* no iterations to do */
1402       pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1403     }
1404   } // case
1405   break;
1406   case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1407                                  merged here */
1408   case kmp_sch_static_chunked: {
1409     T parm1;
1410 
1411     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1412                    "kmp_sch_static_[affinity|chunked] case\n",
1413                    gtid));
1414     parm1 = pr->u.p.parm1;
1415 
1416     trip = pr->u.p.tc - 1;
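    // Thread 'tid' takes the chunk with index (count + tid); count advances
    // by nproc per chunk taken, giving a cyclic distribution of parm1-sized
    // blocks.  For static_greedy, parm1 is the whole per-thread share, so
    // each thread gets at most one block.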
1417     init = parm1 * (pr->u.p.count + tid);
1418 
1419     if ((status = (init <= trip)) != 0) {
1420       start = pr->u.p.lb;
1421       incr = pr->u.p.st;
1422       limit = parm1 + init - 1;
1423 
1424       if ((last = (limit >= trip)) != 0)
1425         limit = trip;
1426 
1427       if (p_st != NULL)
1428         *p_st = incr;
1429 
1430       pr->u.p.count += nproc;
1431 
1432       if (incr == 1) {
1433         *p_lb = start + init;
1434         *p_ub = start + limit;
1435       } else {
1436         *p_lb = start + init * incr;
1437         *p_ub = start + limit * incr;
1438       }
1439 
1440       if (pr->flags.ordered) {
1441         pr->u.p.ordered_lower = init;
1442         pr->u.p.ordered_upper = limit;
1443       } // if
1444     } // if
1445   } // case
1446   break;
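  // Illustrative example for the chunked variant (values assumed): with
  // nproc = 4, tid = 2 and parm1 (the chunk) = 10, the first call computes
  // init = 10 * (0 + 2) = 20, i.e. iteration offsets 20..29 of the canonical
  // 0..tc-1 space (scaled by lb and st below), and bumps pr->u.p.count to 4;
  // the second call yields init = 10 * (4 + 2) = 60, i.e. offsets 60..69.
  // Each thread thus walks its round-robin chunks without any shared state.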
1447 
1448   case kmp_sch_dynamic_chunked: {
1449     T chunk = pr->u.p.parm1;
1450 
1451     KD_TRACE(
1452         100,
1453         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1454          gtid));
1455 
1456     init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1457     trip = pr->u.p.tc - 1;
1458 
1459     if ((status = (init <= trip)) == 0) {
1460       *p_lb = 0;
1461       *p_ub = 0;
1462       if (p_st != NULL)
1463         *p_st = 0;
1464     } else {
1465       start = pr->u.p.lb;
1466       limit = chunk + init - 1;
1467       incr = pr->u.p.st;
1468 
1469       if ((last = (limit >= trip)) != 0)
1470         limit = trip;
1471 
1472       if (p_st != NULL)
1473         *p_st = incr;
1474 
1475       if (incr == 1) {
1476         *p_lb = start + init;
1477         *p_ub = start + limit;
1478       } else {
1479         *p_lb = start + init * incr;
1480         *p_ub = start + limit * incr;
1481       }
1482 
1483       if (pr->flags.ordered) {
1484         pr->u.p.ordered_lower = init;
1485         pr->u.p.ordered_upper = limit;
1486       } // if
1487     } // if
1488   } // case
1489   break;
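  // Illustrative example (values assumed): with chunk = 5, a thread whose
  // test_then_inc_acq on sh->u.s.iteration returns 3 claims init = 15 and
  // limit = 19, i.e. iteration offsets 15..19 of the canonical 0..tc-1 space,
  // which are then scaled by lb and st into *p_lb / *p_ub.  The shared
  // counter is the only point of contention: one atomic increment per chunk.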
1490 
1491   case kmp_sch_guided_iterative_chunked: {
1492     T chunkspec = pr->u.p.parm1;
1493     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1494                    "iterative case\n",
1495                    gtid));
1496     trip = pr->u.p.tc;
1497     // Start atomic part of calculations
1498     while (1) {
1499       ST remaining; // signed, because can be < 0
1500       init = sh->u.s.iteration; // shared value
1501       remaining = trip - init;
1502       if (remaining <= 0) { // AC: need to compare with 0 first
1503         // nothing to do, don't try atomic op
1504         status = 0;
1505         break;
1506       }
1507       if ((T)remaining <
1508           pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
        // use dynamic-style schedule
        // atomically increment iterations, get old value
1511         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1512                                  (ST)chunkspec);
1513         remaining = trip - init;
1514         if (remaining <= 0) {
          status = 0; // all iterations were taken by other threads
1516         } else {
1517           // got some iterations to work on
1518           status = 1;
1519           if ((T)remaining > chunkspec) {
1520             limit = init + chunkspec - 1;
1521           } else {
1522             last = 1; // the last chunk
1523             limit = init + remaining - 1;
1524           } // if
1525         } // if
1526         break;
1527       } // if
1528       limit = init +
1529               (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
1530       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1531                                (ST)init, (ST)limit)) {
1532         // CAS was successful, chunk obtained
1533         status = 1;
1534         --limit;
1535         break;
1536       } // if
1537     } // while
1538     if (status != 0) {
1539       start = pr->u.p.lb;
1540       incr = pr->u.p.st;
1541       if (p_st != NULL)
1542         *p_st = incr;
1543       *p_lb = start + init * incr;
1544       *p_ub = start + limit * incr;
1545       if (pr->flags.ordered) {
1546         pr->u.p.ordered_lower = init;
1547         pr->u.p.ordered_upper = limit;
1548       } // if
1549     } else {
1550       *p_lb = 0;
1551       *p_ub = 0;
1552       if (p_st != NULL)
1553         *p_st = 0;
1554     } // if
1555   } // case
1556   break;
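  // Illustrative example (values assumed): with trip = 1000, nproc = 4 and
  // the default K = 2, parm3 holds a factor of roughly 1/(K*nproc) = 1/8 and
  // parm2 is on the order of K*nproc*(chunkspec+1), so the first thread to
  // win the CAS grabs about 1000/8 = 125 iterations, the next about
  // 875/8 = 109, and so on; once the remaining count drops below parm2 the
  // code falls back to plain dynamic chunks of size chunkspec.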
1557 
1558 #if OMP_45_ENABLED
1559   case kmp_sch_guided_simd: {
    // same as the iterative variant, but the current chunk is adjusted to be
    // a multiple of the given chunk
1562     T chunk = pr->u.p.parm1;
1563     KD_TRACE(100,
1564              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1565               gtid));
1566     trip = pr->u.p.tc;
1567     // Start atomic part of calculations
1568     while (1) {
1569       ST remaining; // signed, because can be < 0
1570       init = sh->u.s.iteration; // shared value
1571       remaining = trip - init;
1572       if (remaining <= 0) { // AC: need to compare with 0 first
1573         status = 0; // nothing to do, don't try atomic op
1574         break;
1575       }
1576       KMP_DEBUG_ASSERT(init % chunk == 0);
1577       // compare with K*nproc*(chunk+1), K=2 by default
1578       if ((T)remaining < pr->u.p.parm2) {
        // use dynamic-style schedule
        // atomically increment iterations, get old value
1581         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1582                                  (ST)chunk);
1583         remaining = trip - init;
1584         if (remaining <= 0) {
          status = 0; // all iterations were taken by other threads
1586         } else {
1587           // got some iterations to work on
1588           status = 1;
1589           if ((T)remaining > chunk) {
1590             limit = init + chunk - 1;
1591           } else {
1592             last = 1; // the last chunk
1593             limit = init + remaining - 1;
1594           } // if
1595         } // if
1596         break;
1597       } // if
1598       // divide by K*nproc
1599       UT span = remaining * (*(double *)&pr->u.p.parm3);
1600       UT rem = span % chunk;
1601       if (rem) // adjust so that span%chunk == 0
1602         span += chunk - rem;
1603       limit = init + span;
1604       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1605                                (ST)init, (ST)limit)) {
1606         // CAS was successful, chunk obtained
1607         status = 1;
1608         --limit;
1609         break;
1610       } // if
1611     } // while
1612     if (status != 0) {
1613       start = pr->u.p.lb;
1614       incr = pr->u.p.st;
1615       if (p_st != NULL)
1616         *p_st = incr;
1617       *p_lb = start + init * incr;
1618       *p_ub = start + limit * incr;
1619       if (pr->flags.ordered) {
1620         pr->u.p.ordered_lower = init;
1621         pr->u.p.ordered_upper = limit;
1622       } // if
1623     } else {
1624       *p_lb = 0;
1625       *p_ub = 0;
1626       if (p_st != NULL)
1627         *p_st = 0;
1628     } // if
1629   } // case
1630   break;
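  // Illustrative example of the rounding above (values assumed): with
  // chunk = 8 and a computed span of 13, rem = 13 % 8 = 5, so span is bumped
  // to 16.  Every grabbed range therefore starts on a multiple of the given
  // chunk, which is what the KMP_DEBUG_ASSERT(init % chunk == 0) above relies
  // on and what lets the vectorized body assume whole chunks except possibly
  // the last one.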
1631 #endif // OMP_45_ENABLED
1632 
1633   case kmp_sch_guided_analytical_chunked: {
1634     T chunkspec = pr->u.p.parm1;
1635     UT chunkIdx;
1636 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1637     /* for storing original FPCW value for Windows* OS on
1638        IA-32 architecture 8-byte version */
1639     unsigned int oldFpcw;
1640     unsigned int fpcwSet = 0;
1641 #endif
1642     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1643                    "kmp_sch_guided_analytical_chunked case\n",
1644                    gtid));
1645 
1646     trip = pr->u.p.tc;
1647 
1648     KMP_DEBUG_ASSERT(nproc > 1);
1649     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1650 
1651     while (1) { /* this while loop is a safeguard against unexpected zero
1652                    chunk sizes */
1653       chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1654       if (chunkIdx >= (UT)pr->u.p.parm2) {
1655         --trip;
1656         /* use dynamic-style scheduling */
1657         init = chunkIdx * chunkspec + pr->u.p.count;
1658         /* need to verify init > 0 in case of overflow in the above
1659          * calculation */
1660         if ((status = (init > 0 && init <= trip)) != 0) {
1661           limit = init + chunkspec - 1;
1662 
1663           if ((last = (limit >= trip)) != 0)
1664             limit = trip;
1665         }
1666         break;
1667       } else {
1668 /* use exponential-style scheduling */
/* The following check is a workaround for the lack of long double precision on
   Windows* OS.
   This check works around the possible effect that init != 0 for chunkIdx == 0.
 */
1673 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1674         /* If we haven't already done so, save original
1675            FPCW and set precision to 64-bit, as Windows* OS
1676            on IA-32 architecture defaults to 53-bit */
1677         if (!fpcwSet) {
1678           oldFpcw = _control87(0, 0);
1679           _control87(_PC_64, _MCW_PC);
1680           fpcwSet = 0x30000;
1681         }
1682 #endif
1683         if (chunkIdx) {
1684           init = __kmp_dispatch_guided_remaining<T>(
1685               trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1686           KMP_DEBUG_ASSERT(init);
1687           init = trip - init;
1688         } else
1689           init = 0;
1690         limit = trip - __kmp_dispatch_guided_remaining<T>(
1691                            trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1692         KMP_ASSERT(init <= limit);
1693         if (init < limit) {
1694           KMP_DEBUG_ASSERT(limit <= trip);
1695           --limit;
1696           status = 1;
1697           break;
1698         } // if
1699       } // if
1700     } // while (1)
1701 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1702     /* restore FPCW if necessary
1703        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1704     */
1705     if (fpcwSet && (oldFpcw & fpcwSet))
1706       _control87(oldFpcw, _MCW_PC);
1707 #endif
1708     if (status != 0) {
1709       start = pr->u.p.lb;
1710       incr = pr->u.p.st;
1711       if (p_st != NULL)
1712         *p_st = incr;
1713       *p_lb = start + init * incr;
1714       *p_ub = start + limit * incr;
1715       if (pr->flags.ordered) {
1716         pr->u.p.ordered_lower = init;
1717         pr->u.p.ordered_upper = limit;
1718       }
1719     } else {
1720       *p_lb = 0;
1721       *p_ub = 0;
1722       if (p_st != NULL)
1723         *p_st = 0;
1724     }
1725   } // case
1726   break;
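  // In short (illustrative summary): parm2 acts as the cross-over chunk
  // index.  While chunkIdx < parm2, chunk boundaries come from the
  // closed-form __kmp_dispatch_guided_remaining() curve (exponential phase);
  // from parm2 onward the code switches to plain chunkspec-sized dynamic
  // chunks, so the tail of the loop cannot degenerate into zero-sized chunks.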
1727 
1728   case kmp_sch_trapezoidal: {
1729     UT index;
1730     T parm2 = pr->u.p.parm2;
1731     T parm3 = pr->u.p.parm3;
1732     T parm4 = pr->u.p.parm4;
1733     KD_TRACE(100,
1734              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1735               gtid));
1736 
1737     index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1738 
1739     init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
1740     trip = pr->u.p.tc - 1;
1741 
1742     if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1743       *p_lb = 0;
1744       *p_ub = 0;
1745       if (p_st != NULL)
1746         *p_st = 0;
1747     } else {
1748       start = pr->u.p.lb;
1749       limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1750       incr = pr->u.p.st;
1751 
1752       if ((last = (limit >= trip)) != 0)
1753         limit = trip;
1754 
1755       if (p_st != NULL)
1756         *p_st = incr;
1757 
1758       if (incr == 1) {
1759         *p_lb = start + init;
1760         *p_ub = start + limit;
1761       } else {
1762         *p_lb = start + init * incr;
1763         *p_ub = start + limit * incr;
1764       }
1765 
1766       if (pr->flags.ordered) {
1767         pr->u.p.ordered_lower = init;
1768         pr->u.p.ordered_upper = limit;
1769       } // if
1770     } // if
1771   } // case
1772   break;
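  // Illustrative example (parameter meanings inferred from the formulas
  // above, with parm2 as the first chunk size and parm4 as the per-chunk
  // decrement): chunk k has size parm2 - k*parm4 and init is the
  // arithmetic-series prefix sum.  With parm2 = 10 and parm4 = 2 the chunk
  // sizes are 10, 8, 6, ...; index = 2 gives init = 2*(20 - 2)/2 = 18 and
  // limit = 3*(20 - 4)/2 - 1 = 23, i.e. iteration offsets 18..23 (a chunk of
  // 6), right after the 10 + 8 iterations handed out as chunks 0 and 1.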
1773   default: {
1774     status = 0; // to avoid complaints on uninitialized variable use
1775     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1776                 KMP_HNT(GetNewerLibrary), // Hint
1777                 __kmp_msg_null // Variadic argument list terminator
1778                 );
1779   } break;
1780   } // switch
1781   if (p_last)
1782     *p_last = last;
1783 #ifdef KMP_DEBUG
1784   if (pr->flags.ordered) {
1785     char *buff;
1786     // create format specifiers before the debug output
1787     buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1788                             "ordered_lower:%%%s ordered_upper:%%%s\n",
1789                             traits_t<UT>::spec, traits_t<UT>::spec);
1790     KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1791     __kmp_str_free(&buff);
1792   }
1793   {
1794     char *buff;
1795     // create format specifiers before the debug output
1796     buff = __kmp_str_format(
1797         "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1798         "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1799         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1800     KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1801     __kmp_str_free(&buff);
1802   }
1803 #endif
1804   return status;
1805 }
1806 
1807 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1808    work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1809    is not called. */
1810 #if OMPT_SUPPORT && OMPT_OPTIONAL
1811 #define OMPT_LOOP_END                                                          \
1812   if (status == 0) {                                                           \
1813     if (ompt_enabled.ompt_callback_work) {                                     \
1814       ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
1815       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
1816       ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
1817           ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
1818           &(task_info->task_data), 0, codeptr);                                \
1819     }                                                                          \
1820   }
1821 // TODO: implement count
1822 #else
1823 #define OMPT_LOOP_END // no-op
1824 #endif
1825 
1826 #if KMP_STATS_ENABLED
1827 #define KMP_STATS_LOOP_END                                                     \
1828   {                                                                            \
1829     kmp_int64 u, l, t, i;                                                      \
1830     l = (kmp_int64)(*p_lb);                                                    \
1831     u = (kmp_int64)(*p_ub);                                                    \
1832     i = (kmp_int64)(pr->u.p.st);                                               \
1833     if (status == 0) {                                                         \
1834       t = 0;                                                                   \
1835       KMP_POP_PARTITIONED_TIMER();                                             \
1836     } else if (i == 1) {                                                       \
1837       if (u >= l)                                                              \
1838         t = u - l + 1;                                                         \
1839       else                                                                     \
1840         t = 0;                                                                 \
1841     } else if (i < 0) {                                                        \
1842       if (l >= u)                                                              \
1843         t = (l - u) / (-i) + 1;                                                \
1844       else                                                                     \
1845         t = 0;                                                                 \
1846     } else {                                                                   \
1847       if (u >= l)                                                              \
1848         t = (u - l) / i + 1;                                                   \
1849       else                                                                     \
1850         t = 0;                                                                 \
1851     }                                                                          \
1852     KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
1853   }
1854 #else
1855 #define KMP_STATS_LOOP_END /* Nothing */
1856 #endif
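// For example (values assumed), a chunk that comes back with *p_lb = 0,
// *p_ub = 96 and a stride of 4 records t = (96 - 0) / 4 + 1 = 25 iterations,
// while a status of 0 records nothing and pops the partitioned timer.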
1857 
1858 template <typename T>
1859 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1860                                T *p_lb, T *p_ub,
1861                                typename traits_t<T>::signed_t *p_st
1862 #if OMPT_SUPPORT && OMPT_OPTIONAL
1863                                ,
1864                                void *codeptr
1865 #endif
1866                                ) {
1867 
1868   typedef typename traits_t<T>::unsigned_t UT;
1869   typedef typename traits_t<T>::signed_t ST;
1870   typedef typename traits_t<T>::floating_t DBL;
  // This is potentially slightly misleading: schedule(runtime) will appear here
  // even if the actual runtime schedule is static. (Which points out a
  // disadvantage of schedule(runtime): even when static scheduling is used it
  // costs more than a compile-time choice to use static scheduling would.)
1875   KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
1876 
1877   int status;
1878   dispatch_private_info_template<T> *pr;
1879   kmp_info_t *th = __kmp_threads[gtid];
1880   kmp_team_t *team = th->th.th_team;
1881 
1882   KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1883   KD_TRACE(
1884       1000,
1885       ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
1886        gtid, p_lb, p_ub, p_st, p_last));
1887 
1888   if (team->t.t_serialized) {
    /* NOTE: serialize this dispatch because we are not at the active level */
1890     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1891         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1892     KMP_DEBUG_ASSERT(pr);
1893 
1894     if ((status = (pr->u.p.tc != 0)) == 0) {
1895       *p_lb = 0;
1896       *p_ub = 0;
1897       //            if ( p_last != NULL )
1898       //                *p_last = 0;
1899       if (p_st != NULL)
1900         *p_st = 0;
1901       if (__kmp_env_consistency_check) {
1902         if (pr->pushed_ws != ct_none) {
1903           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1904         }
1905       }
1906     } else if (pr->flags.nomerge) {
1907       kmp_int32 last;
1908       T start;
1909       UT limit, trip, init;
1910       ST incr;
1911       T chunk = pr->u.p.parm1;
1912 
1913       KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1914                      gtid));
1915 
1916       init = chunk * pr->u.p.count++;
1917       trip = pr->u.p.tc - 1;
1918 
1919       if ((status = (init <= trip)) == 0) {
1920         *p_lb = 0;
1921         *p_ub = 0;
1922         //                if ( p_last != NULL )
1923         //                    *p_last = 0;
1924         if (p_st != NULL)
1925           *p_st = 0;
1926         if (__kmp_env_consistency_check) {
1927           if (pr->pushed_ws != ct_none) {
1928             pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1929           }
1930         }
1931       } else {
1932         start = pr->u.p.lb;
1933         limit = chunk + init - 1;
1934         incr = pr->u.p.st;
1935 
1936         if ((last = (limit >= trip)) != 0) {
1937           limit = trip;
1938 #if KMP_OS_WINDOWS
1939           pr->u.p.last_upper = pr->u.p.ub;
1940 #endif /* KMP_OS_WINDOWS */
1941         }
1942         if (p_last != NULL)
1943           *p_last = last;
1944         if (p_st != NULL)
1945           *p_st = incr;
1946         if (incr == 1) {
1947           *p_lb = start + init;
1948           *p_ub = start + limit;
1949         } else {
1950           *p_lb = start + init * incr;
1951           *p_ub = start + limit * incr;
1952         }
1953 
1954         if (pr->flags.ordered) {
1955           pr->u.p.ordered_lower = init;
1956           pr->u.p.ordered_upper = limit;
1957 #ifdef KMP_DEBUG
1958           {
1959             char *buff;
1960             // create format specifiers before the debug output
1961             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1962                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
1963                                     traits_t<UT>::spec, traits_t<UT>::spec);
1964             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1965                             pr->u.p.ordered_upper));
1966             __kmp_str_free(&buff);
1967           }
1968 #endif
1969         } // if
1970       } // if
1971     } else {
1972       pr->u.p.tc = 0;
1973       *p_lb = pr->u.p.lb;
1974       *p_ub = pr->u.p.ub;
1975 #if KMP_OS_WINDOWS
1976       pr->u.p.last_upper = *p_ub;
1977 #endif /* KMP_OS_WINDOWS */
1978       if (p_last != NULL)
1979         *p_last = TRUE;
1980       if (p_st != NULL)
1981         *p_st = pr->u.p.st;
1982     } // if
1983 #ifdef KMP_DEBUG
1984     {
1985       char *buff;
1986       // create format specifiers before the debug output
1987       buff = __kmp_str_format(
1988           "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
1989           "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
1990           traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1991       KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
1992       __kmp_str_free(&buff);
1993     }
1994 #endif
1995 #if INCLUDE_SSC_MARKS
1996     SSC_MARK_DISPATCH_NEXT();
1997 #endif
1998     OMPT_LOOP_END;
1999     KMP_STATS_LOOP_END;
2000     return status;
2001   } else {
2002     kmp_int32 last = 0;
2003     dispatch_shared_info_template<T> volatile *sh;
2004 
2005     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2006                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2007 
2008     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2009         th->th.th_dispatch->th_dispatch_pr_current);
2010     KMP_DEBUG_ASSERT(pr);
2011     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2012         th->th.th_dispatch->th_dispatch_sh_current);
2013     KMP_DEBUG_ASSERT(sh);
2014 
2015 #if KMP_USE_HIER_SCHED
2016     if (pr->flags.use_hier)
2017       status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2018     else
2019 #endif // KMP_USE_HIER_SCHED
2020       status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2021                                                 p_st, th->th.th_team_nproc,
2022                                                 th->th.th_info.ds.ds_tid);
2023     // status == 0: no more iterations to execute
2024     if (status == 0) {
2025       UT num_done;
2026 
2027       num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2028 #ifdef KMP_DEBUG
2029       {
2030         char *buff;
2031         // create format specifiers before the debug output
2032         buff = __kmp_str_format(
2033             "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2034             traits_t<UT>::spec);
2035         KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2036         __kmp_str_free(&buff);
2037       }
2038 #endif
2039 
2040 #if KMP_USE_HIER_SCHED
2041       pr->flags.use_hier = FALSE;
2042 #endif
2043       if ((ST)num_done == th->th.th_team_nproc - 1) {
2044 #if (KMP_STATIC_STEAL_ENABLED)
2045         if (pr->schedule == kmp_sch_static_steal &&
2046             traits_t<T>::type_size > 4) {
2047           int i;
2048           kmp_info_t **other_threads = team->t.t_threads;
2049           // loop complete, safe to destroy locks used for stealing
2050           for (i = 0; i < th->th.th_team_nproc; ++i) {
2051             kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2052             KMP_ASSERT(lck != NULL);
2053             __kmp_destroy_lock(lck);
2054             __kmp_free(lck);
2055             other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2056           }
2057         }
2058 #endif
2059         /* NOTE: release this buffer to be reused */
2060 
2061         KMP_MB(); /* Flush all pending memory write invalidates.  */
2062 
2063         sh->u.s.num_done = 0;
2064         sh->u.s.iteration = 0;
2065 
2066         /* TODO replace with general release procedure? */
2067         if (pr->flags.ordered) {
2068           sh->u.s.ordered_iteration = 0;
2069         }
2070 
2071         KMP_MB(); /* Flush all pending memory write invalidates.  */
2072 
2073         sh->buffer_index += __kmp_dispatch_num_buffers;
2074         KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2075                        gtid, sh->buffer_index));
2076 
2077         KMP_MB(); /* Flush all pending memory write invalidates.  */
2078 
2079       } // if
2080       if (__kmp_env_consistency_check) {
2081         if (pr->pushed_ws != ct_none) {
2082           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2083         }
2084       }
2085 
2086       th->th.th_dispatch->th_deo_fcn = NULL;
2087       th->th.th_dispatch->th_dxo_fcn = NULL;
2088       th->th.th_dispatch->th_dispatch_sh_current = NULL;
2089       th->th.th_dispatch->th_dispatch_pr_current = NULL;
2090     } // if (status == 0)
2091 #if KMP_OS_WINDOWS
2092     else if (last) {
2093       pr->u.p.last_upper = pr->u.p.ub;
2094     }
2095 #endif /* KMP_OS_WINDOWS */
2096     if (p_last != NULL && status != 0)
2097       *p_last = last;
2098   } // if
2099 
2100 #ifdef KMP_DEBUG
2101   {
2102     char *buff;
2103     // create format specifiers before the debug output
2104     buff = __kmp_str_format(
2105         "__kmp_dispatch_next: T#%%d normal case: "
2106         "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2107         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2108     KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2109                   (p_last ? *p_last : 0), status));
2110     __kmp_str_free(&buff);
2111   }
2112 #endif
2113 #if INCLUDE_SSC_MARKS
2114   SSC_MARK_DISPATCH_NEXT();
2115 #endif
2116   OMPT_LOOP_END;
2117   KMP_STATS_LOOP_END;
2118   return status;
2119 }
2120 
2121 template <typename T>
2122 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2123                                   kmp_int32 *plastiter, T *plower, T *pupper,
2124                                   typename traits_t<T>::signed_t incr) {
2125   typedef typename traits_t<T>::unsigned_t UT;
2126   typedef typename traits_t<T>::signed_t ST;
2127   kmp_uint32 team_id;
2128   kmp_uint32 nteams;
2129   UT trip_count;
2130   kmp_team_t *team;
2131   kmp_info_t *th;
2132 
2133   KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2134   KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2135 #ifdef KMP_DEBUG
2136   {
2137     char *buff;
2138     // create format specifiers before the debug output
2139     buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2140                             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2141                             traits_t<T>::spec, traits_t<T>::spec,
2142                             traits_t<ST>::spec, traits_t<T>::spec);
2143     KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2144     __kmp_str_free(&buff);
2145   }
2146 #endif
2147 
2148   if (__kmp_env_consistency_check) {
2149     if (incr == 0) {
2150       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2151                             loc);
2152     }
2153     if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2154       // The loop is illegal.
      // Some zero-trip loops are maintained by the compiler, e.g.:
2156       //   for(i=10;i<0;++i) // lower >= upper - run-time check
2157       //   for(i=0;i>10;--i) // lower <= upper - run-time check
2158       //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2159       //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2160       // Compiler does not check the following illegal loops:
2161       //   for(i=0;i<10;i+=incr) // where incr<0
2162       //   for(i=10;i>0;i-=incr) // where incr<0
2163       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2164     }
2165   }
2166   th = __kmp_threads[gtid];
2167   team = th->th.th_team;
2168 #if OMP_40_ENABLED
2169   KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2170   nteams = th->th.th_teams_size.nteams;
2171 #endif
2172   team_id = team->t.t_master_tid;
2173   KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2174 
2175   // compute global trip count
2176   if (incr == 1) {
2177     trip_count = *pupper - *plower + 1;
2178   } else if (incr == -1) {
2179     trip_count = *plower - *pupper + 1;
2180   } else if (incr > 0) {
2181     // upper-lower can exceed the limit of signed type
2182     trip_count = (UT)(*pupper - *plower) / incr + 1;
2183   } else {
2184     trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2185   }
2186 
2187   if (trip_count <= nteams) {
2188     KMP_DEBUG_ASSERT(
2189         __kmp_static == kmp_sch_static_greedy ||
2190         __kmp_static ==
2191             kmp_sch_static_balanced); // Unknown static scheduling type.
    // only some teams get a single iteration, the others get nothing
2193     if (team_id < trip_count) {
2194       *pupper = *plower = *plower + team_id * incr;
2195     } else {
2196       *plower = *pupper + incr; // zero-trip loop
2197     }
2198     if (plastiter != NULL)
2199       *plastiter = (team_id == trip_count - 1);
2200   } else {
2201     if (__kmp_static == kmp_sch_static_balanced) {
2202       UT chunk = trip_count / nteams;
2203       UT extras = trip_count % nteams;
2204       *plower +=
2205           incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2206       *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2207       if (plastiter != NULL)
2208         *plastiter = (team_id == nteams - 1);
2209     } else {
2210       T chunk_inc_count =
2211           (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2212       T upper = *pupper;
2213       KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2214       // Unknown static scheduling type.
2215       *plower += team_id * chunk_inc_count;
2216       *pupper = *plower + chunk_inc_count - incr;
2217       // Check/correct bounds if needed
2218       if (incr > 0) {
2219         if (*pupper < *plower)
2220           *pupper = traits_t<T>::max_value;
2221         if (plastiter != NULL)
2222           *plastiter = *plower <= upper && *pupper > upper - incr;
2223         if (*pupper > upper)
2224           *pupper = upper; // tracker C73258
2225       } else {
2226         if (*pupper > *plower)
2227           *pupper = traits_t<T>::min_value;
2228         if (plastiter != NULL)
2229           *plastiter = *plower >= upper && *pupper < upper - incr;
2230         if (*pupper < upper)
2231           *pupper = upper; // tracker C73258
2232       }
2233     }
2234   }
2235 }
2236 
2237 //-----------------------------------------------------------------------------
2238 // Dispatch routines
2239 //    Transfer call to template< type T >
2240 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2241 //                         T lb, T ub, ST st, ST chunk )
2242 extern "C" {
2243 
2244 /*!
2245 @ingroup WORK_SHARING
2246 @{
2247 @param loc Source location
2248 @param gtid Global thread id
2249 @param schedule Schedule type
2250 @param lb  Lower bound
2251 @param ub  Upper bound
2252 @param st  Step (or increment if you prefer)
2253 @param chunk The chunk size to block with
2254 
2255 This function prepares the runtime to start a dynamically scheduled for loop,
2256 saving the loop arguments.
2257 These functions are all identical apart from the types of the arguments.
2258 */
2259 
2260 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2261                             enum sched_type schedule, kmp_int32 lb,
2262                             kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2263   KMP_DEBUG_ASSERT(__kmp_init_serial);
2264 #if OMPT_SUPPORT && OMPT_OPTIONAL
2265   OMPT_STORE_RETURN_ADDRESS(gtid);
2266 #endif
2267   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2268 }
2269 /*!
2270 See @ref __kmpc_dispatch_init_4
2271 */
2272 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2273                              enum sched_type schedule, kmp_uint32 lb,
2274                              kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2275   KMP_DEBUG_ASSERT(__kmp_init_serial);
2276 #if OMPT_SUPPORT && OMPT_OPTIONAL
2277   OMPT_STORE_RETURN_ADDRESS(gtid);
2278 #endif
2279   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2280 }
2281 
2282 /*!
2283 See @ref __kmpc_dispatch_init_4
2284 */
2285 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2286                             enum sched_type schedule, kmp_int64 lb,
2287                             kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2288   KMP_DEBUG_ASSERT(__kmp_init_serial);
2289 #if OMPT_SUPPORT && OMPT_OPTIONAL
2290   OMPT_STORE_RETURN_ADDRESS(gtid);
2291 #endif
2292   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2293 }
2294 
2295 /*!
2296 See @ref __kmpc_dispatch_init_4
2297 */
2298 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2299                              enum sched_type schedule, kmp_uint64 lb,
2300                              kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2301   KMP_DEBUG_ASSERT(__kmp_init_serial);
2302 #if OMPT_SUPPORT && OMPT_OPTIONAL
2303   OMPT_STORE_RETURN_ADDRESS(gtid);
2304 #endif
2305   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2306 }
2307 
2308 /*!
2309 See @ref __kmpc_dispatch_init_4
2310 
These functions differ from the __kmpc_dispatch_init set in that they are
called for the composite distribute parallel for construct. Thus, before the
regular iterations are dispatched, the per-team iteration space must be
computed.
2314 
2315 These functions are all identical apart from the types of the arguments.
2316 */
2317 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2318                                  enum sched_type schedule, kmp_int32 *p_last,
2319                                  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2320                                  kmp_int32 chunk) {
2321   KMP_DEBUG_ASSERT(__kmp_init_serial);
2322 #if OMPT_SUPPORT && OMPT_OPTIONAL
2323   OMPT_STORE_RETURN_ADDRESS(gtid);
2324 #endif
2325   __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2326   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2327 }
2328 
2329 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2330                                   enum sched_type schedule, kmp_int32 *p_last,
2331                                   kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2332                                   kmp_int32 chunk) {
2333   KMP_DEBUG_ASSERT(__kmp_init_serial);
2334 #if OMPT_SUPPORT && OMPT_OPTIONAL
2335   OMPT_STORE_RETURN_ADDRESS(gtid);
2336 #endif
2337   __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2338   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2339 }
2340 
2341 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2342                                  enum sched_type schedule, kmp_int32 *p_last,
2343                                  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2344                                  kmp_int64 chunk) {
2345   KMP_DEBUG_ASSERT(__kmp_init_serial);
2346 #if OMPT_SUPPORT && OMPT_OPTIONAL
2347   OMPT_STORE_RETURN_ADDRESS(gtid);
2348 #endif
2349   __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2350   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2351 }
2352 
2353 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2354                                   enum sched_type schedule, kmp_int32 *p_last,
2355                                   kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2356                                   kmp_int64 chunk) {
2357   KMP_DEBUG_ASSERT(__kmp_init_serial);
2358 #if OMPT_SUPPORT && OMPT_OPTIONAL
2359   OMPT_STORE_RETURN_ADDRESS(gtid);
2360 #endif
2361   __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2362   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2363 }
2364 
2365 /*!
2366 @param loc Source code location
2367 @param gtid Global thread id
2368 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2369 otherwise
2370 @param p_lb   Pointer to the lower bound for the next chunk of work
2371 @param p_ub   Pointer to the upper bound for the next chunk of work
2372 @param p_st   Pointer to the stride for the next chunk of work
2373 @return one if there is work to be done, zero otherwise
2374 
2375 Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then lb, ub and stride need not be modified.
2377 */
2378 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2379                            kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2380 #if OMPT_SUPPORT && OMPT_OPTIONAL
2381   OMPT_STORE_RETURN_ADDRESS(gtid);
2382 #endif
2383   return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2384 #if OMPT_SUPPORT && OMPT_OPTIONAL
2385                                         ,
2386                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2387 #endif
2388                                             );
2389 }
2390 
2391 /*!
2392 See @ref __kmpc_dispatch_next_4
2393 */
2394 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2395                             kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2396                             kmp_int32 *p_st) {
2397 #if OMPT_SUPPORT && OMPT_OPTIONAL
2398   OMPT_STORE_RETURN_ADDRESS(gtid);
2399 #endif
2400   return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2401 #if OMPT_SUPPORT && OMPT_OPTIONAL
2402                                          ,
2403                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2404 #endif
2405                                              );
2406 }
2407 
2408 /*!
2409 See @ref __kmpc_dispatch_next_4
2410 */
2411 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2412                            kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2413 #if OMPT_SUPPORT && OMPT_OPTIONAL
2414   OMPT_STORE_RETURN_ADDRESS(gtid);
2415 #endif
2416   return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2417 #if OMPT_SUPPORT && OMPT_OPTIONAL
2418                                         ,
2419                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2420 #endif
2421                                             );
2422 }
2423 
2424 /*!
2425 See @ref __kmpc_dispatch_next_4
2426 */
2427 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2428                             kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2429                             kmp_int64 *p_st) {
2430 #if OMPT_SUPPORT && OMPT_OPTIONAL
2431   OMPT_STORE_RETURN_ADDRESS(gtid);
2432 #endif
2433   return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2434 #if OMPT_SUPPORT && OMPT_OPTIONAL
2435                                          ,
2436                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2437 #endif
2438                                              );
2439 }
2440 
2441 /*!
2442 @param loc Source code location
2443 @param gtid Global thread id
2444 
2445 Mark the end of a dynamic loop.
2446 */
2447 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2448   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2449 }
2450 
2451 /*!
2452 See @ref __kmpc_dispatch_fini_4
2453 */
2454 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2455   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2456 }
2457 
2458 /*!
2459 See @ref __kmpc_dispatch_fini_4
2460 */
2461 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2462   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2463 }
2464 
2465 /*!
2466 See @ref __kmpc_dispatch_fini_4
2467 */
2468 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2469   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2470 }
2471 /*! @} */
2472 
2473 //-----------------------------------------------------------------------------
2474 // Non-template routines from kmp_dispatch.cpp used in other sources
2475 
2476 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2477   return value == checker;
2478 }
2479 
2480 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2481   return value != checker;
2482 }
2483 
2484 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2485   return value < checker;
2486 }
2487 
2488 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2489   return value >= checker;
2490 }
2491 
2492 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2493   return value <= checker;
2494 }
2495 
2496 kmp_uint32
2497 __kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2498                    kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2499                    void *obj // Higher-level synchronization object, or NULL.
2500                    ) {
2501   // note: we may not belong to a team at this point
2502   volatile kmp_uint32 *spin = spinner;
2503   kmp_uint32 check = checker;
2504   kmp_uint32 spins;
2505   kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2506   kmp_uint32 r;
2507 
2508   KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2509   KMP_INIT_YIELD(spins);
2510   // main wait spin loop
2511   while (!f(r = TCR_4(*spin), check)) {
2512     KMP_FSYNC_SPIN_PREPARE(obj);
2513     /* GEH - remove this since it was accidentally introduced when kmp_wait was
2514        split. It causes problems with infinite recursion because of exit lock */
2515     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2516         __kmp_abort_thread(); */
2517 
2518     /* if we have waited a bit, or are oversubscribed, yield */
2519     /* pause is in the following code */
2520     KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2521     KMP_YIELD_SPIN(spins);
2522   }
2523   KMP_FSYNC_SPIN_ACQUIRED(obj);
2524   return r;
2525 }
2526 
2527 void __kmp_wait_yield_4_ptr(
2528     void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32),
2529     void *obj // Higher-level synchronization object, or NULL.
2530     ) {
2531   // note: we may not belong to a team at this point
2532   void *spin = spinner;
2533   kmp_uint32 check = checker;
2534   kmp_uint32 spins;
2535   kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2536 
2537   KMP_FSYNC_SPIN_INIT(obj, spin);
2538   KMP_INIT_YIELD(spins);
2539   // main wait spin loop
2540   while (!f(spin, check)) {
2541     KMP_FSYNC_SPIN_PREPARE(obj);
2542     /* if we have waited a bit, or are oversubscribed, yield */
2543     /* pause is in the following code */
2544     KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2545     KMP_YIELD_SPIN(spins);
2546   }
2547   KMP_FSYNC_SPIN_ACQUIRED(obj);
2548 }
2549 
2550 } // extern "C"
2551 
2552 #ifdef KMP_GOMP_COMPAT
2553 
2554 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2555                                enum sched_type schedule, kmp_int32 lb,
2556                                kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2557                                int push_ws) {
2558   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2559                                  push_ws);
2560 }
2561 
2562 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2563                                 enum sched_type schedule, kmp_uint32 lb,
2564                                 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2565                                 int push_ws) {
2566   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2567                                   push_ws);
2568 }
2569 
2570 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2571                                enum sched_type schedule, kmp_int64 lb,
2572                                kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2573                                int push_ws) {
2574   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2575                                  push_ws);
2576 }
2577 
2578 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2579                                 enum sched_type schedule, kmp_uint64 lb,
2580                                 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2581                                 int push_ws) {
2582   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2583                                   push_ws);
2584 }
2585 
2586 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2587   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2588 }
2589 
2590 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2591   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2592 }
2593 
2594 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2595   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2596 }
2597 
2598 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2599   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2600 }
2601 
2602 #endif /* KMP_GOMP_COMPAT */
2603 
2604 /* ------------------------------------------------------------------------ */
2605