15e8470afSJim Cownie /*
25e8470afSJim Cownie  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
35e8470afSJim Cownie  */
45e8470afSJim Cownie 
55e8470afSJim Cownie //===----------------------------------------------------------------------===//
65e8470afSJim Cownie //
757b08b09SChandler Carruth // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
857b08b09SChandler Carruth // See https://llvm.org/LICENSE.txt for license information.
957b08b09SChandler Carruth // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
105e8470afSJim Cownie //
115e8470afSJim Cownie //===----------------------------------------------------------------------===//
125e8470afSJim Cownie 
133041982dSJonathan Peyton /* Dynamic scheduling initialization and dispatch.
145e8470afSJim Cownie  *
155e8470afSJim Cownie  * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
165e8470afSJim Cownie  *       it may change values between parallel regions.  __kmp_max_nth
175e8470afSJim Cownie  *       is the largest value __kmp_nth may take, 1 is the smallest.
185e8470afSJim Cownie  */
195e8470afSJim Cownie 
205e8470afSJim Cownie #include "kmp.h"
213041982dSJonathan Peyton #include "kmp_error.h"
225e8470afSJim Cownie #include "kmp_i18n.h"
235e8470afSJim Cownie #include "kmp_itt.h"
244cc4bb4cSJim Cownie #include "kmp_stats.h"
253041982dSJonathan Peyton #include "kmp_str.h"
26f700e9edSAndrey Churbanov #if KMP_USE_X87CONTROL
275e8470afSJim Cownie #include <float.h>
285e8470afSJim Cownie #endif
2939ada854SJonathan Peyton #include "kmp_lock.h"
3039ada854SJonathan Peyton #include "kmp_dispatch.h"
31f6399367SJonathan Peyton #if KMP_USE_HIER_SCHED
32f6399367SJonathan Peyton #include "kmp_dispatch_hier.h"
33f6399367SJonathan Peyton #endif
345e8470afSJim Cownie 
35d7d088f8SAndrey Churbanov #if OMPT_SUPPORT
36d7d088f8SAndrey Churbanov #include "ompt-specific.h"
37d7d088f8SAndrey Churbanov #endif
38d7d088f8SAndrey Churbanov 
395e8470afSJim Cownie /* ------------------------------------------------------------------------ */
405e8470afSJim Cownie /* ------------------------------------------------------------------------ */
415e8470afSJim Cownie 
4239ada854SJonathan Peyton void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
435e8470afSJim Cownie   kmp_info_t *th;
445e8470afSJim Cownie 
455e8470afSJim Cownie   KMP_DEBUG_ASSERT(gtid_ref);
465e8470afSJim Cownie 
475e8470afSJim Cownie   if (__kmp_env_consistency_check) {
485e8470afSJim Cownie     th = __kmp_threads[*gtid_ref];
493041982dSJonathan Peyton     if (th->th.th_root->r.r_active &&
503041982dSJonathan Peyton         (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
515c56fb55SAndrey Churbanov #if KMP_USE_DYNAMIC_LOCK
525c56fb55SAndrey Churbanov       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
535c56fb55SAndrey Churbanov #else
545e8470afSJim Cownie       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
555c56fb55SAndrey Churbanov #endif
565e8470afSJim Cownie     }
575e8470afSJim Cownie   }
585e8470afSJim Cownie }
595e8470afSJim Cownie 
6039ada854SJonathan Peyton void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
615e8470afSJim Cownie   kmp_info_t *th;
625e8470afSJim Cownie 
635e8470afSJim Cownie   if (__kmp_env_consistency_check) {
645e8470afSJim Cownie     th = __kmp_threads[*gtid_ref];
655e8470afSJim Cownie     if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
665e8470afSJim Cownie       __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
675e8470afSJim Cownie     }
685e8470afSJim Cownie   }
695e8470afSJim Cownie }
705e8470afSJim Cownie 
7171abe28eSJonathan Peyton // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
725e348774SPeyton, Jonathan L static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
7371abe28eSJonathan Peyton                                          bool use_hier = false) {
7471abe28eSJonathan Peyton   // Pick up the nonmonotonic/monotonic bits from the scheduling type
755e348774SPeyton, Jonathan L   // TODO: make nonmonotonic when static_steal is fixed
765e348774SPeyton, Jonathan L   int monotonicity = SCHEDULE_MONOTONIC;
775e348774SPeyton, Jonathan L 
785e348774SPeyton, Jonathan L   // Let default be monotonic for executables
795e348774SPeyton, Jonathan L   // compiled with OpenMP* 4.5 or less compilers
805e348774SPeyton, Jonathan L   if (loc->get_openmp_version() < 50)
8171abe28eSJonathan Peyton     monotonicity = SCHEDULE_MONOTONIC;
825e348774SPeyton, Jonathan L 
8367773681SJonathan Peyton   if (use_hier || __kmp_force_monotonic)
845e348774SPeyton, Jonathan L     monotonicity = SCHEDULE_MONOTONIC;
855e348774SPeyton, Jonathan L   else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
8671abe28eSJonathan Peyton     monotonicity = SCHEDULE_NONMONOTONIC;
8771abe28eSJonathan Peyton   else if (SCHEDULE_HAS_MONOTONIC(schedule))
8871abe28eSJonathan Peyton     monotonicity = SCHEDULE_MONOTONIC;
895e348774SPeyton, Jonathan L 
9071abe28eSJonathan Peyton   return monotonicity;
9171abe28eSJonathan Peyton }
9271abe28eSJonathan Peyton 
9339ada854SJonathan Peyton // Initialize a dispatch_private_info_template<T> buffer for a particular
9439ada854SJonathan Peyton // type of schedule,chunk.  The loop description is found in lb (lower bound),
9539ada854SJonathan Peyton // ub (upper bound), and st (stride).  nproc is the number of threads relevant
9639ada854SJonathan Peyton // to the scheduling (often the number of threads in a team, but not always if
9739ada854SJonathan Peyton // hierarchical scheduling is used).  tid is the id of the thread calling
9839ada854SJonathan Peyton // the function within the group of nproc threads.  It will have a value
9939ada854SJonathan Peyton // between 0 and nproc - 1.  This is often just the thread id within a team, but
10039ada854SJonathan Peyton // is not necessarily the case when using hierarchical scheduling.
10139ada854SJonathan Peyton // loc is the source file location of the corresponding loop
10239ada854SJonathan Peyton // gtid is the global thread id
1035e8470afSJim Cownie template <typename T>
10439ada854SJonathan Peyton void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
10539ada854SJonathan Peyton                                    dispatch_private_info_template<T> *pr,
10639ada854SJonathan Peyton                                    enum sched_type schedule, T lb, T ub,
10739ada854SJonathan Peyton                                    typename traits_t<T>::signed_t st,
10839ada854SJonathan Peyton #if USE_ITT_BUILD
10939ada854SJonathan Peyton                                    kmp_uint64 *cur_chunk,
11039ada854SJonathan Peyton #endif
11139ada854SJonathan Peyton                                    typename traits_t<T>::signed_t chunk,
11239ada854SJonathan Peyton                                    T nproc, T tid) {
1135e8470afSJim Cownie   typedef typename traits_t<T>::unsigned_t UT;
1145e8470afSJim Cownie   typedef typename traits_t<T>::floating_t DBL;
1155e8470afSJim Cownie 
1165e8470afSJim Cownie   int active;
1175e8470afSJim Cownie   T tc;
1185e8470afSJim Cownie   kmp_info_t *th;
1195e8470afSJim Cownie   kmp_team_t *team;
12071abe28eSJonathan Peyton   int monotonicity;
12171abe28eSJonathan Peyton   bool use_hier;
1225e8470afSJim Cownie 
1235e8470afSJim Cownie #ifdef KMP_DEBUG
124baad3f60SJonathan Peyton   typedef typename traits_t<T>::signed_t ST;
1255e8470afSJim Cownie   {
126aeb40adaSJonas Hahnfeld     char *buff;
1275e8470afSJim Cownie     // create format specifiers before the debug output
12839ada854SJonathan Peyton     buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
12939ada854SJonathan Peyton                             "pr:%%p lb:%%%s ub:%%%s st:%%%s "
13039ada854SJonathan Peyton                             "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
13139ada854SJonathan Peyton                             traits_t<T>::spec, traits_t<T>::spec,
13239ada854SJonathan Peyton                             traits_t<ST>::spec, traits_t<ST>::spec,
13339ada854SJonathan Peyton                             traits_t<T>::spec, traits_t<T>::spec);
13439ada854SJonathan Peyton     KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
1355e8470afSJim Cownie     __kmp_str_free(&buff);
1365e8470afSJim Cownie   }
1375e8470afSJim Cownie #endif
1385e8470afSJim Cownie   /* setup data */
1395e8470afSJim Cownie   th = __kmp_threads[gtid];
1405e8470afSJim Cownie   team = th->th.th_team;
1415e8470afSJim Cownie   active = !team->t.t_serialized;
1425e8470afSJim Cownie 
1434cc4bb4cSJim Cownie #if USE_ITT_BUILD
144e4b4f994SJonathan Peyton   int itt_need_metadata_reporting =
145e4b4f994SJonathan Peyton       __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
146e4b4f994SJonathan Peyton       KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
14751aecb82SAndrey Churbanov       team->t.t_active_level == 1;
1484cc4bb4cSJim Cownie #endif
14971abe28eSJonathan Peyton 
15071abe28eSJonathan Peyton #if KMP_USE_HIER_SCHED
15171abe28eSJonathan Peyton   use_hier = pr->flags.use_hier;
15271abe28eSJonathan Peyton #else
15371abe28eSJonathan Peyton   use_hier = false;
154429dbc2aSAndrey Churbanov #endif
15571abe28eSJonathan Peyton 
15671abe28eSJonathan Peyton   /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
1575e348774SPeyton, Jonathan L   monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
158ea0fe1dfSJonathan Peyton   schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
159ea0fe1dfSJonathan Peyton 
1605e8470afSJim Cownie   /* Pick up the nomerge/ordered bits from the scheduling type */
1615e8470afSJim Cownie   if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
16239ada854SJonathan Peyton     pr->flags.nomerge = TRUE;
1633041982dSJonathan Peyton     schedule =
1643041982dSJonathan Peyton         (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
1655e8470afSJim Cownie   } else {
16639ada854SJonathan Peyton     pr->flags.nomerge = FALSE;
1675e8470afSJim Cownie   }
16812313d44SJonathan Peyton   pr->type_size = traits_t<T>::type_size; // remember the size of variables
1695e8470afSJim Cownie   if (kmp_ord_lower & schedule) {
17039ada854SJonathan Peyton     pr->flags.ordered = TRUE;
1713041982dSJonathan Peyton     schedule =
1723041982dSJonathan Peyton         (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
1735e8470afSJim Cownie   } else {
17439ada854SJonathan Peyton     pr->flags.ordered = FALSE;
1755e8470afSJim Cownie   }
17671abe28eSJonathan Peyton   // Ordered overrides nonmonotonic
17771abe28eSJonathan Peyton   if (pr->flags.ordered) {
17871abe28eSJonathan Peyton     monotonicity = SCHEDULE_MONOTONIC;
17971abe28eSJonathan Peyton   }
18045be4500SJonathan Peyton 
1815e8470afSJim Cownie   if (schedule == kmp_sch_static) {
1825e8470afSJim Cownie     schedule = __kmp_static;
1835e8470afSJim Cownie   } else {
1845e8470afSJim Cownie     if (schedule == kmp_sch_runtime) {
1853041982dSJonathan Peyton       // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
1863041982dSJonathan Peyton       // not specified)
1875e8470afSJim Cownie       schedule = team->t.t_sched.r_sched_type;
1885e348774SPeyton, Jonathan L       monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
18971abe28eSJonathan Peyton       schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
1903041982dSJonathan Peyton       // Detail the schedule if needed (global controls are differentiated
1913041982dSJonathan Peyton       // appropriately)
1925e8470afSJim Cownie       if (schedule == kmp_sch_guided_chunked) {
1935e8470afSJim Cownie         schedule = __kmp_guided;
1945e8470afSJim Cownie       } else if (schedule == kmp_sch_static) {
1955e8470afSJim Cownie         schedule = __kmp_static;
1965e8470afSJim Cownie       }
1973041982dSJonathan Peyton       // Use the chunk size specified by OMP_SCHEDULE (or default if not
1983041982dSJonathan Peyton       // specified)
1995e8470afSJim Cownie       chunk = team->t.t_sched.chunk;
20000afbd01SJonathan Peyton #if USE_ITT_BUILD
20139ada854SJonathan Peyton       if (cur_chunk)
20239ada854SJonathan Peyton         *cur_chunk = chunk;
20300afbd01SJonathan Peyton #endif
2045e8470afSJim Cownie #ifdef KMP_DEBUG
2055e8470afSJim Cownie       {
206aeb40adaSJonas Hahnfeld         char *buff;
2075e8470afSJim Cownie         // create format specifiers before the debug output
20839ada854SJonathan Peyton         buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
20939ada854SJonathan Peyton                                 "schedule:%%d chunk:%%%s\n",
2105e8470afSJim Cownie                                 traits_t<ST>::spec);
2115e8470afSJim Cownie         KD_TRACE(10, (buff, gtid, schedule, chunk));
2125e8470afSJim Cownie         __kmp_str_free(&buff);
2135e8470afSJim Cownie       }
2145e8470afSJim Cownie #endif
2155e8470afSJim Cownie     } else {
2165e8470afSJim Cownie       if (schedule == kmp_sch_guided_chunked) {
2175e8470afSJim Cownie         schedule = __kmp_guided;
2185e8470afSJim Cownie       }
2195e8470afSJim Cownie       if (chunk <= 0) {
2205e8470afSJim Cownie         chunk = KMP_DEFAULT_CHUNK;
2215e8470afSJim Cownie       }
2225e8470afSJim Cownie     }
2235e8470afSJim Cownie 
2245e8470afSJim Cownie     if (schedule == kmp_sch_auto) {
2255e8470afSJim Cownie       // mapping and differentiation: in the __kmp_do_serial_initialize()
2265e8470afSJim Cownie       schedule = __kmp_auto;
2275e8470afSJim Cownie #ifdef KMP_DEBUG
2285e8470afSJim Cownie       {
229aeb40adaSJonas Hahnfeld         char *buff;
2305e8470afSJim Cownie         // create format specifiers before the debug output
23139ada854SJonathan Peyton         buff = __kmp_str_format(
23239ada854SJonathan Peyton             "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
2333041982dSJonathan Peyton             "schedule:%%d chunk:%%%s\n",
2345e8470afSJim Cownie             traits_t<ST>::spec);
2355e8470afSJim Cownie         KD_TRACE(10, (buff, gtid, schedule, chunk));
2365e8470afSJim Cownie         __kmp_str_free(&buff);
2375e8470afSJim Cownie       }
2385e8470afSJim Cownie #endif
2395e8470afSJim Cownie     }
24071abe28eSJonathan Peyton #if KMP_STATIC_STEAL_ENABLED
24171abe28eSJonathan Peyton     // map nonmonotonic:dynamic to static steal
24271abe28eSJonathan Peyton     if (schedule == kmp_sch_dynamic_chunked) {
24371abe28eSJonathan Peyton       if (monotonicity == SCHEDULE_NONMONOTONIC)
24471abe28eSJonathan Peyton         schedule = kmp_sch_static_steal;
24571abe28eSJonathan Peyton     }
24671abe28eSJonathan Peyton #endif
2475e8470afSJim Cownie     /* guided analytical not safe for too many threads */
24839ada854SJonathan Peyton     if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
2495e8470afSJim Cownie       schedule = kmp_sch_guided_iterative_chunked;
2505e8470afSJim Cownie       KMP_WARNING(DispatchManyThreads);
2515e8470afSJim Cownie     }
252d454c73cSAndrey Churbanov     if (schedule == kmp_sch_runtime_simd) {
253d454c73cSAndrey Churbanov       // compiler provides simd_width in the chunk parameter
254d454c73cSAndrey Churbanov       schedule = team->t.t_sched.r_sched_type;
2555e348774SPeyton, Jonathan L       monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
25671abe28eSJonathan Peyton       schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
257d454c73cSAndrey Churbanov       // Detail the schedule if needed (global controls are differentiated
258d454c73cSAndrey Churbanov       // appropriately)
259d454c73cSAndrey Churbanov       if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
260d454c73cSAndrey Churbanov           schedule == __kmp_static) {
261d454c73cSAndrey Churbanov         schedule = kmp_sch_static_balanced_chunked;
262d454c73cSAndrey Churbanov       } else {
263d454c73cSAndrey Churbanov         if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
264d454c73cSAndrey Churbanov           schedule = kmp_sch_guided_simd;
265d454c73cSAndrey Churbanov         }
266d454c73cSAndrey Churbanov         chunk = team->t.t_sched.chunk * chunk;
267d454c73cSAndrey Churbanov       }
268d454c73cSAndrey Churbanov #if USE_ITT_BUILD
26939ada854SJonathan Peyton       if (cur_chunk)
27039ada854SJonathan Peyton         *cur_chunk = chunk;
271d454c73cSAndrey Churbanov #endif
272d454c73cSAndrey Churbanov #ifdef KMP_DEBUG
273d454c73cSAndrey Churbanov       {
274aeb40adaSJonas Hahnfeld         char *buff;
275d454c73cSAndrey Churbanov         // create format specifiers before the debug output
27671abe28eSJonathan Peyton         buff = __kmp_str_format(
27771abe28eSJonathan Peyton             "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
278d454c73cSAndrey Churbanov             " chunk:%%%s\n",
279d454c73cSAndrey Churbanov             traits_t<ST>::spec);
280d454c73cSAndrey Churbanov         KD_TRACE(10, (buff, gtid, schedule, chunk));
281d454c73cSAndrey Churbanov         __kmp_str_free(&buff);
282d454c73cSAndrey Churbanov       }
283d454c73cSAndrey Churbanov #endif
284d454c73cSAndrey Churbanov     }
2855e8470afSJim Cownie     pr->u.p.parm1 = chunk;
2865e8470afSJim Cownie   }
2875e8470afSJim Cownie   KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
2885e8470afSJim Cownie               "unknown scheduling type");
2895e8470afSJim Cownie 
2905e8470afSJim Cownie   pr->u.p.count = 0;
2915e8470afSJim Cownie 
2925e8470afSJim Cownie   if (__kmp_env_consistency_check) {
2935e8470afSJim Cownie     if (st == 0) {
2943041982dSJonathan Peyton       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
29539ada854SJonathan Peyton                             (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
2965e8470afSJim Cownie     }
2975e8470afSJim Cownie   }
2985235a1b6SJonathan Peyton   // compute trip count
2995235a1b6SJonathan Peyton   if (st == 1) { // most common case
3005235a1b6SJonathan Peyton     if (ub >= lb) {
3015235a1b6SJonathan Peyton       tc = ub - lb + 1;
3025235a1b6SJonathan Peyton     } else { // ub < lb
3035e8470afSJim Cownie       tc = 0; // zero-trip
3045235a1b6SJonathan Peyton     }
3055235a1b6SJonathan Peyton   } else if (st < 0) {
3065235a1b6SJonathan Peyton     if (lb >= ub) {
3075235a1b6SJonathan Peyton       // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
3085235a1b6SJonathan Peyton       // where the division needs to be unsigned regardless of the result type
3095235a1b6SJonathan Peyton       tc = (UT)(lb - ub) / (-st) + 1;
3105235a1b6SJonathan Peyton     } else { // lb < ub
3115235a1b6SJonathan Peyton       tc = 0; // zero-trip
3125e8470afSJim Cownie     }
3135e8470afSJim Cownie   } else { // st > 0
3145235a1b6SJonathan Peyton     if (ub >= lb) {
3155235a1b6SJonathan Peyton       // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
3165235a1b6SJonathan Peyton       // where the division needs to be unsigned regardless of the result type
3175235a1b6SJonathan Peyton       tc = (UT)(ub - lb) / st + 1;
3185235a1b6SJonathan Peyton     } else { // ub < lb
3195e8470afSJim Cownie       tc = 0; // zero-trip
3205e8470afSJim Cownie     }
3215e8470afSJim Cownie   }
3225e8470afSJim Cownie 
323d2b53cadSJonathan Peyton #if KMP_STATS_ENABLED
324d2b53cadSJonathan Peyton   if (KMP_MASTER_GTID(gtid)) {
325d2b53cadSJonathan Peyton     KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
326d2b53cadSJonathan Peyton   }
327d2b53cadSJonathan Peyton #endif
328d2b53cadSJonathan Peyton 
3295e8470afSJim Cownie   pr->u.p.lb = lb;
3305e8470afSJim Cownie   pr->u.p.ub = ub;
3315e8470afSJim Cownie   pr->u.p.st = st;
3325e8470afSJim Cownie   pr->u.p.tc = tc;
3335e8470afSJim Cownie 
3345e8470afSJim Cownie #if KMP_OS_WINDOWS
3355e8470afSJim Cownie   pr->u.p.last_upper = ub + st;
3365e8470afSJim Cownie #endif /* KMP_OS_WINDOWS */
3375e8470afSJim Cownie 
3385e8470afSJim Cownie   /* NOTE: only the active parallel region(s) has active ordered sections */
3395e8470afSJim Cownie 
3405e8470afSJim Cownie   if (active) {
34139ada854SJonathan Peyton     if (pr->flags.ordered) {
3425e8470afSJim Cownie       pr->ordered_bumped = 0;
3435e8470afSJim Cownie       pr->u.p.ordered_lower = 1;
3445e8470afSJim Cownie       pr->u.p.ordered_upper = 0;
3455e8470afSJim Cownie     }
3465e8470afSJim Cownie   }
3475e8470afSJim Cownie 
3485e8470afSJim Cownie   switch (schedule) {
349429dbc2aSAndrey Churbanov #if (KMP_STATIC_STEAL_ENABLED)
3503041982dSJonathan Peyton   case kmp_sch_static_steal: {
3515e8470afSJim Cownie     T ntc, init;
3525e8470afSJim Cownie 
3533041982dSJonathan Peyton     KD_TRACE(100,
35439ada854SJonathan Peyton              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
35539ada854SJonathan Peyton               gtid));
3565e8470afSJim Cownie 
3575e8470afSJim Cownie     ntc = (tc % chunk ? 1 : 0) + tc / chunk;
3585e8470afSJim Cownie     if (nproc > 1 && ntc >= nproc) {
359f0682ac4SJonathan Peyton       KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
36039ada854SJonathan Peyton       T id = tid;
3615e8470afSJim Cownie       T small_chunk, extras;
3625e8470afSJim Cownie 
3635e8470afSJim Cownie       small_chunk = ntc / nproc;
3645e8470afSJim Cownie       extras = ntc % nproc;
3655e8470afSJim Cownie 
3665e8470afSJim Cownie       init = id * small_chunk + (id < extras ? id : extras);
3675e8470afSJim Cownie       pr->u.p.count = init;
3685e8470afSJim Cownie       pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
3695e8470afSJim Cownie 
3705e8470afSJim Cownie       pr->u.p.parm2 = lb;
37171abe28eSJonathan Peyton       // parm3 is the number of times to attempt stealing which is
37271abe28eSJonathan Peyton       // proportional to the number of chunks per thread up until
37371abe28eSJonathan Peyton       // the maximum value of nproc.
37471abe28eSJonathan Peyton       pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc);
375429dbc2aSAndrey Churbanov       pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
3765e8470afSJim Cownie       pr->u.p.st = st;
37712313d44SJonathan Peyton       if (traits_t<T>::type_size > 4) {
378429dbc2aSAndrey Churbanov         // AC: TODO: check if 16-byte CAS available and use it to
379429dbc2aSAndrey Churbanov         // improve performance (probably wait for explicit request
380429dbc2aSAndrey Churbanov         // before spending time on this).
381429dbc2aSAndrey Churbanov         // For now use dynamically allocated per-thread lock,
382429dbc2aSAndrey Churbanov         // free memory in __kmp_dispatch_next when status==0.
383abe64360SAndreyChurbanov         KMP_DEBUG_ASSERT(pr->u.p.th_steal_lock == NULL);
384abe64360SAndreyChurbanov         pr->u.p.th_steal_lock =
385429dbc2aSAndrey Churbanov             (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
386abe64360SAndreyChurbanov         __kmp_init_lock(pr->u.p.th_steal_lock);
387429dbc2aSAndrey Churbanov       }
3885e8470afSJim Cownie       break;
3895e8470afSJim Cownie     } else {
390bd2fb41cSAndreyChurbanov       /* too few chunks: switching to kmp_sch_dynamic_chunked */
391bd2fb41cSAndreyChurbanov       schedule = kmp_sch_dynamic_chunked;
392bd2fb41cSAndreyChurbanov       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
393bd2fb41cSAndreyChurbanov                      "kmp_sch_dynamic_chunked\n",
3945e8470afSJim Cownie                      gtid));
395bd2fb41cSAndreyChurbanov       if (pr->u.p.parm1 <= 0)
396bd2fb41cSAndreyChurbanov         pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
397bd2fb41cSAndreyChurbanov       break;
3985e8470afSJim Cownie     } // if
3995e8470afSJim Cownie   } // case
4005e8470afSJim Cownie #endif
4013041982dSJonathan Peyton   case kmp_sch_static_balanced: {
4025e8470afSJim Cownie     T init, limit;
4035e8470afSJim Cownie 
40439ada854SJonathan Peyton     KD_TRACE(
40539ada854SJonathan Peyton         100,
40639ada854SJonathan Peyton         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
4075e8470afSJim Cownie          gtid));
4085e8470afSJim Cownie 
4095e8470afSJim Cownie     if (nproc > 1) {
41039ada854SJonathan Peyton       T id = tid;
4115e8470afSJim Cownie 
4125e8470afSJim Cownie       if (tc < nproc) {
4135e8470afSJim Cownie         if (id < tc) {
4145e8470afSJim Cownie           init = id;
4155e8470afSJim Cownie           limit = id;
4165e8470afSJim Cownie           pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
4175e8470afSJim Cownie         } else {
4185e8470afSJim Cownie           pr->u.p.count = 1; /* means no more chunks to execute */
4195e8470afSJim Cownie           pr->u.p.parm1 = FALSE;
4205e8470afSJim Cownie           break;
4215e8470afSJim Cownie         }
4225e8470afSJim Cownie       } else {
4235e8470afSJim Cownie         T small_chunk = tc / nproc;
4245e8470afSJim Cownie         T extras = tc % nproc;
4255e8470afSJim Cownie         init = id * small_chunk + (id < extras ? id : extras);
4265e8470afSJim Cownie         limit = init + small_chunk - (id < extras ? 0 : 1);
4275e8470afSJim Cownie         pr->u.p.parm1 = (id == nproc - 1);
4285e8470afSJim Cownie       }
4295e8470afSJim Cownie     } else {
4305e8470afSJim Cownie       if (tc > 0) {
4315e8470afSJim Cownie         init = 0;
4325e8470afSJim Cownie         limit = tc - 1;
4335e8470afSJim Cownie         pr->u.p.parm1 = TRUE;
43439ada854SJonathan Peyton       } else {
43539ada854SJonathan Peyton         // zero trip count
4365e8470afSJim Cownie         pr->u.p.count = 1; /* means no more chunks to execute */
4375e8470afSJim Cownie         pr->u.p.parm1 = FALSE;
4385e8470afSJim Cownie         break;
4395e8470afSJim Cownie       }
4405e8470afSJim Cownie     }
4414cc4bb4cSJim Cownie #if USE_ITT_BUILD
4424cc4bb4cSJim Cownie     // Calculate chunk for metadata report
44351aecb82SAndrey Churbanov     if (itt_need_metadata_reporting)
44439ada854SJonathan Peyton       if (cur_chunk)
44539ada854SJonathan Peyton         *cur_chunk = limit - init + 1;
4464cc4bb4cSJim Cownie #endif
4475e8470afSJim Cownie     if (st == 1) {
4485e8470afSJim Cownie       pr->u.p.lb = lb + init;
4495e8470afSJim Cownie       pr->u.p.ub = lb + limit;
4505e8470afSJim Cownie     } else {
4513041982dSJonathan Peyton       // calculated upper bound, "ub" is user-defined upper bound
4523041982dSJonathan Peyton       T ub_tmp = lb + limit * st;
4535e8470afSJim Cownie       pr->u.p.lb = lb + init * st;
4543041982dSJonathan Peyton       // adjust upper bound to "ub" if needed, so that MS lastprivate will match
4553041982dSJonathan Peyton       // it exactly
4565e8470afSJim Cownie       if (st > 0) {
4575e8470afSJim Cownie         pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
4585e8470afSJim Cownie       } else {
4595e8470afSJim Cownie         pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
4605e8470afSJim Cownie       }
4615e8470afSJim Cownie     }
46239ada854SJonathan Peyton     if (pr->flags.ordered) {
4635e8470afSJim Cownie       pr->u.p.ordered_lower = init;
4645e8470afSJim Cownie       pr->u.p.ordered_upper = limit;
4655e8470afSJim Cownie     }
4665e8470afSJim Cownie     break;
4675e8470afSJim Cownie   } // case
468d454c73cSAndrey Churbanov   case kmp_sch_static_balanced_chunked: {
469d454c73cSAndrey Churbanov     // similar to balanced, but chunk adjusted to multiple of simd width
47039ada854SJonathan Peyton     T nth = nproc;
47139ada854SJonathan Peyton     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
472d454c73cSAndrey Churbanov                    " -> falling-through to static_greedy\n",
473d454c73cSAndrey Churbanov                    gtid));
474d454c73cSAndrey Churbanov     schedule = kmp_sch_static_greedy;
475d454c73cSAndrey Churbanov     if (nth > 1)
476d454c73cSAndrey Churbanov       pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
477d454c73cSAndrey Churbanov     else
478d454c73cSAndrey Churbanov       pr->u.p.parm1 = tc;
479d454c73cSAndrey Churbanov     break;
480d454c73cSAndrey Churbanov   } // case
48139ada854SJonathan Peyton   case kmp_sch_guided_simd:
48239ada854SJonathan Peyton   case kmp_sch_guided_iterative_chunked: {
48339ada854SJonathan Peyton     KD_TRACE(
48439ada854SJonathan Peyton         100,
48539ada854SJonathan Peyton         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
4863041982dSJonathan Peyton          " case\n",
4873041982dSJonathan Peyton          gtid));
4885e8470afSJim Cownie 
4895e8470afSJim Cownie     if (nproc > 1) {
4905e8470afSJim Cownie       if ((2L * chunk + 1) * nproc >= tc) {
4915e8470afSJim Cownie         /* chunk size too large, switch to dynamic */
4925e8470afSJim Cownie         schedule = kmp_sch_dynamic_chunked;
4935e8470afSJim Cownie       } else {
4945e8470afSJim Cownie         // when remaining iters become less than parm2 - switch to dynamic
4955e8470afSJim Cownie         pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
4963041982dSJonathan Peyton         *(double *)&pr->u.p.parm3 =
4976b316febSTerry Wilmarth             guided_flt_param / (double)nproc; // may occupy parm3 and parm4
4985e8470afSJim Cownie       }
4995e8470afSJim Cownie     } else {
50039ada854SJonathan Peyton       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
5013041982dSJonathan Peyton                      "kmp_sch_static_greedy\n",
5023041982dSJonathan Peyton                      gtid));
5035e8470afSJim Cownie       schedule = kmp_sch_static_greedy;
5045e8470afSJim Cownie       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
50539ada854SJonathan Peyton       KD_TRACE(
50639ada854SJonathan Peyton           100,
50739ada854SJonathan Peyton           ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
5083041982dSJonathan Peyton            gtid));
5095e8470afSJim Cownie       pr->u.p.parm1 = tc;
5105e8470afSJim Cownie     } // if
5115e8470afSJim Cownie   } // case
5125e8470afSJim Cownie   break;
5133041982dSJonathan Peyton   case kmp_sch_guided_analytical_chunked: {
51439ada854SJonathan Peyton     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
51539ada854SJonathan Peyton                    "kmp_sch_guided_analytical_chunked case\n",
5163041982dSJonathan Peyton                    gtid));
51739ada854SJonathan Peyton 
5185e8470afSJim Cownie     if (nproc > 1) {
5195e8470afSJim Cownie       if ((2L * chunk + 1) * nproc >= tc) {
5205e8470afSJim Cownie         /* chunk size too large, switch to dynamic */
5215e8470afSJim Cownie         schedule = kmp_sch_dynamic_chunked;
5225e8470afSJim Cownie       } else {
5235e8470afSJim Cownie         /* commonly used term: (2 nproc - 1)/(2 nproc) */
5245e8470afSJim Cownie         DBL x;
5255e8470afSJim Cownie 
526f700e9edSAndrey Churbanov #if KMP_USE_X87CONTROL
5273041982dSJonathan Peyton         /* Linux* OS already has 64-bit computation by default for long double,
5283041982dSJonathan Peyton            and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
5293041982dSJonathan Peyton            Windows* OS on IA-32 architecture, we need to set precision to 64-bit
5303041982dSJonathan Peyton            instead of the default 53-bit. Even though long double doesn't work
5313041982dSJonathan Peyton            on Windows* OS on Intel(R) 64, the resulting lack of precision is not
5323041982dSJonathan Peyton            expected to impact the correctness of the algorithm, but this has not
5333041982dSJonathan Peyton            been mathematically proven. */
5345e8470afSJim Cownie         // save original FPCW and set precision to 64-bit, as
5355e8470afSJim Cownie         // Windows* OS on IA-32 architecture defaults to 53-bit
536181b4bb3SJim Cownie         unsigned int oldFpcw = _control87(0, 0);
537181b4bb3SJim Cownie         _control87(_PC_64, _MCW_PC); // 0,0x30000
5385e8470afSJim Cownie #endif
5395e8470afSJim Cownie         /* value used for comparison in solver for cross-over point */
5405e8470afSJim Cownie         long double target = ((long double)chunk * 2 + 1) * nproc / tc;
5415e8470afSJim Cownie 
5425e8470afSJim Cownie         /* crossover point--chunk indexes equal to or greater than
5435e8470afSJim Cownie            this point switch to dynamic-style scheduling */
5445e8470afSJim Cownie         UT cross;
5455e8470afSJim Cownie 
5465e8470afSJim Cownie         /* commonly used term: (2 nproc - 1)/(2 nproc) */
5476b316febSTerry Wilmarth         x = 1.0 - 0.5 / (double)nproc;
5485e8470afSJim Cownie 
5495e8470afSJim Cownie #ifdef KMP_DEBUG
5505e8470afSJim Cownie         { // test natural alignment
5515e8470afSJim Cownie           struct _test_a {
5525e8470afSJim Cownie             char a;
5535e8470afSJim Cownie             union {
5545e8470afSJim Cownie               char b;
5555e8470afSJim Cownie               DBL d;
5565e8470afSJim Cownie             };
5575e8470afSJim Cownie           } t;
5583041982dSJonathan Peyton           ptrdiff_t natural_alignment =
5593041982dSJonathan Peyton               (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
5603041982dSJonathan Peyton           //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
5613041982dSJonathan Peyton           // long)natural_alignment );
5623041982dSJonathan Peyton           KMP_DEBUG_ASSERT(
5633041982dSJonathan Peyton               (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
5645e8470afSJim Cownie         }
5655e8470afSJim Cownie #endif // KMP_DEBUG
5665e8470afSJim Cownie 
5675e8470afSJim Cownie         /* save the term in thread private dispatch structure */
5685e8470afSJim Cownie         *(DBL *)&pr->u.p.parm3 = x;
5695e8470afSJim Cownie 
5703041982dSJonathan Peyton         /* solve for the crossover point to the nearest integer i for which C_i
5713041982dSJonathan Peyton            <= chunk */
5725e8470afSJim Cownie         {
5735e8470afSJim Cownie           UT left, right, mid;
5745e8470afSJim Cownie           long double p;
5755e8470afSJim Cownie 
5765e8470afSJim Cownie           /* estimate initial upper and lower bound */
5775e8470afSJim Cownie 
5785e8470afSJim Cownie           /* doesn't matter what value right is as long as it is positive, but
5793041982dSJonathan Peyton              it affects performance of the solver */
5805e8470afSJim Cownie           right = 229;
5815e8470afSJim Cownie           p = __kmp_pow<UT>(x, right);
5825e8470afSJim Cownie           if (p > target) {
5835e8470afSJim Cownie             do {
5845e8470afSJim Cownie               p *= p;
5855e8470afSJim Cownie               right <<= 1;
5865e8470afSJim Cownie             } while (p > target && right < (1 << 27));
5873041982dSJonathan Peyton             /* lower bound is previous (failed) estimate of upper bound */
5883041982dSJonathan Peyton             left = right >> 1;
5895e8470afSJim Cownie           } else {
5905e8470afSJim Cownie             left = 0;
5915e8470afSJim Cownie           }
5925e8470afSJim Cownie 
5935e8470afSJim Cownie           /* bisection root-finding method */
5945e8470afSJim Cownie           while (left + 1 < right) {
5955e8470afSJim Cownie             mid = (left + right) / 2;
5965e8470afSJim Cownie             if (__kmp_pow<UT>(x, mid) > target) {
5975e8470afSJim Cownie               left = mid;
5985e8470afSJim Cownie             } else {
5995e8470afSJim Cownie               right = mid;
6005e8470afSJim Cownie             }
6015e8470afSJim Cownie           } // while
6025e8470afSJim Cownie           cross = right;
6035e8470afSJim Cownie         }
6045e8470afSJim Cownie         /* assert sanity of computed crossover point */
6053041982dSJonathan Peyton         KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
6063041982dSJonathan Peyton                    __kmp_pow<UT>(x, cross) <= target);
6075e8470afSJim Cownie 
6085e8470afSJim Cownie         /* save the crossover point in thread private dispatch structure */
6095e8470afSJim Cownie         pr->u.p.parm2 = cross;
6105e8470afSJim Cownie 
6115e8470afSJim Cownie // C75803
6125e8470afSJim Cownie #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
6135e8470afSJim Cownie #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
6145e8470afSJim Cownie #else
6155e8470afSJim Cownie #define GUIDED_ANALYTICAL_WORKAROUND (x)
6165e8470afSJim Cownie #endif
6175e8470afSJim Cownie         /* dynamic-style scheduling offset */
618*309b00a4SShilei Tian         pr->u.p.count = tc -
619*309b00a4SShilei Tian                         __kmp_dispatch_guided_remaining(
6203041982dSJonathan Peyton                             tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
6213041982dSJonathan Peyton                         cross * chunk;
622f700e9edSAndrey Churbanov #if KMP_USE_X87CONTROL
6235e8470afSJim Cownie         // restore FPCW
624181b4bb3SJim Cownie         _control87(oldFpcw, _MCW_PC);
6255e8470afSJim Cownie #endif
6265e8470afSJim Cownie       } // if
6275e8470afSJim Cownie     } else {
62839ada854SJonathan Peyton       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
6293041982dSJonathan Peyton                      "kmp_sch_static_greedy\n",
6305e8470afSJim Cownie                      gtid));
6315e8470afSJim Cownie       schedule = kmp_sch_static_greedy;
6325e8470afSJim Cownie       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
6335e8470afSJim Cownie       pr->u.p.parm1 = tc;
6345e8470afSJim Cownie     } // if
6355e8470afSJim Cownie   } // case
6365e8470afSJim Cownie   break;
6375e8470afSJim Cownie   case kmp_sch_static_greedy:
63839ada854SJonathan Peyton     KD_TRACE(
63939ada854SJonathan Peyton         100,
64039ada854SJonathan Peyton         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
64139ada854SJonathan Peyton          gtid));
64239ada854SJonathan Peyton     pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
6435e8470afSJim Cownie     break;
6445e8470afSJim Cownie   case kmp_sch_static_chunked:
6455e8470afSJim Cownie   case kmp_sch_dynamic_chunked:
64670bda912SJonathan Peyton     if (pr->u.p.parm1 <= 0) {
64770bda912SJonathan Peyton       pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
64870bda912SJonathan Peyton     }
64939ada854SJonathan Peyton     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
6503041982dSJonathan Peyton                    "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
6513041982dSJonathan Peyton                    gtid));
6525e8470afSJim Cownie     break;
6533041982dSJonathan Peyton   case kmp_sch_trapezoidal: {
6545e8470afSJim Cownie     /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
6555e8470afSJim Cownie 
6565e8470afSJim Cownie     T parm1, parm2, parm3, parm4;
6573041982dSJonathan Peyton     KD_TRACE(100,
65839ada854SJonathan Peyton              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
65939ada854SJonathan Peyton               gtid));
6605e8470afSJim Cownie 
6615e8470afSJim Cownie     parm1 = chunk;
6625e8470afSJim Cownie 
6635e8470afSJim Cownie     /* F : size of the first cycle */
66439ada854SJonathan Peyton     parm2 = (tc / (2 * nproc));
6655e8470afSJim Cownie 
6665e8470afSJim Cownie     if (parm2 < 1) {
6675e8470afSJim Cownie       parm2 = 1;
6685e8470afSJim Cownie     }
6695e8470afSJim Cownie 
6703041982dSJonathan Peyton     /* L : size of the last cycle.  Make sure the last cycle is not larger
6713041982dSJonathan Peyton        than the first cycle. */
6725e8470afSJim Cownie     if (parm1 < 1) {
6735e8470afSJim Cownie       parm1 = 1;
6745e8470afSJim Cownie     } else if (parm1 > parm2) {
6755e8470afSJim Cownie       parm1 = parm2;
6765e8470afSJim Cownie     }
6775e8470afSJim Cownie 
6785e8470afSJim Cownie     /* N : number of cycles */
6795e8470afSJim Cownie     parm3 = (parm2 + parm1);
6805e8470afSJim Cownie     parm3 = (2 * tc + parm3 - 1) / parm3;
6815e8470afSJim Cownie 
6825e8470afSJim Cownie     if (parm3 < 2) {
6835e8470afSJim Cownie       parm3 = 2;
6845e8470afSJim Cownie     }
6855e8470afSJim Cownie 
6865e8470afSJim Cownie     /* sigma : decreasing incr of the trapezoid */
6875e8470afSJim Cownie     parm4 = (parm3 - 1);
6885e8470afSJim Cownie     parm4 = (parm2 - parm1) / parm4;
6895e8470afSJim Cownie 
6905e8470afSJim Cownie     // pointless check, because parm4 >= 0 always
6915e8470afSJim Cownie     // if ( parm4 < 0 ) {
6925e8470afSJim Cownie     //    parm4 = 0;
6935e8470afSJim Cownie     //}
6945e8470afSJim Cownie 
6955e8470afSJim Cownie     pr->u.p.parm1 = parm1;
6965e8470afSJim Cownie     pr->u.p.parm2 = parm2;
6975e8470afSJim Cownie     pr->u.p.parm3 = parm3;
6985e8470afSJim Cownie     pr->u.p.parm4 = parm4;
6995e8470afSJim Cownie   } // case
7005e8470afSJim Cownie   break;
7015e8470afSJim Cownie 
7023041982dSJonathan Peyton   default: {
7036a393f75SJonathan Peyton     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
7045e8470afSJim Cownie                 KMP_HNT(GetNewerLibrary), // Hint
7055e8470afSJim Cownie                 __kmp_msg_null // Variadic argument list terminator
7065e8470afSJim Cownie     );
7073041982dSJonathan Peyton   } break;
7085e8470afSJim Cownie   } // switch
7095e8470afSJim Cownie   pr->schedule = schedule;
71039ada854SJonathan Peyton }
71139ada854SJonathan Peyton 
712f6399367SJonathan Peyton #if KMP_USE_HIER_SCHED
713f6399367SJonathan Peyton template <typename T>
714f6399367SJonathan Peyton inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
715f6399367SJonathan Peyton                                              typename traits_t<T>::signed_t st);
716f6399367SJonathan Peyton template <>
717f6399367SJonathan Peyton inline void
718f6399367SJonathan Peyton __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
719f6399367SJonathan Peyton                                             kmp_int32 ub, kmp_int32 st) {
720f6399367SJonathan Peyton   __kmp_dispatch_init_hierarchy<kmp_int32>(
721f6399367SJonathan Peyton       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
722f6399367SJonathan Peyton       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
723f6399367SJonathan Peyton }
724f6399367SJonathan Peyton template <>
725f6399367SJonathan Peyton inline void
726f6399367SJonathan Peyton __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
727f6399367SJonathan Peyton                                              kmp_uint32 ub, kmp_int32 st) {
728f6399367SJonathan Peyton   __kmp_dispatch_init_hierarchy<kmp_uint32>(
729f6399367SJonathan Peyton       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
730f6399367SJonathan Peyton       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
731f6399367SJonathan Peyton }
732f6399367SJonathan Peyton template <>
733f6399367SJonathan Peyton inline void
734f6399367SJonathan Peyton __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
735f6399367SJonathan Peyton                                             kmp_int64 ub, kmp_int64 st) {
736f6399367SJonathan Peyton   __kmp_dispatch_init_hierarchy<kmp_int64>(
737f6399367SJonathan Peyton       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
738f6399367SJonathan Peyton       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
739f6399367SJonathan Peyton }
740f6399367SJonathan Peyton template <>
741f6399367SJonathan Peyton inline void
742f6399367SJonathan Peyton __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
743f6399367SJonathan Peyton                                              kmp_uint64 ub, kmp_int64 st) {
744f6399367SJonathan Peyton   __kmp_dispatch_init_hierarchy<kmp_uint64>(
745f6399367SJonathan Peyton       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
746f6399367SJonathan Peyton       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
747f6399367SJonathan Peyton }
748f6399367SJonathan Peyton 
749f6399367SJonathan Peyton // free all the hierarchy scheduling memory associated with the team
750f6399367SJonathan Peyton void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
751f6399367SJonathan Peyton   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
752f6399367SJonathan Peyton   for (int i = 0; i < num_disp_buff; ++i) {
753f6399367SJonathan Peyton     // type does not matter here so use kmp_int32
754f6399367SJonathan Peyton     auto sh =
755f6399367SJonathan Peyton         reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
756f6399367SJonathan Peyton             &team->t.t_disp_buffer[i]);
757f6399367SJonathan Peyton     if (sh->hier) {
758f6399367SJonathan Peyton       sh->hier->deallocate();
759f6399367SJonathan Peyton       __kmp_free(sh->hier);
760f6399367SJonathan Peyton     }
761f6399367SJonathan Peyton   }
762f6399367SJonathan Peyton }
763f6399367SJonathan Peyton #endif
764f6399367SJonathan Peyton 
76539ada854SJonathan Peyton // UT - unsigned flavor of T, ST - signed flavor of T,
76639ada854SJonathan Peyton // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
76739ada854SJonathan Peyton template <typename T>
76839ada854SJonathan Peyton static void
76939ada854SJonathan Peyton __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
77039ada854SJonathan Peyton                     T ub, typename traits_t<T>::signed_t st,
77139ada854SJonathan Peyton                     typename traits_t<T>::signed_t chunk, int push_ws) {
77239ada854SJonathan Peyton   typedef typename traits_t<T>::unsigned_t UT;
77339ada854SJonathan Peyton 
77439ada854SJonathan Peyton   int active;
77539ada854SJonathan Peyton   kmp_info_t *th;
77639ada854SJonathan Peyton   kmp_team_t *team;
77739ada854SJonathan Peyton   kmp_uint32 my_buffer_index;
77839ada854SJonathan Peyton   dispatch_private_info_template<T> *pr;
77939ada854SJonathan Peyton   dispatch_shared_info_template<T> volatile *sh;
78039ada854SJonathan Peyton 
78139ada854SJonathan Peyton   KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
78239ada854SJonathan Peyton                    sizeof(dispatch_private_info));
78339ada854SJonathan Peyton   KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
78439ada854SJonathan Peyton                    sizeof(dispatch_shared_info));
785787eb0c6SAndreyChurbanov   __kmp_assert_valid_gtid(gtid);
78639ada854SJonathan Peyton 
78739ada854SJonathan Peyton   if (!TCR_4(__kmp_init_parallel))
78839ada854SJonathan Peyton     __kmp_parallel_initialize();
78939ada854SJonathan Peyton 
7909b8bb323SJonathan Peyton   __kmp_resume_if_soft_paused();
7919b8bb323SJonathan Peyton 
79239ada854SJonathan Peyton #if INCLUDE_SSC_MARKS
79339ada854SJonathan Peyton   SSC_MARK_DISPATCH_INIT();
79439ada854SJonathan Peyton #endif
79539ada854SJonathan Peyton #ifdef KMP_DEBUG
796baad3f60SJonathan Peyton   typedef typename traits_t<T>::signed_t ST;
79739ada854SJonathan Peyton   {
79839ada854SJonathan Peyton     char *buff;
79939ada854SJonathan Peyton     // create format specifiers before the debug output
80039ada854SJonathan Peyton     buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
80139ada854SJonathan Peyton                             "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
80239ada854SJonathan Peyton                             traits_t<ST>::spec, traits_t<T>::spec,
80339ada854SJonathan Peyton                             traits_t<T>::spec, traits_t<ST>::spec);
80439ada854SJonathan Peyton     KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
80539ada854SJonathan Peyton     __kmp_str_free(&buff);
80639ada854SJonathan Peyton   }
80739ada854SJonathan Peyton #endif
80839ada854SJonathan Peyton   /* setup data */
80939ada854SJonathan Peyton   th = __kmp_threads[gtid];
81039ada854SJonathan Peyton   team = th->th.th_team;
81139ada854SJonathan Peyton   active = !team->t.t_serialized;
81239ada854SJonathan Peyton   th->th.th_ident = loc;
81339ada854SJonathan Peyton 
814f0682ac4SJonathan Peyton   // Any half-decent optimizer will remove this test when the blocks are empty
815f0682ac4SJonathan Peyton   // since the macros expand to nothing
816f0682ac4SJonathan Peyton   // when statistics are disabled.
817f0682ac4SJonathan Peyton   if (schedule == __kmp_static) {
818f0682ac4SJonathan Peyton     KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
819f0682ac4SJonathan Peyton   } else {
820f0682ac4SJonathan Peyton     KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
821f0682ac4SJonathan Peyton   }
822f0682ac4SJonathan Peyton 
823f6399367SJonathan Peyton #if KMP_USE_HIER_SCHED
824f6399367SJonathan Peyton   // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable
825f6399367SJonathan Peyton   // Hierarchical scheduling does not work with ordered, so if ordered is
826f6399367SJonathan Peyton   // detected, then revert back to threaded scheduling.
827f6399367SJonathan Peyton   bool ordered;
828f6399367SJonathan Peyton   enum sched_type my_sched = schedule;
829f6399367SJonathan Peyton   my_buffer_index = th->th.th_dispatch->th_disp_index;
830f6399367SJonathan Peyton   pr = reinterpret_cast<dispatch_private_info_template<T> *>(
831f6399367SJonathan Peyton       &th->th.th_dispatch
832f6399367SJonathan Peyton            ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
833f6399367SJonathan Peyton   my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
834f6399367SJonathan Peyton   if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
835f6399367SJonathan Peyton     my_sched =
836f6399367SJonathan Peyton         (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
837f6399367SJonathan Peyton   ordered = (kmp_ord_lower & my_sched);
838f6399367SJonathan Peyton   if (pr->flags.use_hier) {
839f6399367SJonathan Peyton     if (ordered) {
840f6399367SJonathan Peyton       KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected.  "
841f6399367SJonathan Peyton                      "Disabling hierarchical scheduling.\n",
842f6399367SJonathan Peyton                      gtid));
843f6399367SJonathan Peyton       pr->flags.use_hier = FALSE;
844f6399367SJonathan Peyton     }
845f6399367SJonathan Peyton   }
846f6399367SJonathan Peyton   if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
847f6399367SJonathan Peyton     // Don't use hierarchical for ordered parallel loops and don't
848f6399367SJonathan Peyton     // use the runtime hierarchy if one was specified in the program
849f6399367SJonathan Peyton     if (!ordered && !pr->flags.use_hier)
850f6399367SJonathan Peyton       __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
851f6399367SJonathan Peyton   }
852f6399367SJonathan Peyton #endif // KMP_USE_HIER_SCHED
853f6399367SJonathan Peyton 
85439ada854SJonathan Peyton #if USE_ITT_BUILD
85539ada854SJonathan Peyton   kmp_uint64 cur_chunk = chunk;
856e4b4f994SJonathan Peyton   int itt_need_metadata_reporting =
857e4b4f994SJonathan Peyton       __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
858e4b4f994SJonathan Peyton       KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
85939ada854SJonathan Peyton       team->t.t_active_level == 1;
86039ada854SJonathan Peyton #endif
86139ada854SJonathan Peyton   if (!active) {
86239ada854SJonathan Peyton     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
86339ada854SJonathan Peyton         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
86439ada854SJonathan Peyton   } else {
86539ada854SJonathan Peyton     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
86639ada854SJonathan Peyton                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
86739ada854SJonathan Peyton 
86839ada854SJonathan Peyton     my_buffer_index = th->th.th_dispatch->th_disp_index++;
86939ada854SJonathan Peyton 
87039ada854SJonathan Peyton     /* What happens when number of threads changes, need to resize buffer? */
87139ada854SJonathan Peyton     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
87239ada854SJonathan Peyton         &th->th.th_dispatch
87339ada854SJonathan Peyton              ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
87439ada854SJonathan Peyton     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
87539ada854SJonathan Peyton         &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
87639ada854SJonathan Peyton     KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
87739ada854SJonathan Peyton                   my_buffer_index));
87839ada854SJonathan Peyton   }
87939ada854SJonathan Peyton 
88039ada854SJonathan Peyton   __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
88139ada854SJonathan Peyton #if USE_ITT_BUILD
88239ada854SJonathan Peyton                                 &cur_chunk,
88339ada854SJonathan Peyton #endif
88439ada854SJonathan Peyton                                 chunk, (T)th->th.th_team_nproc,
88539ada854SJonathan Peyton                                 (T)th->th.th_info.ds.ds_tid);
88639ada854SJonathan Peyton   if (active) {
88739ada854SJonathan Peyton     if (pr->flags.ordered == 0) {
88839ada854SJonathan Peyton       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
88939ada854SJonathan Peyton       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
89039ada854SJonathan Peyton     } else {
89139ada854SJonathan Peyton       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
89239ada854SJonathan Peyton       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
89339ada854SJonathan Peyton     }
89439ada854SJonathan Peyton   }
89539ada854SJonathan Peyton 
8965e8470afSJim Cownie   if (active) {
8973041982dSJonathan Peyton     /* The name of this buffer should be my_buffer_index when it's free to use
8983041982dSJonathan Peyton      * it */
8995e8470afSJim Cownie 
9003041982dSJonathan Peyton     KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
9013041982dSJonathan Peyton                    "sh->buffer_index:%d\n",
9025e8470afSJim Cownie                    gtid, my_buffer_index, sh->buffer_index));
903e47d32f1SJonathan Peyton     __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
9043041982dSJonathan Peyton                            __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
905e47d32f1SJonathan Peyton     // Note: KMP_WAIT() cannot be used there: buffer index and
9063041982dSJonathan Peyton     // my_buffer_index are *always* 32-bit integers.
9075e8470afSJim Cownie     KMP_MB(); /* is this necessary? */
9083041982dSJonathan Peyton     KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
9093041982dSJonathan Peyton                    "sh->buffer_index:%d\n",
9105e8470afSJim Cownie                    gtid, my_buffer_index, sh->buffer_index));
9115e8470afSJim Cownie 
9125e8470afSJim Cownie     th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
913c47afcd9SAndrey Churbanov     th->th.th_dispatch->th_dispatch_sh_current =
9145ba90c79SAndrey Churbanov         CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
9155e8470afSJim Cownie #if USE_ITT_BUILD
91639ada854SJonathan Peyton     if (pr->flags.ordered) {
9175e8470afSJim Cownie       __kmp_itt_ordered_init(gtid);
918bd3a7633SJonathan Peyton     }
9194cc4bb4cSJim Cownie     // Report loop metadata
92051aecb82SAndrey Churbanov     if (itt_need_metadata_reporting) {
92151aecb82SAndrey Churbanov       // Only report metadata by master of active team at level 1
9224cc4bb4cSJim Cownie       kmp_uint64 schedtype = 0;
9234cc4bb4cSJim Cownie       switch (schedule) {
9244cc4bb4cSJim Cownie       case kmp_sch_static_chunked:
9254cc4bb4cSJim Cownie       case kmp_sch_static_balanced: // Chunk is calculated in the switch above
9264cc4bb4cSJim Cownie         break;
9274cc4bb4cSJim Cownie       case kmp_sch_static_greedy:
9284cc4bb4cSJim Cownie         cur_chunk = pr->u.p.parm1;
9294cc4bb4cSJim Cownie         break;
9304cc4bb4cSJim Cownie       case kmp_sch_dynamic_chunked:
9314cc4bb4cSJim Cownie         schedtype = 1;
9324cc4bb4cSJim Cownie         break;
9334cc4bb4cSJim Cownie       case kmp_sch_guided_iterative_chunked:
9344cc4bb4cSJim Cownie       case kmp_sch_guided_analytical_chunked:
935d454c73cSAndrey Churbanov       case kmp_sch_guided_simd:
9364cc4bb4cSJim Cownie         schedtype = 2;
9374cc4bb4cSJim Cownie         break;
9384cc4bb4cSJim Cownie       default:
9394cc4bb4cSJim Cownie         // Should we put this case under "static"?
9404cc4bb4cSJim Cownie         // case kmp_sch_static_steal:
9414cc4bb4cSJim Cownie         schedtype = 3;
9424cc4bb4cSJim Cownie         break;
9434cc4bb4cSJim Cownie       }
94439ada854SJonathan Peyton       __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
9454cc4bb4cSJim Cownie     }
946f6399367SJonathan Peyton #if KMP_USE_HIER_SCHED
947f6399367SJonathan Peyton     if (pr->flags.use_hier) {
948f6399367SJonathan Peyton       pr->u.p.count = 0;
949f6399367SJonathan Peyton       pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
950f6399367SJonathan Peyton     }
951f6399367SJonathan Peyton #endif // KMP_USER_HIER_SCHED
9524cc4bb4cSJim Cownie #endif /* USE_ITT_BUILD */
953bd3a7633SJonathan Peyton   }
9544cc4bb4cSJim Cownie 
9555e8470afSJim Cownie #ifdef KMP_DEBUG
9565e8470afSJim Cownie   {
957aeb40adaSJonas Hahnfeld     char *buff;
9585e8470afSJim Cownie     // create format specifiers before the debug output
9595e8470afSJim Cownie     buff = __kmp_str_format(
9603041982dSJonathan Peyton         "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
9613041982dSJonathan Peyton         "lb:%%%s ub:%%%s"
9623041982dSJonathan Peyton         " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
9635e8470afSJim Cownie         " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
9645e8470afSJim Cownie         traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
9655e8470afSJim Cownie         traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
9665e8470afSJim Cownie         traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
9675e8470afSJim Cownie         traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
96839ada854SJonathan Peyton     KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
96939ada854SJonathan Peyton                   pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
97039ada854SJonathan Peyton                   pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
97139ada854SJonathan Peyton                   pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
9725e8470afSJim Cownie     __kmp_str_free(&buff);
9735e8470afSJim Cownie   }
9745e8470afSJim Cownie #endif
9755e8470afSJim Cownie #if (KMP_STATIC_STEAL_ENABLED)
9763041982dSJonathan Peyton   // It cannot be guaranteed that after execution of a loop with some other
9773041982dSJonathan Peyton   // schedule kind all the parm3 variables will contain the same value. Even if
9783041982dSJonathan Peyton   // all parm3 will be the same, it still exists a bad case like using 0 and 1
9793041982dSJonathan Peyton   // rather than program life-time increment. So the dedicated variable is
9803041982dSJonathan Peyton   // required. The 'static_steal_counter' is used.
981abe64360SAndreyChurbanov   if (pr->schedule == kmp_sch_static_steal) {
9825e8470afSJim Cownie     // Other threads will inspect this variable when searching for a victim.
9833041982dSJonathan Peyton     // This is a flag showing that other threads may steal from this thread
9843041982dSJonathan Peyton     // since then.
9855e8470afSJim Cownie     volatile T *p = &pr->u.p.static_steal_counter;
9865e8470afSJim Cownie     *p = *p + 1;
9875e8470afSJim Cownie   }
988429dbc2aSAndrey Churbanov #endif // ( KMP_STATIC_STEAL_ENABLED )
989d7d088f8SAndrey Churbanov 
99082e94a59SJoachim Protze #if OMPT_SUPPORT && OMPT_OPTIONAL
99182e94a59SJoachim Protze   if (ompt_enabled.ompt_callback_work) {
992d7d088f8SAndrey Churbanov     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
99382e94a59SJoachim Protze     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
99482e94a59SJoachim Protze     ompt_callbacks.ompt_callback(ompt_callback_work)(
99582e94a59SJoachim Protze         ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
99639ada854SJonathan Peyton         &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
997d7d088f8SAndrey Churbanov   }
998d7d088f8SAndrey Churbanov #endif
999f0682ac4SJonathan Peyton   KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
10005e8470afSJim Cownie }
10015e8470afSJim Cownie 
10023041982dSJonathan Peyton /* For ordered loops, either __kmp_dispatch_finish() should be called after
10035e8470afSJim Cownie  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
10045e8470afSJim Cownie  * every chunk of iterations.  If the ordered section(s) were not executed
10055e8470afSJim Cownie  * for this iteration (or every iteration in this chunk), we need to set the
10063041982dSJonathan Peyton  * ordered iteration counters so that the next thread can proceed. */
10075e8470afSJim Cownie template <typename UT>
10083041982dSJonathan Peyton static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
10095e8470afSJim Cownie   typedef typename traits_t<UT>::signed_t ST;
1010787eb0c6SAndreyChurbanov   __kmp_assert_valid_gtid(gtid);
10115e8470afSJim Cownie   kmp_info_t *th = __kmp_threads[gtid];
10125e8470afSJim Cownie 
10135e8470afSJim Cownie   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
10145e8470afSJim Cownie   if (!th->th.th_team->t.t_serialized) {
10155e8470afSJim Cownie 
10165e8470afSJim Cownie     dispatch_private_info_template<UT> *pr =
10173041982dSJonathan Peyton         reinterpret_cast<dispatch_private_info_template<UT> *>(
10183041982dSJonathan Peyton             th->th.th_dispatch->th_dispatch_pr_current);
10195e8470afSJim Cownie     dispatch_shared_info_template<UT> volatile *sh =
10203041982dSJonathan Peyton         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
10213041982dSJonathan Peyton             th->th.th_dispatch->th_dispatch_sh_current);
10225e8470afSJim Cownie     KMP_DEBUG_ASSERT(pr);
10235e8470afSJim Cownie     KMP_DEBUG_ASSERT(sh);
10245e8470afSJim Cownie     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
10255e8470afSJim Cownie                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
10265e8470afSJim Cownie 
10275e8470afSJim Cownie     if (pr->ordered_bumped) {
10283041982dSJonathan Peyton       KD_TRACE(
10293041982dSJonathan Peyton           1000,
10303041982dSJonathan Peyton           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
10315e8470afSJim Cownie            gtid));
10325e8470afSJim Cownie       pr->ordered_bumped = 0;
10335e8470afSJim Cownie     } else {
10345e8470afSJim Cownie       UT lower = pr->u.p.ordered_lower;
10355e8470afSJim Cownie 
10365e8470afSJim Cownie #ifdef KMP_DEBUG
10375e8470afSJim Cownie       {
1038aeb40adaSJonas Hahnfeld         char *buff;
10395e8470afSJim Cownie         // create format specifiers before the debug output
10403041982dSJonathan Peyton         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
10413041982dSJonathan Peyton                                 "ordered_iteration:%%%s lower:%%%s\n",
10425e8470afSJim Cownie                                 traits_t<UT>::spec, traits_t<UT>::spec);
10435e8470afSJim Cownie         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
10445e8470afSJim Cownie         __kmp_str_free(&buff);
10455e8470afSJim Cownie       }
10465e8470afSJim Cownie #endif
10475e8470afSJim Cownie 
1048e47d32f1SJonathan Peyton       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
10493041982dSJonathan Peyton                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
10505e8470afSJim Cownie       KMP_MB(); /* is this necessary? */
10515e8470afSJim Cownie #ifdef KMP_DEBUG
10525e8470afSJim Cownie       {
1053aeb40adaSJonas Hahnfeld         char *buff;
10545e8470afSJim Cownie         // create format specifiers before the debug output
10553041982dSJonathan Peyton         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
10563041982dSJonathan Peyton                                 "ordered_iteration:%%%s lower:%%%s\n",
10575e8470afSJim Cownie                                 traits_t<UT>::spec, traits_t<UT>::spec);
10585e8470afSJim Cownie         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
10595e8470afSJim Cownie         __kmp_str_free(&buff);
10605e8470afSJim Cownie       }
10615e8470afSJim Cownie #endif
10625e8470afSJim Cownie 
10635e8470afSJim Cownie       test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
10645e8470afSJim Cownie     } // if
10655e8470afSJim Cownie   } // if
10665e8470afSJim Cownie   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
10675e8470afSJim Cownie }
10685e8470afSJim Cownie 
10695e8470afSJim Cownie #ifdef KMP_GOMP_COMPAT
10705e8470afSJim Cownie 
10715e8470afSJim Cownie template <typename UT>
10723041982dSJonathan Peyton static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
10735e8470afSJim Cownie   typedef typename traits_t<UT>::signed_t ST;
1074787eb0c6SAndreyChurbanov   __kmp_assert_valid_gtid(gtid);
10755e8470afSJim Cownie   kmp_info_t *th = __kmp_threads[gtid];
10765e8470afSJim Cownie 
10775e8470afSJim Cownie   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
10785e8470afSJim Cownie   if (!th->th.th_team->t.t_serialized) {
10795e8470afSJim Cownie     //        int cid;
10805e8470afSJim Cownie     dispatch_private_info_template<UT> *pr =
10813041982dSJonathan Peyton         reinterpret_cast<dispatch_private_info_template<UT> *>(
10823041982dSJonathan Peyton             th->th.th_dispatch->th_dispatch_pr_current);
10835e8470afSJim Cownie     dispatch_shared_info_template<UT> volatile *sh =
10843041982dSJonathan Peyton         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
10853041982dSJonathan Peyton             th->th.th_dispatch->th_dispatch_sh_current);
10865e8470afSJim Cownie     KMP_DEBUG_ASSERT(pr);
10875e8470afSJim Cownie     KMP_DEBUG_ASSERT(sh);
10885e8470afSJim Cownie     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
10895e8470afSJim Cownie                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
10905e8470afSJim Cownie 
10915e8470afSJim Cownie     //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
10925e8470afSJim Cownie     UT lower = pr->u.p.ordered_lower;
10935e8470afSJim Cownie     UT upper = pr->u.p.ordered_upper;
10945e8470afSJim Cownie     UT inc = upper - lower + 1;
10955e8470afSJim Cownie 
10965e8470afSJim Cownie     if (pr->ordered_bumped == inc) {
10973041982dSJonathan Peyton       KD_TRACE(
10983041982dSJonathan Peyton           1000,
10993041982dSJonathan Peyton           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
11005e8470afSJim Cownie            gtid));
11015e8470afSJim Cownie       pr->ordered_bumped = 0;
11025e8470afSJim Cownie     } else {
11035e8470afSJim Cownie       inc -= pr->ordered_bumped;
11045e8470afSJim Cownie 
11055e8470afSJim Cownie #ifdef KMP_DEBUG
11065e8470afSJim Cownie       {
1107aeb40adaSJonas Hahnfeld         char *buff;
11085e8470afSJim Cownie         // create format specifiers before the debug output
11095e8470afSJim Cownie         buff = __kmp_str_format(
11103041982dSJonathan Peyton             "__kmp_dispatch_finish_chunk: T#%%d before wait: "
11115e8470afSJim Cownie             "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
11125e8470afSJim Cownie             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
11135e8470afSJim Cownie         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
11145e8470afSJim Cownie         __kmp_str_free(&buff);
11155e8470afSJim Cownie       }
11165e8470afSJim Cownie #endif
11175e8470afSJim Cownie 
1118e47d32f1SJonathan Peyton       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
11193041982dSJonathan Peyton                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
11205e8470afSJim Cownie 
11215e8470afSJim Cownie       KMP_MB(); /* is this necessary? */
11223041982dSJonathan Peyton       KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
11233041982dSJonathan Peyton                       "ordered_bumped to zero\n",
11245e8470afSJim Cownie                       gtid));
11255e8470afSJim Cownie       pr->ordered_bumped = 0;
11265e8470afSJim Cownie //!!!!! TODO check if the inc should be unsigned, or signed???
11275e8470afSJim Cownie #ifdef KMP_DEBUG
11285e8470afSJim Cownie       {
1129aeb40adaSJonas Hahnfeld         char *buff;
11305e8470afSJim Cownie         // create format specifiers before the debug output
11315e8470afSJim Cownie         buff = __kmp_str_format(
11323041982dSJonathan Peyton             "__kmp_dispatch_finish_chunk: T#%%d after wait: "
11335e8470afSJim Cownie             "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
11343041982dSJonathan Peyton             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
11353041982dSJonathan Peyton             traits_t<UT>::spec);
11363041982dSJonathan Peyton         KD_TRACE(1000,
11373041982dSJonathan Peyton                  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
11385e8470afSJim Cownie         __kmp_str_free(&buff);
11395e8470afSJim Cownie       }
11405e8470afSJim Cownie #endif
11415e8470afSJim Cownie 
11425e8470afSJim Cownie       test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
11435e8470afSJim Cownie     }
11445e8470afSJim Cownie     //        }
11455e8470afSJim Cownie   }
11465e8470afSJim Cownie   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
11475e8470afSJim Cownie }
11485e8470afSJim Cownie 
11495e8470afSJim Cownie #endif /* KMP_GOMP_COMPAT */
11505e8470afSJim Cownie 
11515e8470afSJim Cownie template <typename T>
115239ada854SJonathan Peyton int __kmp_dispatch_next_algorithm(int gtid,
115339ada854SJonathan Peyton                                   dispatch_private_info_template<T> *pr,
115439ada854SJonathan Peyton                                   dispatch_shared_info_template<T> volatile *sh,
115539ada854SJonathan Peyton                                   kmp_int32 *p_last, T *p_lb, T *p_ub,
115639ada854SJonathan Peyton                                   typename traits_t<T>::signed_t *p_st, T nproc,
115739ada854SJonathan Peyton                                   T tid) {
11585e8470afSJim Cownie   typedef typename traits_t<T>::unsigned_t UT;
11595e8470afSJim Cownie   typedef typename traits_t<T>::signed_t ST;
11605e8470afSJim Cownie   typedef typename traits_t<T>::floating_t DBL;
116139ada854SJonathan Peyton   int status = 0;
11626b316febSTerry Wilmarth   bool last = false;
116339ada854SJonathan Peyton   T start;
116439ada854SJonathan Peyton   ST incr;
116539ada854SJonathan Peyton   UT limit, trip, init;
11665e8470afSJim Cownie   kmp_info_t *th = __kmp_threads[gtid];
11675e8470afSJim Cownie   kmp_team_t *team = th->th.th_team;
11685e8470afSJim Cownie 
11695e8470afSJim Cownie   KMP_DEBUG_ASSERT(th->th.th_dispatch ==
11705e8470afSJim Cownie                    &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
11715e8470afSJim Cownie   KMP_DEBUG_ASSERT(pr);
11725e8470afSJim Cownie   KMP_DEBUG_ASSERT(sh);
117339ada854SJonathan Peyton   KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
117439ada854SJonathan Peyton #ifdef KMP_DEBUG
117539ada854SJonathan Peyton   {
117639ada854SJonathan Peyton     char *buff;
117739ada854SJonathan Peyton     // create format specifiers before the debug output
117839ada854SJonathan Peyton     buff =
117939ada854SJonathan Peyton         __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
118039ada854SJonathan Peyton                          "sh:%%p nproc:%%%s tid:%%%s\n",
118139ada854SJonathan Peyton                          traits_t<T>::spec, traits_t<T>::spec);
118239ada854SJonathan Peyton     KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
118339ada854SJonathan Peyton     __kmp_str_free(&buff);
118439ada854SJonathan Peyton   }
118539ada854SJonathan Peyton #endif
11865e8470afSJim Cownie 
11875e8470afSJim Cownie   // zero trip count
118839ada854SJonathan Peyton   if (pr->u.p.tc == 0) {
118939ada854SJonathan Peyton     KD_TRACE(10,
119039ada854SJonathan Peyton              ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
119139ada854SJonathan Peyton               "zero status:%d\n",
119239ada854SJonathan Peyton               gtid, status));
119339ada854SJonathan Peyton     return 0;
119439ada854SJonathan Peyton   }
119539ada854SJonathan Peyton 
11965e8470afSJim Cownie   switch (pr->schedule) {
1197429dbc2aSAndrey Churbanov #if (KMP_STATIC_STEAL_ENABLED)
11983041982dSJonathan Peyton   case kmp_sch_static_steal: {
11995e8470afSJim Cownie     T chunk = pr->u.p.parm1;
12005e8470afSJim Cownie 
120139ada854SJonathan Peyton     KD_TRACE(100,
120239ada854SJonathan Peyton              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
12033041982dSJonathan Peyton               gtid));
12045e8470afSJim Cownie 
12055e8470afSJim Cownie     trip = pr->u.p.tc - 1;
12065e8470afSJim Cownie 
120712313d44SJonathan Peyton     if (traits_t<T>::type_size > 4) {
1208429dbc2aSAndrey Churbanov       // use lock for 8-byte and CAS for 4-byte induction
1209429dbc2aSAndrey Churbanov       // variable. TODO (optional): check and use 16-byte CAS
1210abe64360SAndreyChurbanov       kmp_lock_t *lck = pr->u.p.th_steal_lock;
1211429dbc2aSAndrey Churbanov       KMP_DEBUG_ASSERT(lck != NULL);
1212429dbc2aSAndrey Churbanov       if (pr->u.p.count < (UT)pr->u.p.ub) {
1213429dbc2aSAndrey Churbanov         __kmp_acquire_lock(lck, gtid);
1214429dbc2aSAndrey Churbanov         // try to get own chunk of iterations
12155e8470afSJim Cownie         init = (pr->u.p.count)++;
12165e8470afSJim Cownie         status = (init < (UT)pr->u.p.ub);
1217429dbc2aSAndrey Churbanov         __kmp_release_lock(lck, gtid);
12185e8470afSJim Cownie       } else {
1219429dbc2aSAndrey Churbanov         status = 0; // no own chunks
1220429dbc2aSAndrey Churbanov       }
1221429dbc2aSAndrey Churbanov       if (!status) { // try to steal
1222429dbc2aSAndrey Churbanov         kmp_info_t **other_threads = team->t.t_threads;
12236b316febSTerry Wilmarth         T while_limit = pr->u.p.parm3;
12246b316febSTerry Wilmarth         T while_index = 0;
1225abe64360SAndreyChurbanov         T id = pr->u.p.static_steal_counter; // loop id
1226abe64360SAndreyChurbanov         int idx = (th->th.th_dispatch->th_disp_index - 1) %
1227abe64360SAndreyChurbanov                   __kmp_dispatch_num_buffers; // current loop index
1228abe64360SAndreyChurbanov         // note: victim thread can potentially execute another loop
1229429dbc2aSAndrey Churbanov         // TODO: algorithm of searching for a victim
1230429dbc2aSAndrey Churbanov         // should be cleaned up and measured
1231429dbc2aSAndrey Churbanov         while ((!status) && (while_limit != ++while_index)) {
1232abe64360SAndreyChurbanov           dispatch_private_info_template<T> *victim;
1233429dbc2aSAndrey Churbanov           T remaining;
1234429dbc2aSAndrey Churbanov           T victimIdx = pr->u.p.parm4;
1235429dbc2aSAndrey Churbanov           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1236abe64360SAndreyChurbanov           victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1237abe64360SAndreyChurbanov               &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1238abe64360SAndreyChurbanov           KMP_DEBUG_ASSERT(victim);
1239abe64360SAndreyChurbanov           while ((victim == pr || id != victim->u.p.static_steal_counter) &&
12403041982dSJonathan Peyton                  oldVictimIdx != victimIdx) {
1241429dbc2aSAndrey Churbanov             victimIdx = (victimIdx + 1) % nproc;
12423041982dSJonathan Peyton             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1243abe64360SAndreyChurbanov                 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1244abe64360SAndreyChurbanov             KMP_DEBUG_ASSERT(victim);
1245bd3a7633SJonathan Peyton           }
1246abe64360SAndreyChurbanov           if (victim == pr || id != victim->u.p.static_steal_counter) {
1247429dbc2aSAndrey Churbanov             continue; // try once more (nproc attempts in total)
1248429dbc2aSAndrey Churbanov             // no victim is ready yet to participate in stealing
1249abe64360SAndreyChurbanov             // because no victim passed kmp_init_dispatch yet
1250429dbc2aSAndrey Churbanov           }
1251429dbc2aSAndrey Churbanov           if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1252429dbc2aSAndrey Churbanov             pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1253429dbc2aSAndrey Churbanov             continue; // not enough chunks to steal, goto next victim
1254429dbc2aSAndrey Churbanov           }
1255429dbc2aSAndrey Churbanov 
1256abe64360SAndreyChurbanov           lck = victim->u.p.th_steal_lock;
1257429dbc2aSAndrey Churbanov           KMP_ASSERT(lck != NULL);
1258429dbc2aSAndrey Churbanov           __kmp_acquire_lock(lck, gtid);
1259429dbc2aSAndrey Churbanov           limit = victim->u.p.ub; // keep initial ub
1260429dbc2aSAndrey Churbanov           if (victim->u.p.count >= limit ||
12613041982dSJonathan Peyton               (remaining = limit - victim->u.p.count) < 2) {
1262429dbc2aSAndrey Churbanov             __kmp_release_lock(lck, gtid);
1263429dbc2aSAndrey Churbanov             pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1264429dbc2aSAndrey Churbanov             continue; // not enough chunks to steal
1265429dbc2aSAndrey Churbanov           }
126642016791SKazuaki Ishizaki           // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or
126739ada854SJonathan Peyton           // by 1
1268429dbc2aSAndrey Churbanov           if (remaining > 3) {
126939ada854SJonathan Peyton             // steal 1/4 of remaining
1270f0682ac4SJonathan Peyton             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
127139ada854SJonathan Peyton             init = (victim->u.p.ub -= (remaining >> 2));
1272429dbc2aSAndrey Churbanov           } else {
127339ada854SJonathan Peyton             // steal 1 chunk of 2 or 3 remaining
1274f0682ac4SJonathan Peyton             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
127539ada854SJonathan Peyton             init = (victim->u.p.ub -= 1);
1276429dbc2aSAndrey Churbanov           }
1277429dbc2aSAndrey Churbanov           __kmp_release_lock(lck, gtid);
1278429dbc2aSAndrey Churbanov 
1279429dbc2aSAndrey Churbanov           KMP_DEBUG_ASSERT(init + 1 <= limit);
1280429dbc2aSAndrey Churbanov           pr->u.p.parm4 = victimIdx; // remember victim to steal from
1281429dbc2aSAndrey Churbanov           status = 1;
1282429dbc2aSAndrey Churbanov           while_index = 0;
1283429dbc2aSAndrey Churbanov           // now update own count and ub with stolen range but init chunk
1284abe64360SAndreyChurbanov           __kmp_acquire_lock(pr->u.p.th_steal_lock, gtid);
1285429dbc2aSAndrey Churbanov           pr->u.p.count = init + 1;
1286429dbc2aSAndrey Churbanov           pr->u.p.ub = limit;
1287abe64360SAndreyChurbanov           __kmp_release_lock(pr->u.p.th_steal_lock, gtid);
1288429dbc2aSAndrey Churbanov         } // while (search for victim)
1289429dbc2aSAndrey Churbanov       } // if (try to find victim and steal)
1290429dbc2aSAndrey Churbanov     } else {
1291429dbc2aSAndrey Churbanov       // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
12925e8470afSJim Cownie       typedef union {
12935e8470afSJim Cownie         struct {
12945e8470afSJim Cownie           UT count;
12955e8470afSJim Cownie           T ub;
12965e8470afSJim Cownie         } p;
12975e8470afSJim Cownie         kmp_int64 b;
12985e8470afSJim Cownie       } union_i4;
12993041982dSJonathan Peyton       // All operations on 'count' or 'ub' must be combined atomically
13003041982dSJonathan Peyton       // together.
13015e8470afSJim Cownie       {
13025e8470afSJim Cownie         union_i4 vold, vnew;
13035e8470afSJim Cownie         vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
13045e8470afSJim Cownie         vnew = vold;
13055e8470afSJim Cownie         vnew.p.count++;
13065e8470afSJim Cownie         while (!KMP_COMPARE_AND_STORE_ACQ64(
13075e8470afSJim Cownie             (volatile kmp_int64 *)&pr->u.p.count,
13085e8470afSJim Cownie             *VOLATILE_CAST(kmp_int64 *) & vold.b,
13095e8470afSJim Cownie             *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
13105e8470afSJim Cownie           KMP_CPU_PAUSE();
13115e8470afSJim Cownie           vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
13125e8470afSJim Cownie           vnew = vold;
13135e8470afSJim Cownie           vnew.p.count++;
13145e8470afSJim Cownie         }
13155e8470afSJim Cownie         vnew = vold;
13165e8470afSJim Cownie         init = vnew.p.count;
13175e8470afSJim Cownie         status = (init < (UT)vnew.p.ub);
13185e8470afSJim Cownie       }
13195e8470afSJim Cownie 
13205e8470afSJim Cownie       if (!status) {
13215e8470afSJim Cownie         kmp_info_t **other_threads = team->t.t_threads;
13226b316febSTerry Wilmarth         T while_limit = pr->u.p.parm3;
13236b316febSTerry Wilmarth         T while_index = 0;
1324abe64360SAndreyChurbanov         T id = pr->u.p.static_steal_counter; // loop id
1325abe64360SAndreyChurbanov         int idx = (th->th.th_dispatch->th_disp_index - 1) %
1326abe64360SAndreyChurbanov                   __kmp_dispatch_num_buffers; // current loop index
1327abe64360SAndreyChurbanov         // note: victim thread can potentially execute another loop
13285e8470afSJim Cownie         // TODO: algorithm of searching for a victim
13295e8470afSJim Cownie         // should be cleaned up and measured
13305e8470afSJim Cownie         while ((!status) && (while_limit != ++while_index)) {
1331abe64360SAndreyChurbanov           dispatch_private_info_template<T> *victim;
13325e8470afSJim Cownie           union_i4 vold, vnew;
13336b316febSTerry Wilmarth           T remaining;
13345e8470afSJim Cownie           T victimIdx = pr->u.p.parm4;
1335429dbc2aSAndrey Churbanov           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1336abe64360SAndreyChurbanov           victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1337abe64360SAndreyChurbanov               &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1338abe64360SAndreyChurbanov           KMP_DEBUG_ASSERT(victim);
1339abe64360SAndreyChurbanov           while ((victim == pr || id != victim->u.p.static_steal_counter) &&
13403041982dSJonathan Peyton                  oldVictimIdx != victimIdx) {
1341429dbc2aSAndrey Churbanov             victimIdx = (victimIdx + 1) % nproc;
13423041982dSJonathan Peyton             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1343abe64360SAndreyChurbanov                 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1344abe64360SAndreyChurbanov             KMP_DEBUG_ASSERT(victim);
1345bd3a7633SJonathan Peyton           }
1346abe64360SAndreyChurbanov           if (victim == pr || id != victim->u.p.static_steal_counter) {
1347429dbc2aSAndrey Churbanov             continue; // try once more (nproc attempts in total)
1348429dbc2aSAndrey Churbanov             // no victim is ready yet to participate in stealing
1349abe64360SAndreyChurbanov             // because no victim passed kmp_init_dispatch yet
13505e8470afSJim Cownie           }
1351429dbc2aSAndrey Churbanov           pr->u.p.parm4 = victimIdx; // new victim found
1352429dbc2aSAndrey Churbanov           while (1) { // CAS loop if victim has enough chunks to steal
13535e8470afSJim Cownie             vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
13545e8470afSJim Cownie             vnew = vold;
13555e8470afSJim Cownie 
13565e8470afSJim Cownie             KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1357429dbc2aSAndrey Churbanov             if (vnew.p.count >= (UT)vnew.p.ub ||
13583041982dSJonathan Peyton                 (remaining = vnew.p.ub - vnew.p.count) < 2) {
135939ada854SJonathan Peyton               pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1360429dbc2aSAndrey Churbanov               break; // not enough chunks to steal, goto next victim
13615e8470afSJim Cownie             }
1362429dbc2aSAndrey Churbanov             if (remaining > 3) {
13636b316febSTerry Wilmarth               // try to steal 1/4 of remaining
13646b316febSTerry Wilmarth               vnew.p.ub -= remaining >> 2;
1365429dbc2aSAndrey Churbanov             } else {
1366429dbc2aSAndrey Churbanov               vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1367429dbc2aSAndrey Churbanov             }
13685e8470afSJim Cownie             KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
13695e8470afSJim Cownie             // TODO: Should this be acquire or release?
13705e8470afSJim Cownie             if (KMP_COMPARE_AND_STORE_ACQ64(
13715e8470afSJim Cownie                     (volatile kmp_int64 *)&victim->u.p.count,
13725e8470afSJim Cownie                     *VOLATILE_CAST(kmp_int64 *) & vold.b,
13735e8470afSJim Cownie                     *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
137442016791SKazuaki Ishizaki               // stealing succeeded
1375f0682ac4SJonathan Peyton               KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1376f0682ac4SJonathan Peyton                                         vold.p.ub - vnew.p.ub);
13775e8470afSJim Cownie               status = 1;
13785e8470afSJim Cownie               while_index = 0;
13795e8470afSJim Cownie               // now update own count and ub
13805e8470afSJim Cownie               init = vnew.p.ub;
13815e8470afSJim Cownie               vold.p.count = init + 1;
1382429dbc2aSAndrey Churbanov #if KMP_ARCH_X86
138339ada854SJonathan Peyton               KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1384429dbc2aSAndrey Churbanov #else
13855e8470afSJim Cownie               *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1386429dbc2aSAndrey Churbanov #endif
13875e8470afSJim Cownie               break;
1388429dbc2aSAndrey Churbanov             } // if (check CAS result)
138942016791SKazuaki Ishizaki             KMP_CPU_PAUSE(); // CAS failed, repeatedly attempt
1390429dbc2aSAndrey Churbanov           } // while (try to steal from particular victim)
1391429dbc2aSAndrey Churbanov         } // while (search for victim)
1392429dbc2aSAndrey Churbanov       } // if (try to find victim and steal)
1393429dbc2aSAndrey Churbanov     } // if (4-byte induction variable)
13945e8470afSJim Cownie     if (!status) {
13955e8470afSJim Cownie       *p_lb = 0;
13965e8470afSJim Cownie       *p_ub = 0;
13973041982dSJonathan Peyton       if (p_st != NULL)
13983041982dSJonathan Peyton         *p_st = 0;
13995e8470afSJim Cownie     } else {
14005e8470afSJim Cownie       start = pr->u.p.parm2;
14015e8470afSJim Cownie       init *= chunk;
14025e8470afSJim Cownie       limit = chunk + init - 1;
14035e8470afSJim Cownie       incr = pr->u.p.st;
1404f0682ac4SJonathan Peyton       KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
14055e8470afSJim Cownie 
14065e8470afSJim Cownie       KMP_DEBUG_ASSERT(init <= trip);
14075e8470afSJim Cownie       if ((last = (limit >= trip)) != 0)
14085e8470afSJim Cownie         limit = trip;
14093041982dSJonathan Peyton       if (p_st != NULL)
14103041982dSJonathan Peyton         *p_st = incr;
14115e8470afSJim Cownie 
14125e8470afSJim Cownie       if (incr == 1) {
14135e8470afSJim Cownie         *p_lb = start + init;
14145e8470afSJim Cownie         *p_ub = start + limit;
14155e8470afSJim Cownie       } else {
14165e8470afSJim Cownie         *p_lb = start + init * incr;
14175e8470afSJim Cownie         *p_ub = start + limit * incr;
14185e8470afSJim Cownie       }
14195e8470afSJim Cownie 
142039ada854SJonathan Peyton       if (pr->flags.ordered) {
14215e8470afSJim Cownie         pr->u.p.ordered_lower = init;
14225e8470afSJim Cownie         pr->u.p.ordered_upper = limit;
14235e8470afSJim Cownie       } // if
14245e8470afSJim Cownie     } // if
14255e8470afSJim Cownie     break;
14265e8470afSJim Cownie   } // case
1427429dbc2aSAndrey Churbanov #endif // ( KMP_STATIC_STEAL_ENABLED )
14283041982dSJonathan Peyton   case kmp_sch_static_balanced: {
14293041982dSJonathan Peyton     KD_TRACE(
143039ada854SJonathan Peyton         10,
143139ada854SJonathan Peyton         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
143239ada854SJonathan Peyton          gtid));
143339ada854SJonathan Peyton     /* check if thread has any iteration to do */
143439ada854SJonathan Peyton     if ((status = !pr->u.p.count) != 0) {
14355e8470afSJim Cownie       pr->u.p.count = 1;
14365e8470afSJim Cownie       *p_lb = pr->u.p.lb;
14375e8470afSJim Cownie       *p_ub = pr->u.p.ub;
14386b316febSTerry Wilmarth       last = (pr->u.p.parm1 != 0);
14394cc4bb4cSJim Cownie       if (p_st != NULL)
14405e8470afSJim Cownie         *p_st = pr->u.p.st;
14415e8470afSJim Cownie     } else { /* no iterations to do */
14425e8470afSJim Cownie       pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
14435e8470afSJim Cownie     }
14445e8470afSJim Cownie   } // case
14455e8470afSJim Cownie   break;
14463041982dSJonathan Peyton   case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
14473041982dSJonathan Peyton                                  merged here */
14483041982dSJonathan Peyton   case kmp_sch_static_chunked: {
14495e8470afSJim Cownie     T parm1;
14505e8470afSJim Cownie 
145139ada854SJonathan Peyton     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
14523041982dSJonathan Peyton                    "kmp_sch_static_[affinity|chunked] case\n",
14535e8470afSJim Cownie                    gtid));
14545e8470afSJim Cownie     parm1 = pr->u.p.parm1;
14555e8470afSJim Cownie 
14565e8470afSJim Cownie     trip = pr->u.p.tc - 1;
145739ada854SJonathan Peyton     init = parm1 * (pr->u.p.count + tid);
14585e8470afSJim Cownie 
14595e8470afSJim Cownie     if ((status = (init <= trip)) != 0) {
14605e8470afSJim Cownie       start = pr->u.p.lb;
14615e8470afSJim Cownie       incr = pr->u.p.st;
14625e8470afSJim Cownie       limit = parm1 + init - 1;
14635e8470afSJim Cownie 
14645e8470afSJim Cownie       if ((last = (limit >= trip)) != 0)
14655e8470afSJim Cownie         limit = trip;
14665e8470afSJim Cownie 
14673041982dSJonathan Peyton       if (p_st != NULL)
14683041982dSJonathan Peyton         *p_st = incr;
14695e8470afSJim Cownie 
147039ada854SJonathan Peyton       pr->u.p.count += nproc;
14715e8470afSJim Cownie 
14725e8470afSJim Cownie       if (incr == 1) {
14735e8470afSJim Cownie         *p_lb = start + init;
14745e8470afSJim Cownie         *p_ub = start + limit;
14753041982dSJonathan Peyton       } else {
14765e8470afSJim Cownie         *p_lb = start + init * incr;
14775e8470afSJim Cownie         *p_ub = start + limit * incr;
14785e8470afSJim Cownie       }
14795e8470afSJim Cownie 
148039ada854SJonathan Peyton       if (pr->flags.ordered) {
14815e8470afSJim Cownie         pr->u.p.ordered_lower = init;
14825e8470afSJim Cownie         pr->u.p.ordered_upper = limit;
14835e8470afSJim Cownie       } // if
14845e8470afSJim Cownie     } // if
14855e8470afSJim Cownie   } // case
14865e8470afSJim Cownie   break;
14875e8470afSJim Cownie 
14883041982dSJonathan Peyton   case kmp_sch_dynamic_chunked: {
14895e8470afSJim Cownie     T chunk = pr->u.p.parm1;
14905e8470afSJim Cownie 
14913041982dSJonathan Peyton     KD_TRACE(
14923041982dSJonathan Peyton         100,
149339ada854SJonathan Peyton         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
149439ada854SJonathan Peyton          gtid));
14955e8470afSJim Cownie 
14965e8470afSJim Cownie     init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
14975e8470afSJim Cownie     trip = pr->u.p.tc - 1;
14985e8470afSJim Cownie 
14995e8470afSJim Cownie     if ((status = (init <= trip)) == 0) {
15005e8470afSJim Cownie       *p_lb = 0;
15015e8470afSJim Cownie       *p_ub = 0;
15023041982dSJonathan Peyton       if (p_st != NULL)
15033041982dSJonathan Peyton         *p_st = 0;
15045e8470afSJim Cownie     } else {
15055e8470afSJim Cownie       start = pr->u.p.lb;
15065e8470afSJim Cownie       limit = chunk + init - 1;
15075e8470afSJim Cownie       incr = pr->u.p.st;
15085e8470afSJim Cownie 
15095e8470afSJim Cownie       if ((last = (limit >= trip)) != 0)
15105e8470afSJim Cownie         limit = trip;
15114cc4bb4cSJim Cownie 
15123041982dSJonathan Peyton       if (p_st != NULL)
15133041982dSJonathan Peyton         *p_st = incr;
15145e8470afSJim Cownie 
15155e8470afSJim Cownie       if (incr == 1) {
15165e8470afSJim Cownie         *p_lb = start + init;
15175e8470afSJim Cownie         *p_ub = start + limit;
15185e8470afSJim Cownie       } else {
15195e8470afSJim Cownie         *p_lb = start + init * incr;
15205e8470afSJim Cownie         *p_ub = start + limit * incr;
15215e8470afSJim Cownie       }
15225e8470afSJim Cownie 
152339ada854SJonathan Peyton       if (pr->flags.ordered) {
15245e8470afSJim Cownie         pr->u.p.ordered_lower = init;
15255e8470afSJim Cownie         pr->u.p.ordered_upper = limit;
15265e8470afSJim Cownie       } // if
15275e8470afSJim Cownie     } // if
15285e8470afSJim Cownie   } // case
15295e8470afSJim Cownie   break;
15305e8470afSJim Cownie 
15313041982dSJonathan Peyton   case kmp_sch_guided_iterative_chunked: {
15325e8470afSJim Cownie     T chunkspec = pr->u.p.parm1;
153339ada854SJonathan Peyton     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
15343041982dSJonathan Peyton                    "iterative case\n",
15353041982dSJonathan Peyton                    gtid));
15365e8470afSJim Cownie     trip = pr->u.p.tc;
15375e8470afSJim Cownie     // Start atomic part of calculations
15385e8470afSJim Cownie     while (1) {
15395e8470afSJim Cownie       ST remaining; // signed, because can be < 0
15405e8470afSJim Cownie       init = sh->u.s.iteration; // shared value
15415e8470afSJim Cownie       remaining = trip - init;
15425e8470afSJim Cownie       if (remaining <= 0) { // AC: need to compare with 0 first
15435e8470afSJim Cownie         // nothing to do, don't try atomic op
15445e8470afSJim Cownie         status = 0;
15455e8470afSJim Cownie         break;
15465e8470afSJim Cownie       }
15473041982dSJonathan Peyton       if ((T)remaining <
15483041982dSJonathan Peyton           pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
154942016791SKazuaki Ishizaki         // use dynamic-style schedule
15504c6a098aSKazuaki Ishizaki         // atomically increment iterations, get old value
155194a114fcSJonathan Peyton         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
155294a114fcSJonathan Peyton                                  (ST)chunkspec);
15535e8470afSJim Cownie         remaining = trip - init;
15545e8470afSJim Cownie         if (remaining <= 0) {
15555e8470afSJim Cownie           status = 0; // all iterations got by other threads
155639ada854SJonathan Peyton         } else {
155739ada854SJonathan Peyton           // got some iterations to work on
15585e8470afSJim Cownie           status = 1;
15595e8470afSJim Cownie           if ((T)remaining > chunkspec) {
15605e8470afSJim Cownie             limit = init + chunkspec - 1;
15615e8470afSJim Cownie           } else {
15626b316febSTerry Wilmarth             last = true; // the last chunk
15635e8470afSJim Cownie             limit = init + remaining - 1;
15645e8470afSJim Cownie           } // if
15655e8470afSJim Cownie         } // if
15665e8470afSJim Cownie         break;
15675e8470afSJim Cownie       } // if
15686b316febSTerry Wilmarth       limit = init + (UT)((double)remaining *
15696b316febSTerry Wilmarth                           *(double *)&pr->u.p.parm3); // divide by K*nproc
15705ba90c79SAndrey Churbanov       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1571c47afcd9SAndrey Churbanov                                (ST)init, (ST)limit)) {
15725e8470afSJim Cownie         // CAS was successful, chunk obtained
15735e8470afSJim Cownie         status = 1;
15745e8470afSJim Cownie         --limit;
15755e8470afSJim Cownie         break;
15765e8470afSJim Cownie       } // if
15775e8470afSJim Cownie     } // while
15785e8470afSJim Cownie     if (status != 0) {
15795e8470afSJim Cownie       start = pr->u.p.lb;
15805e8470afSJim Cownie       incr = pr->u.p.st;
15815e8470afSJim Cownie       if (p_st != NULL)
15825e8470afSJim Cownie         *p_st = incr;
15835e8470afSJim Cownie       *p_lb = start + init * incr;
15845e8470afSJim Cownie       *p_ub = start + limit * incr;
158539ada854SJonathan Peyton       if (pr->flags.ordered) {
15865e8470afSJim Cownie         pr->u.p.ordered_lower = init;
15875e8470afSJim Cownie         pr->u.p.ordered_upper = limit;
15885e8470afSJim Cownie       } // if
15895e8470afSJim Cownie     } else {
15905e8470afSJim Cownie       *p_lb = 0;
15915e8470afSJim Cownie       *p_ub = 0;
15925e8470afSJim Cownie       if (p_st != NULL)
15935e8470afSJim Cownie         *p_st = 0;
15945e8470afSJim Cownie     } // if
15955e8470afSJim Cownie   } // case
15965e8470afSJim Cownie   break;
15975e8470afSJim Cownie 
1598d454c73cSAndrey Churbanov   case kmp_sch_guided_simd: {
1599d454c73cSAndrey Churbanov     // same as iterative but curr-chunk adjusted to be multiple of given
1600d454c73cSAndrey Churbanov     // chunk
1601d454c73cSAndrey Churbanov     T chunk = pr->u.p.parm1;
160239ada854SJonathan Peyton     KD_TRACE(100,
160339ada854SJonathan Peyton              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1604d454c73cSAndrey Churbanov               gtid));
1605d454c73cSAndrey Churbanov     trip = pr->u.p.tc;
1606d454c73cSAndrey Churbanov     // Start atomic part of calculations
1607d454c73cSAndrey Churbanov     while (1) {
1608d454c73cSAndrey Churbanov       ST remaining; // signed, because can be < 0
1609d454c73cSAndrey Churbanov       init = sh->u.s.iteration; // shared value
1610d454c73cSAndrey Churbanov       remaining = trip - init;
1611d454c73cSAndrey Churbanov       if (remaining <= 0) { // AC: need to compare with 0 first
1612d454c73cSAndrey Churbanov         status = 0; // nothing to do, don't try atomic op
1613d454c73cSAndrey Churbanov         break;
1614d454c73cSAndrey Churbanov       }
1615d454c73cSAndrey Churbanov       KMP_DEBUG_ASSERT(init % chunk == 0);
1616d454c73cSAndrey Churbanov       // compare with K*nproc*(chunk+1), K=2 by default
1617d454c73cSAndrey Churbanov       if ((T)remaining < pr->u.p.parm2) {
161842016791SKazuaki Ishizaki         // use dynamic-style schedule
16194c6a098aSKazuaki Ishizaki         // atomically increment iterations, get old value
162094a114fcSJonathan Peyton         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
162194a114fcSJonathan Peyton                                  (ST)chunk);
1622d454c73cSAndrey Churbanov         remaining = trip - init;
1623d454c73cSAndrey Churbanov         if (remaining <= 0) {
1624d454c73cSAndrey Churbanov           status = 0; // all iterations got by other threads
1625d454c73cSAndrey Churbanov         } else {
1626d454c73cSAndrey Churbanov           // got some iterations to work on
1627d454c73cSAndrey Churbanov           status = 1;
1628d454c73cSAndrey Churbanov           if ((T)remaining > chunk) {
1629d454c73cSAndrey Churbanov             limit = init + chunk - 1;
1630d454c73cSAndrey Churbanov           } else {
16316b316febSTerry Wilmarth             last = true; // the last chunk
1632d454c73cSAndrey Churbanov             limit = init + remaining - 1;
1633d454c73cSAndrey Churbanov           } // if
1634d454c73cSAndrey Churbanov         } // if
1635d454c73cSAndrey Churbanov         break;
1636d454c73cSAndrey Churbanov       } // if
1637d454c73cSAndrey Churbanov       // divide by K*nproc
16386b316febSTerry Wilmarth       UT span;
16396b316febSTerry Wilmarth       __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3),
16406b316febSTerry Wilmarth                          &span);
1641d454c73cSAndrey Churbanov       UT rem = span % chunk;
1642d454c73cSAndrey Churbanov       if (rem) // adjust so that span%chunk == 0
1643d454c73cSAndrey Churbanov         span += chunk - rem;
1644d454c73cSAndrey Churbanov       limit = init + span;
16455ba90c79SAndrey Churbanov       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1646c47afcd9SAndrey Churbanov                                (ST)init, (ST)limit)) {
1647d454c73cSAndrey Churbanov         // CAS was successful, chunk obtained
1648d454c73cSAndrey Churbanov         status = 1;
1649d454c73cSAndrey Churbanov         --limit;
1650d454c73cSAndrey Churbanov         break;
1651d454c73cSAndrey Churbanov       } // if
1652d454c73cSAndrey Churbanov     } // while
1653d454c73cSAndrey Churbanov     if (status != 0) {
1654d454c73cSAndrey Churbanov       start = pr->u.p.lb;
1655d454c73cSAndrey Churbanov       incr = pr->u.p.st;
1656d454c73cSAndrey Churbanov       if (p_st != NULL)
1657d454c73cSAndrey Churbanov         *p_st = incr;
1658d454c73cSAndrey Churbanov       *p_lb = start + init * incr;
1659d454c73cSAndrey Churbanov       *p_ub = start + limit * incr;
166039ada854SJonathan Peyton       if (pr->flags.ordered) {
1661d454c73cSAndrey Churbanov         pr->u.p.ordered_lower = init;
1662d454c73cSAndrey Churbanov         pr->u.p.ordered_upper = limit;
1663d454c73cSAndrey Churbanov       } // if
1664d454c73cSAndrey Churbanov     } else {
1665d454c73cSAndrey Churbanov       *p_lb = 0;
1666d454c73cSAndrey Churbanov       *p_ub = 0;
1667d454c73cSAndrey Churbanov       if (p_st != NULL)
1668d454c73cSAndrey Churbanov         *p_st = 0;
1669d454c73cSAndrey Churbanov     } // if
1670d454c73cSAndrey Churbanov   } // case
1671d454c73cSAndrey Churbanov   break;
1672d454c73cSAndrey Churbanov 
16733041982dSJonathan Peyton   case kmp_sch_guided_analytical_chunked: {
16745e8470afSJim Cownie     T chunkspec = pr->u.p.parm1;
16755e8470afSJim Cownie     UT chunkIdx;
1676f700e9edSAndrey Churbanov #if KMP_USE_X87CONTROL
16775e8470afSJim Cownie     /* for storing original FPCW value for Windows* OS on
16785e8470afSJim Cownie        IA-32 architecture 8-byte version */
16795e8470afSJim Cownie     unsigned int oldFpcw;
1680181b4bb3SJim Cownie     unsigned int fpcwSet = 0;
16815e8470afSJim Cownie #endif
168239ada854SJonathan Peyton     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
168339ada854SJonathan Peyton                    "kmp_sch_guided_analytical_chunked case\n",
16845e8470afSJim Cownie                    gtid));
16855e8470afSJim Cownie 
16865e8470afSJim Cownie     trip = pr->u.p.tc;
16875e8470afSJim Cownie 
168839ada854SJonathan Peyton     KMP_DEBUG_ASSERT(nproc > 1);
168939ada854SJonathan Peyton     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
16905e8470afSJim Cownie 
16913041982dSJonathan Peyton     while (1) { /* this while loop is a safeguard against unexpected zero
16923041982dSJonathan Peyton                    chunk sizes */
16935e8470afSJim Cownie       chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
16945e8470afSJim Cownie       if (chunkIdx >= (UT)pr->u.p.parm2) {
16955e8470afSJim Cownie         --trip;
16965e8470afSJim Cownie         /* use dynamic-style scheduling */
16975e8470afSJim Cownie         init = chunkIdx * chunkspec + pr->u.p.count;
16983041982dSJonathan Peyton         /* need to verify init > 0 in case of overflow in the above
16993041982dSJonathan Peyton          * calculation */
17005e8470afSJim Cownie         if ((status = (init > 0 && init <= trip)) != 0) {
17015e8470afSJim Cownie           limit = init + chunkspec - 1;
17025e8470afSJim Cownie 
17035e8470afSJim Cownie           if ((last = (limit >= trip)) != 0)
17045e8470afSJim Cownie             limit = trip;
17055e8470afSJim Cownie         }
17065e8470afSJim Cownie         break;
17075e8470afSJim Cownie       } else {
17085e8470afSJim Cownie /* use exponential-style scheduling */
17093041982dSJonathan Peyton /* The following check is to workaround the lack of long double precision on
17103041982dSJonathan Peyton    Windows* OS.
17115e8470afSJim Cownie    This check works around the possible effect that init != 0 for chunkIdx == 0.
17125e8470afSJim Cownie  */
1713f700e9edSAndrey Churbanov #if KMP_USE_X87CONTROL
171439ada854SJonathan Peyton         /* If we haven't already done so, save original
171539ada854SJonathan Peyton            FPCW and set precision to 64-bit, as Windows* OS
171639ada854SJonathan Peyton            on IA-32 architecture defaults to 53-bit */
17175e8470afSJim Cownie         if (!fpcwSet) {
1718181b4bb3SJim Cownie           oldFpcw = _control87(0, 0);
1719181b4bb3SJim Cownie           _control87(_PC_64, _MCW_PC);
17205e8470afSJim Cownie           fpcwSet = 0x30000;
17215e8470afSJim Cownie         }
17225e8470afSJim Cownie #endif
17235e8470afSJim Cownie         if (chunkIdx) {
17245e8470afSJim Cownie           init = __kmp_dispatch_guided_remaining<T>(
17255e8470afSJim Cownie               trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
17265e8470afSJim Cownie           KMP_DEBUG_ASSERT(init);
17275e8470afSJim Cownie           init = trip - init;
17285e8470afSJim Cownie         } else
17295e8470afSJim Cownie           init = 0;
17305e8470afSJim Cownie         limit = trip - __kmp_dispatch_guided_remaining<T>(
17315e8470afSJim Cownie                            trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
17325e8470afSJim Cownie         KMP_ASSERT(init <= limit);
17335e8470afSJim Cownie         if (init < limit) {
17345e8470afSJim Cownie           KMP_DEBUG_ASSERT(limit <= trip);
17355e8470afSJim Cownie           --limit;
17365e8470afSJim Cownie           status = 1;
17375e8470afSJim Cownie           break;
17385e8470afSJim Cownie         } // if
17395e8470afSJim Cownie       } // if
17405e8470afSJim Cownie     } // while (1)
1741f700e9edSAndrey Churbanov #if KMP_USE_X87CONTROL
1742181b4bb3SJim Cownie     /* restore FPCW if necessary
174339ada854SJonathan Peyton        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
174439ada854SJonathan Peyton     */
1745181b4bb3SJim Cownie     if (fpcwSet && (oldFpcw & fpcwSet))
1746181b4bb3SJim Cownie       _control87(oldFpcw, _MCW_PC);
17475e8470afSJim Cownie #endif
17485e8470afSJim Cownie     if (status != 0) {
17495e8470afSJim Cownie       start = pr->u.p.lb;
17505e8470afSJim Cownie       incr = pr->u.p.st;
17515e8470afSJim Cownie       if (p_st != NULL)
17525e8470afSJim Cownie         *p_st = incr;
17535e8470afSJim Cownie       *p_lb = start + init * incr;
17545e8470afSJim Cownie       *p_ub = start + limit * incr;
175539ada854SJonathan Peyton       if (pr->flags.ordered) {
17565e8470afSJim Cownie         pr->u.p.ordered_lower = init;
17575e8470afSJim Cownie         pr->u.p.ordered_upper = limit;
17585e8470afSJim Cownie       }
17595e8470afSJim Cownie     } else {
17605e8470afSJim Cownie       *p_lb = 0;
17615e8470afSJim Cownie       *p_ub = 0;
17625e8470afSJim Cownie       if (p_st != NULL)
17635e8470afSJim Cownie         *p_st = 0;
17645e8470afSJim Cownie     }
17655e8470afSJim Cownie   } // case
17665e8470afSJim Cownie   break;
17675e8470afSJim Cownie 
17683041982dSJonathan Peyton   case kmp_sch_trapezoidal: {
17695e8470afSJim Cownie     UT index;
17705e8470afSJim Cownie     T parm2 = pr->u.p.parm2;
17715e8470afSJim Cownie     T parm3 = pr->u.p.parm3;
17725e8470afSJim Cownie     T parm4 = pr->u.p.parm4;
177339ada854SJonathan Peyton     KD_TRACE(100,
177439ada854SJonathan Peyton              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
17755e8470afSJim Cownie               gtid));
17765e8470afSJim Cownie 
17775e8470afSJim Cownie     index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
17785e8470afSJim Cownie 
17795e8470afSJim Cownie     init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
17805e8470afSJim Cownie     trip = pr->u.p.tc - 1;
17815e8470afSJim Cownie 
17825e8470afSJim Cownie     if ((status = ((T)index < parm3 && init <= trip)) == 0) {
17835e8470afSJim Cownie       *p_lb = 0;
17845e8470afSJim Cownie       *p_ub = 0;
17853041982dSJonathan Peyton       if (p_st != NULL)
17863041982dSJonathan Peyton         *p_st = 0;
17875e8470afSJim Cownie     } else {
17885e8470afSJim Cownie       start = pr->u.p.lb;
17895e8470afSJim Cownie       limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
17905e8470afSJim Cownie       incr = pr->u.p.st;
17915e8470afSJim Cownie 
17925e8470afSJim Cownie       if ((last = (limit >= trip)) != 0)
17935e8470afSJim Cownie         limit = trip;
17945e8470afSJim Cownie 
17953041982dSJonathan Peyton       if (p_st != NULL)
17963041982dSJonathan Peyton         *p_st = incr;
17975e8470afSJim Cownie 
17985e8470afSJim Cownie       if (incr == 1) {
17995e8470afSJim Cownie         *p_lb = start + init;
18005e8470afSJim Cownie         *p_ub = start + limit;
18015e8470afSJim Cownie       } else {
18025e8470afSJim Cownie         *p_lb = start + init * incr;
18035e8470afSJim Cownie         *p_ub = start + limit * incr;
18045e8470afSJim Cownie       }
18055e8470afSJim Cownie 
180639ada854SJonathan Peyton       if (pr->flags.ordered) {
180739ada854SJonathan Peyton         pr->u.p.ordered_lower = init;
180839ada854SJonathan Peyton         pr->u.p.ordered_upper = limit;
180939ada854SJonathan Peyton       } // if
181039ada854SJonathan Peyton     } // if
181139ada854SJonathan Peyton   } // case
181239ada854SJonathan Peyton   break;
181339ada854SJonathan Peyton   default: {
181439ada854SJonathan Peyton     status = 0; // to avoid complaints on uninitialized variable use
181539ada854SJonathan Peyton     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
181639ada854SJonathan Peyton                 KMP_HNT(GetNewerLibrary), // Hint
181739ada854SJonathan Peyton                 __kmp_msg_null // Variadic argument list terminator
181839ada854SJonathan Peyton     );
181939ada854SJonathan Peyton   } break;
182039ada854SJonathan Peyton   } // switch
182139ada854SJonathan Peyton   if (p_last)
182239ada854SJonathan Peyton     *p_last = last;
182339ada854SJonathan Peyton #ifdef KMP_DEBUG
182439ada854SJonathan Peyton   if (pr->flags.ordered) {
182539ada854SJonathan Peyton     char *buff;
182639ada854SJonathan Peyton     // create format specifiers before the debug output
182739ada854SJonathan Peyton     buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
182839ada854SJonathan Peyton                             "ordered_lower:%%%s ordered_upper:%%%s\n",
182939ada854SJonathan Peyton                             traits_t<UT>::spec, traits_t<UT>::spec);
183039ada854SJonathan Peyton     KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
183139ada854SJonathan Peyton     __kmp_str_free(&buff);
183239ada854SJonathan Peyton   }
183339ada854SJonathan Peyton   {
183439ada854SJonathan Peyton     char *buff;
183539ada854SJonathan Peyton     // create format specifiers before the debug output
183639ada854SJonathan Peyton     buff = __kmp_str_format(
183739ada854SJonathan Peyton         "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
183839ada854SJonathan Peyton         "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
183939ada854SJonathan Peyton         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
184039ada854SJonathan Peyton     KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
184139ada854SJonathan Peyton     __kmp_str_free(&buff);
184239ada854SJonathan Peyton   }
184339ada854SJonathan Peyton #endif
184439ada854SJonathan Peyton   return status;
184539ada854SJonathan Peyton }
184639ada854SJonathan Peyton 
/* OMPT_LOOP_END: statement-like macro used when exiting __kmp_dispatch_next().
   If status is 0 (no more work) and a tool has registered the work callback,
   it reports the end of the worksharing loop (ompt_work_loop, ompt_scope_end)
   to OMPT. This is done here because in some cases kmp_dispatch_fini() is not
   called.

   The macro expands in the caller's scope and requires these names to be
   visible there:
     status  - result of the dispatch; the callback fires only when it is 0
     codeptr - code pointer forwarded as the last callback argument

   The body is wrapped in do { ... } while (0) so that `OMPT_LOOP_END;` is
   always exactly one statement, even in an unbraced if/else. When OMPT
   support is compiled out the macro expands to nothing. */
#if OMPT_SUPPORT && OMPT_OPTIONAL
#define OMPT_LOOP_END                                                          \
  do {                                                                         \
    if (status == 0 && ompt_enabled.ompt_callback_work) {                      \
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
      ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
          ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
          &(task_info->task_data), 0, codeptr);                                \
    }                                                                          \
  } while (0)
// TODO: implement count
#else
#define OMPT_LOOP_END // no-op
#endif
186539ada854SJonathan Peyton 
/* KMP_STATS_LOOP_END: statement-like macro that feeds the
   OMP_loop_dynamic_iterations counter with the number of iterations in the
   chunk described by *p_lb, *p_ub and the stride pr->u.p.st.

   The macro expands in the caller's scope and requires these names to be
   visible there: status, p_lb, p_ub and pr.

   Iteration count t:
     status == 0  -> 0, and the current partitioned timer is popped
     stride == 1  -> u - l + 1            (0 if u < l)
     stride <  0  -> (l - u) / -stride + 1 (0 if l < u)
     otherwise    -> (u - l) / stride + 1  (0 if u < l)

   The body is wrapped in do { ... } while (0) so that `KMP_STATS_LOOP_END;`
   is always exactly one statement, even in an unbraced if/else. When
   statistics are disabled the macro expands to nothing. */
#if KMP_STATS_ENABLED
#define KMP_STATS_LOOP_END                                                     \
  do {                                                                         \
    kmp_int64 u, l, t, i;                                                      \
    l = (kmp_int64)(*p_lb);                                                    \
    u = (kmp_int64)(*p_ub);                                                    \
    i = (kmp_int64)(pr->u.p.st);                                               \
    if (status == 0) {                                                         \
      t = 0;                                                                   \
      KMP_POP_PARTITIONED_TIMER();                                             \
    } else if (i == 1) {                                                       \
      if (u >= l)                                                              \
        t = u - l + 1;                                                         \
      else                                                                     \
        t = 0;                                                                 \
    } else if (i < 0) {                                                        \
      if (l >= u)                                                              \
        t = (l - u) / (-i) + 1;                                                \
      else                                                                     \
        t = 0;                                                                 \
    } else {                                                                   \
      if (u >= l)                                                              \
        t = (u - l) / i + 1;                                                   \
      else                                                                     \
        t = 0;                                                                 \
    }                                                                          \
    KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
  } while (0)
#else
#define KMP_STATS_LOOP_END /* Nothing */
#endif
1897f0682ac4SJonathan Peyton 
189839ada854SJonathan Peyton template <typename T>
189939ada854SJonathan Peyton static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
190039ada854SJonathan Peyton                                T *p_lb, T *p_ub,
190139ada854SJonathan Peyton                                typename traits_t<T>::signed_t *p_st
190239ada854SJonathan Peyton #if OMPT_SUPPORT && OMPT_OPTIONAL
190339ada854SJonathan Peyton                                ,
190439ada854SJonathan Peyton                                void *codeptr
190539ada854SJonathan Peyton #endif
190639ada854SJonathan Peyton ) {
190739ada854SJonathan Peyton 
190839ada854SJonathan Peyton   typedef typename traits_t<T>::unsigned_t UT;
190939ada854SJonathan Peyton   typedef typename traits_t<T>::signed_t ST;
191039ada854SJonathan Peyton   // This is potentially slightly misleading, schedule(runtime) will appear here
191142016791SKazuaki Ishizaki   // even if the actual runtime schedule is static. (Which points out a
19124c6a098aSKazuaki Ishizaki   // disadvantage of schedule(runtime): even when static scheduling is used it
191339ada854SJonathan Peyton   // costs more than a compile time choice to use static scheduling would.)
1914f0682ac4SJonathan Peyton   KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
191539ada854SJonathan Peyton 
191639ada854SJonathan Peyton   int status;
191739ada854SJonathan Peyton   dispatch_private_info_template<T> *pr;
1918787eb0c6SAndreyChurbanov   __kmp_assert_valid_gtid(gtid);
191939ada854SJonathan Peyton   kmp_info_t *th = __kmp_threads[gtid];
192039ada854SJonathan Peyton   kmp_team_t *team = th->th.th_team;
192139ada854SJonathan Peyton 
192239ada854SJonathan Peyton   KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
192339ada854SJonathan Peyton   KD_TRACE(
192439ada854SJonathan Peyton       1000,
192539ada854SJonathan Peyton       ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
192639ada854SJonathan Peyton        gtid, p_lb, p_ub, p_st, p_last));
192739ada854SJonathan Peyton 
192839ada854SJonathan Peyton   if (team->t.t_serialized) {
192942016791SKazuaki Ishizaki     /* NOTE: serialize this dispatch because we are not at the active level */
193039ada854SJonathan Peyton     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
193139ada854SJonathan Peyton         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
193239ada854SJonathan Peyton     KMP_DEBUG_ASSERT(pr);
193339ada854SJonathan Peyton 
193439ada854SJonathan Peyton     if ((status = (pr->u.p.tc != 0)) == 0) {
193539ada854SJonathan Peyton       *p_lb = 0;
193639ada854SJonathan Peyton       *p_ub = 0;
193739ada854SJonathan Peyton       //            if ( p_last != NULL )
193839ada854SJonathan Peyton       //                *p_last = 0;
193939ada854SJonathan Peyton       if (p_st != NULL)
194039ada854SJonathan Peyton         *p_st = 0;
194139ada854SJonathan Peyton       if (__kmp_env_consistency_check) {
194239ada854SJonathan Peyton         if (pr->pushed_ws != ct_none) {
194339ada854SJonathan Peyton           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
194439ada854SJonathan Peyton         }
194539ada854SJonathan Peyton       }
194639ada854SJonathan Peyton     } else if (pr->flags.nomerge) {
194739ada854SJonathan Peyton       kmp_int32 last;
194839ada854SJonathan Peyton       T start;
194939ada854SJonathan Peyton       UT limit, trip, init;
195039ada854SJonathan Peyton       ST incr;
195139ada854SJonathan Peyton       T chunk = pr->u.p.parm1;
195239ada854SJonathan Peyton 
195339ada854SJonathan Peyton       KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
195439ada854SJonathan Peyton                      gtid));
195539ada854SJonathan Peyton 
195639ada854SJonathan Peyton       init = chunk * pr->u.p.count++;
195739ada854SJonathan Peyton       trip = pr->u.p.tc - 1;
195839ada854SJonathan Peyton 
195939ada854SJonathan Peyton       if ((status = (init <= trip)) == 0) {
196039ada854SJonathan Peyton         *p_lb = 0;
196139ada854SJonathan Peyton         *p_ub = 0;
196239ada854SJonathan Peyton         //                if ( p_last != NULL )
196339ada854SJonathan Peyton         //                    *p_last = 0;
196439ada854SJonathan Peyton         if (p_st != NULL)
196539ada854SJonathan Peyton           *p_st = 0;
196639ada854SJonathan Peyton         if (__kmp_env_consistency_check) {
196739ada854SJonathan Peyton           if (pr->pushed_ws != ct_none) {
196839ada854SJonathan Peyton             pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
196939ada854SJonathan Peyton           }
197039ada854SJonathan Peyton         }
197139ada854SJonathan Peyton       } else {
197239ada854SJonathan Peyton         start = pr->u.p.lb;
197339ada854SJonathan Peyton         limit = chunk + init - 1;
197439ada854SJonathan Peyton         incr = pr->u.p.st;
197539ada854SJonathan Peyton 
197639ada854SJonathan Peyton         if ((last = (limit >= trip)) != 0) {
197739ada854SJonathan Peyton           limit = trip;
197839ada854SJonathan Peyton #if KMP_OS_WINDOWS
197939ada854SJonathan Peyton           pr->u.p.last_upper = pr->u.p.ub;
198039ada854SJonathan Peyton #endif /* KMP_OS_WINDOWS */
198139ada854SJonathan Peyton         }
198239ada854SJonathan Peyton         if (p_last != NULL)
198339ada854SJonathan Peyton           *p_last = last;
198439ada854SJonathan Peyton         if (p_st != NULL)
198539ada854SJonathan Peyton           *p_st = incr;
198639ada854SJonathan Peyton         if (incr == 1) {
198739ada854SJonathan Peyton           *p_lb = start + init;
198839ada854SJonathan Peyton           *p_ub = start + limit;
198939ada854SJonathan Peyton         } else {
199039ada854SJonathan Peyton           *p_lb = start + init * incr;
199139ada854SJonathan Peyton           *p_ub = start + limit * incr;
199239ada854SJonathan Peyton         }
199339ada854SJonathan Peyton 
199439ada854SJonathan Peyton         if (pr->flags.ordered) {
19955e8470afSJim Cownie           pr->u.p.ordered_lower = init;
19965e8470afSJim Cownie           pr->u.p.ordered_upper = limit;
19975e8470afSJim Cownie #ifdef KMP_DEBUG
19985e8470afSJim Cownie           {
1999aeb40adaSJonas Hahnfeld             char *buff;
20005e8470afSJim Cownie             // create format specifiers before the debug output
20013041982dSJonathan Peyton             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
20023041982dSJonathan Peyton                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
20035e8470afSJim Cownie                                     traits_t<UT>::spec, traits_t<UT>::spec);
20043041982dSJonathan Peyton             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
20053041982dSJonathan Peyton                             pr->u.p.ordered_upper));
20065e8470afSJim Cownie             __kmp_str_free(&buff);
20075e8470afSJim Cownie           }
20085e8470afSJim Cownie #endif
20095e8470afSJim Cownie         } // if
20105e8470afSJim Cownie       } // if
201139ada854SJonathan Peyton     } else {
201239ada854SJonathan Peyton       pr->u.p.tc = 0;
201339ada854SJonathan Peyton       *p_lb = pr->u.p.lb;
201439ada854SJonathan Peyton       *p_ub = pr->u.p.ub;
201539ada854SJonathan Peyton #if KMP_OS_WINDOWS
201639ada854SJonathan Peyton       pr->u.p.last_upper = *p_ub;
201739ada854SJonathan Peyton #endif /* KMP_OS_WINDOWS */
201839ada854SJonathan Peyton       if (p_last != NULL)
201939ada854SJonathan Peyton         *p_last = TRUE;
202039ada854SJonathan Peyton       if (p_st != NULL)
202139ada854SJonathan Peyton         *p_st = pr->u.p.st;
202239ada854SJonathan Peyton     } // if
202339ada854SJonathan Peyton #ifdef KMP_DEBUG
202439ada854SJonathan Peyton     {
202539ada854SJonathan Peyton       char *buff;
202639ada854SJonathan Peyton       // create format specifiers before the debug output
202739ada854SJonathan Peyton       buff = __kmp_str_format(
202839ada854SJonathan Peyton           "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
202939ada854SJonathan Peyton           "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
203039ada854SJonathan Peyton           traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2031771f0fb9SPeyton, Jonathan L       KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,
2032771f0fb9SPeyton, Jonathan L                     (p_last ? *p_last : 0), status));
203339ada854SJonathan Peyton       __kmp_str_free(&buff);
203439ada854SJonathan Peyton     }
203539ada854SJonathan Peyton #endif
203639ada854SJonathan Peyton #if INCLUDE_SSC_MARKS
203739ada854SJonathan Peyton     SSC_MARK_DISPATCH_NEXT();
203839ada854SJonathan Peyton #endif
203939ada854SJonathan Peyton     OMPT_LOOP_END;
2040f0682ac4SJonathan Peyton     KMP_STATS_LOOP_END;
204139ada854SJonathan Peyton     return status;
204239ada854SJonathan Peyton   } else {
204339ada854SJonathan Peyton     kmp_int32 last = 0;
204439ada854SJonathan Peyton     dispatch_shared_info_template<T> volatile *sh;
20455e8470afSJim Cownie 
204639ada854SJonathan Peyton     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
204739ada854SJonathan Peyton                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
204839ada854SJonathan Peyton 
204939ada854SJonathan Peyton     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
205039ada854SJonathan Peyton         th->th.th_dispatch->th_dispatch_pr_current);
205139ada854SJonathan Peyton     KMP_DEBUG_ASSERT(pr);
205239ada854SJonathan Peyton     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
205339ada854SJonathan Peyton         th->th.th_dispatch->th_dispatch_sh_current);
205439ada854SJonathan Peyton     KMP_DEBUG_ASSERT(sh);
205539ada854SJonathan Peyton 
2056f6399367SJonathan Peyton #if KMP_USE_HIER_SCHED
2057f6399367SJonathan Peyton     if (pr->flags.use_hier)
2058f6399367SJonathan Peyton       status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2059f6399367SJonathan Peyton     else
2060f6399367SJonathan Peyton #endif // KMP_USE_HIER_SCHED
206139ada854SJonathan Peyton       status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
206239ada854SJonathan Peyton                                                 p_st, th->th.th_team_nproc,
206339ada854SJonathan Peyton                                                 th->th.th_info.ds.ds_tid);
206439ada854SJonathan Peyton     // status == 0: no more iterations to execute
20655e8470afSJim Cownie     if (status == 0) {
20665e8470afSJim Cownie       UT num_done;
20675e8470afSJim Cownie 
20685e8470afSJim Cownie       num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
20695e8470afSJim Cownie #ifdef KMP_DEBUG
20705e8470afSJim Cownie       {
2071aeb40adaSJonas Hahnfeld         char *buff;
20725e8470afSJim Cownie         // create format specifiers before the debug output
20735e8470afSJim Cownie         buff = __kmp_str_format(
20745e8470afSJim Cownie             "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
20755e8470afSJim Cownie             traits_t<UT>::spec);
207639ada854SJonathan Peyton         KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
20775e8470afSJim Cownie         __kmp_str_free(&buff);
20785e8470afSJim Cownie       }
20795e8470afSJim Cownie #endif
20805e8470afSJim Cownie 
2081f6399367SJonathan Peyton #if KMP_USE_HIER_SCHED
2082f6399367SJonathan Peyton       pr->flags.use_hier = FALSE;
2083f6399367SJonathan Peyton #endif
2084ff5ca8b4SJonathan Peyton       if ((ST)num_done == th->th.th_team_nproc - 1) {
2085429dbc2aSAndrey Churbanov #if (KMP_STATIC_STEAL_ENABLED)
20863041982dSJonathan Peyton         if (pr->schedule == kmp_sch_static_steal &&
20873041982dSJonathan Peyton             traits_t<T>::type_size > 4) {
2088429dbc2aSAndrey Churbanov           int i;
2089abe64360SAndreyChurbanov           int idx = (th->th.th_dispatch->th_disp_index - 1) %
2090abe64360SAndreyChurbanov                     __kmp_dispatch_num_buffers; // current loop index
2091429dbc2aSAndrey Churbanov           kmp_info_t **other_threads = team->t.t_threads;
2092429dbc2aSAndrey Churbanov           // loop complete, safe to destroy locks used for stealing
2093429dbc2aSAndrey Churbanov           for (i = 0; i < th->th.th_team_nproc; ++i) {
2094abe64360SAndreyChurbanov             dispatch_private_info_template<T> *buf =
2095abe64360SAndreyChurbanov                 reinterpret_cast<dispatch_private_info_template<T> *>(
2096abe64360SAndreyChurbanov                     &other_threads[i]->th.th_dispatch->th_disp_buffer[idx]);
2097abe64360SAndreyChurbanov             kmp_lock_t *lck = buf->u.p.th_steal_lock;
2098429dbc2aSAndrey Churbanov             KMP_ASSERT(lck != NULL);
2099429dbc2aSAndrey Churbanov             __kmp_destroy_lock(lck);
2100429dbc2aSAndrey Churbanov             __kmp_free(lck);
2101abe64360SAndreyChurbanov             buf->u.p.th_steal_lock = NULL;
2102429dbc2aSAndrey Churbanov           }
2103429dbc2aSAndrey Churbanov         }
2104429dbc2aSAndrey Churbanov #endif
21055e8470afSJim Cownie         /* NOTE: release this buffer to be reused */
21065e8470afSJim Cownie 
21075e8470afSJim Cownie         KMP_MB(); /* Flush all pending memory write invalidates.  */
21085e8470afSJim Cownie 
21095e8470afSJim Cownie         sh->u.s.num_done = 0;
21105e8470afSJim Cownie         sh->u.s.iteration = 0;
21115e8470afSJim Cownie 
21125e8470afSJim Cownie         /* TODO replace with general release procedure? */
211339ada854SJonathan Peyton         if (pr->flags.ordered) {
21145e8470afSJim Cownie           sh->u.s.ordered_iteration = 0;
21155e8470afSJim Cownie         }
21165e8470afSJim Cownie 
21175e8470afSJim Cownie         KMP_MB(); /* Flush all pending memory write invalidates.  */
21185e8470afSJim Cownie 
2119067325f9SJonathan Peyton         sh->buffer_index += __kmp_dispatch_num_buffers;
21205e8470afSJim Cownie         KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
21215e8470afSJim Cownie                        gtid, sh->buffer_index));
21225e8470afSJim Cownie 
21235e8470afSJim Cownie         KMP_MB(); /* Flush all pending memory write invalidates.  */
21245e8470afSJim Cownie 
21255e8470afSJim Cownie       } // if
21265e8470afSJim Cownie       if (__kmp_env_consistency_check) {
21275e8470afSJim Cownie         if (pr->pushed_ws != ct_none) {
21285e8470afSJim Cownie           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
21295e8470afSJim Cownie         }
21305e8470afSJim Cownie       }
21315e8470afSJim Cownie 
21325e8470afSJim Cownie       th->th.th_dispatch->th_deo_fcn = NULL;
21335e8470afSJim Cownie       th->th.th_dispatch->th_dxo_fcn = NULL;
21345e8470afSJim Cownie       th->th.th_dispatch->th_dispatch_sh_current = NULL;
21355e8470afSJim Cownie       th->th.th_dispatch->th_dispatch_pr_current = NULL;
21365e8470afSJim Cownie     } // if (status == 0)
21375e8470afSJim Cownie #if KMP_OS_WINDOWS
21385e8470afSJim Cownie     else if (last) {
21395e8470afSJim Cownie       pr->u.p.last_upper = pr->u.p.ub;
21405e8470afSJim Cownie     }
21415e8470afSJim Cownie #endif /* KMP_OS_WINDOWS */
21424cc4bb4cSJim Cownie     if (p_last != NULL && status != 0)
21434cc4bb4cSJim Cownie       *p_last = last;
21445e8470afSJim Cownie   } // if
21455e8470afSJim Cownie 
21465e8470afSJim Cownie #ifdef KMP_DEBUG
21475e8470afSJim Cownie   {
2148aeb40adaSJonas Hahnfeld     char *buff;
21495e8470afSJim Cownie     // create format specifiers before the debug output
21505e8470afSJim Cownie     buff = __kmp_str_format(
21513041982dSJonathan Peyton         "__kmp_dispatch_next: T#%%d normal case: "
215239ada854SJonathan Peyton         "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
21535e8470afSJim Cownie         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
215439ada854SJonathan Peyton     KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
215539ada854SJonathan Peyton                   (p_last ? *p_last : 0), status));
21565e8470afSJim Cownie     __kmp_str_free(&buff);
21575e8470afSJim Cownie   }
21585e8470afSJim Cownie #endif
21594cc4bb4cSJim Cownie #if INCLUDE_SSC_MARKS
21604cc4bb4cSJim Cownie   SSC_MARK_DISPATCH_NEXT();
21614cc4bb4cSJim Cownie #endif
2162d7d088f8SAndrey Churbanov   OMPT_LOOP_END;
2163f0682ac4SJonathan Peyton   KMP_STATS_LOOP_END;
21645e8470afSJim Cownie   return status;
21655e8470afSJim Cownie }
21665e8470afSJim Cownie 
21674cc4bb4cSJim Cownie template <typename T>
21683041982dSJonathan Peyton static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
21693041982dSJonathan Peyton                                   kmp_int32 *plastiter, T *plower, T *pupper,
21703041982dSJonathan Peyton                                   typename traits_t<T>::signed_t incr) {
21714cc4bb4cSJim Cownie   typedef typename traits_t<T>::unsigned_t UT;
2172414544c9SEd Maste   kmp_uint32 team_id;
2173414544c9SEd Maste   kmp_uint32 nteams;
2174414544c9SEd Maste   UT trip_count;
2175414544c9SEd Maste   kmp_team_t *team;
21764cc4bb4cSJim Cownie   kmp_info_t *th;
21774cc4bb4cSJim Cownie 
21784cc4bb4cSJim Cownie   KMP_DEBUG_ASSERT(plastiter && plower && pupper);
21794cc4bb4cSJim Cownie   KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
21804cc4bb4cSJim Cownie #ifdef KMP_DEBUG
2181baad3f60SJonathan Peyton   typedef typename traits_t<T>::signed_t ST;
21824cc4bb4cSJim Cownie   {
2183aeb40adaSJonas Hahnfeld     char *buff;
21844cc4bb4cSJim Cownie     // create format specifiers before the debug output
21853041982dSJonathan Peyton     buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
21864cc4bb4cSJim Cownie                             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
21873041982dSJonathan Peyton                             traits_t<T>::spec, traits_t<T>::spec,
21883041982dSJonathan Peyton                             traits_t<ST>::spec, traits_t<T>::spec);
21894cc4bb4cSJim Cownie     KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
21904cc4bb4cSJim Cownie     __kmp_str_free(&buff);
21914cc4bb4cSJim Cownie   }
21924cc4bb4cSJim Cownie #endif
21934cc4bb4cSJim Cownie 
21944cc4bb4cSJim Cownie   if (__kmp_env_consistency_check) {
21954cc4bb4cSJim Cownie     if (incr == 0) {
21963041982dSJonathan Peyton       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
21973041982dSJonathan Peyton                             loc);
21984cc4bb4cSJim Cownie     }
21994cc4bb4cSJim Cownie     if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
22004cc4bb4cSJim Cownie       // The loop is illegal.
22014cc4bb4cSJim Cownie       // Some zero-trip loops maintained by compiler, e.g.:
22024cc4bb4cSJim Cownie       //   for(i=10;i<0;++i) // lower >= upper - run-time check
22034cc4bb4cSJim Cownie       //   for(i=0;i>10;--i) // lower <= upper - run-time check
22044cc4bb4cSJim Cownie       //   for(i=0;i>10;++i) // incr > 0       - compile-time check
22054cc4bb4cSJim Cownie       //   for(i=10;i<0;--i) // incr < 0       - compile-time check
22064cc4bb4cSJim Cownie       // Compiler does not check the following illegal loops:
22074cc4bb4cSJim Cownie       //   for(i=0;i<10;i+=incr) // where incr<0
22084cc4bb4cSJim Cownie       //   for(i=10;i>0;i-=incr) // where incr<0
22094cc4bb4cSJim Cownie       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
22104cc4bb4cSJim Cownie     }
22114cc4bb4cSJim Cownie   }
2212787eb0c6SAndreyChurbanov   __kmp_assert_valid_gtid(gtid);
22134cc4bb4cSJim Cownie   th = __kmp_threads[gtid];
22144cc4bb4cSJim Cownie   team = th->th.th_team;
2215441f3376SJonathan Peyton   KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
22164cc4bb4cSJim Cownie   nteams = th->th.th_teams_size.nteams;
22174cc4bb4cSJim Cownie   team_id = team->t.t_master_tid;
2218baad3f60SJonathan Peyton   KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
22194cc4bb4cSJim Cownie 
22204cc4bb4cSJim Cownie   // compute global trip count
22214cc4bb4cSJim Cownie   if (incr == 1) {
22224cc4bb4cSJim Cownie     trip_count = *pupper - *plower + 1;
22234cc4bb4cSJim Cownie   } else if (incr == -1) {
22244cc4bb4cSJim Cownie     trip_count = *plower - *pupper + 1;
22255235a1b6SJonathan Peyton   } else if (incr > 0) {
22265235a1b6SJonathan Peyton     // upper-lower can exceed the limit of signed type
22275235a1b6SJonathan Peyton     trip_count = (UT)(*pupper - *plower) / incr + 1;
22284cc4bb4cSJim Cownie   } else {
22295235a1b6SJonathan Peyton     trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
22304cc4bb4cSJim Cownie   }
223145be4500SJonathan Peyton 
22324cc4bb4cSJim Cownie   if (trip_count <= nteams) {
22334cc4bb4cSJim Cownie     KMP_DEBUG_ASSERT(
22343041982dSJonathan Peyton         __kmp_static == kmp_sch_static_greedy ||
22353041982dSJonathan Peyton         __kmp_static ==
22363041982dSJonathan Peyton             kmp_sch_static_balanced); // Unknown static scheduling type.
22374cc4bb4cSJim Cownie     // only some teams get single iteration, others get nothing
22384cc4bb4cSJim Cownie     if (team_id < trip_count) {
22394cc4bb4cSJim Cownie       *pupper = *plower = *plower + team_id * incr;
22404cc4bb4cSJim Cownie     } else {
22414cc4bb4cSJim Cownie       *plower = *pupper + incr; // zero-trip loop
22424cc4bb4cSJim Cownie     }
22434cc4bb4cSJim Cownie     if (plastiter != NULL)
22444cc4bb4cSJim Cownie       *plastiter = (team_id == trip_count - 1);
22454cc4bb4cSJim Cownie   } else {
22464cc4bb4cSJim Cownie     if (__kmp_static == kmp_sch_static_balanced) {
2247414544c9SEd Maste       UT chunk = trip_count / nteams;
2248414544c9SEd Maste       UT extras = trip_count % nteams;
22493041982dSJonathan Peyton       *plower +=
22503041982dSJonathan Peyton           incr * (team_id * chunk + (team_id < extras ? team_id : extras));
22514cc4bb4cSJim Cownie       *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
22524cc4bb4cSJim Cownie       if (plastiter != NULL)
22534cc4bb4cSJim Cownie         *plastiter = (team_id == nteams - 1);
22544cc4bb4cSJim Cownie     } else {
2255414544c9SEd Maste       T chunk_inc_count =
22564cc4bb4cSJim Cownie           (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2257414544c9SEd Maste       T upper = *pupper;
22584cc4bb4cSJim Cownie       KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
22594cc4bb4cSJim Cownie       // Unknown static scheduling type.
22604cc4bb4cSJim Cownie       *plower += team_id * chunk_inc_count;
22614cc4bb4cSJim Cownie       *pupper = *plower + chunk_inc_count - incr;
22624cc4bb4cSJim Cownie       // Check/correct bounds if needed
22634cc4bb4cSJim Cownie       if (incr > 0) {
22644cc4bb4cSJim Cownie         if (*pupper < *plower)
226512313d44SJonathan Peyton           *pupper = traits_t<T>::max_value;
22664cc4bb4cSJim Cownie         if (plastiter != NULL)
22674cc4bb4cSJim Cownie           *plastiter = *plower <= upper && *pupper > upper - incr;
22684cc4bb4cSJim Cownie         if (*pupper > upper)
22694cc4bb4cSJim Cownie           *pupper = upper; // tracker C73258
22704cc4bb4cSJim Cownie       } else {
22714cc4bb4cSJim Cownie         if (*pupper > *plower)
227212313d44SJonathan Peyton           *pupper = traits_t<T>::min_value;
22734cc4bb4cSJim Cownie         if (plastiter != NULL)
22744cc4bb4cSJim Cownie           *plastiter = *plower >= upper && *pupper < upper - incr;
22754cc4bb4cSJim Cownie         if (*pupper < upper)
22764cc4bb4cSJim Cownie           *pupper = upper; // tracker C73258
22774cc4bb4cSJim Cownie       }
22784cc4bb4cSJim Cownie     }
22794cc4bb4cSJim Cownie   }
22804cc4bb4cSJim Cownie }
22814cc4bb4cSJim Cownie 
22823041982dSJonathan Peyton //-----------------------------------------------------------------------------
22835e8470afSJim Cownie // Dispatch routines
22845e8470afSJim Cownie //    Transfer call to template< type T >
22855e8470afSJim Cownie //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
22865e8470afSJim Cownie //                         T lb, T ub, ST st, ST chunk )
22875e8470afSJim Cownie extern "C" {
22885e8470afSJim Cownie 
22895e8470afSJim Cownie /*!
22905e8470afSJim Cownie @ingroup WORK_SHARING
22915e8470afSJim Cownie @{
22925e8470afSJim Cownie @param loc Source location
22935e8470afSJim Cownie @param gtid Global thread id
22945e8470afSJim Cownie @param schedule Schedule type
22955e8470afSJim Cownie @param lb  Lower bound
22965e8470afSJim Cownie @param ub  Upper bound
22975e8470afSJim Cownie @param st  Step (or increment if you prefer)
22985e8470afSJim Cownie @param chunk The chunk size to block with
22995e8470afSJim Cownie 
23003041982dSJonathan Peyton This function prepares the runtime to start a dynamically scheduled for loop,
23013041982dSJonathan Peyton saving the loop arguments.
23025e8470afSJim Cownie These functions are all identical apart from the types of the arguments.
23035e8470afSJim Cownie */
23045e8470afSJim Cownie 
23053041982dSJonathan Peyton void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
23063041982dSJonathan Peyton                             enum sched_type schedule, kmp_int32 lb,
23073041982dSJonathan Peyton                             kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
23085e8470afSJim Cownie   KMP_DEBUG_ASSERT(__kmp_init_serial);
230982e94a59SJoachim Protze #if OMPT_SUPPORT && OMPT_OPTIONAL
231082e94a59SJoachim Protze   OMPT_STORE_RETURN_ADDRESS(gtid);
231182e94a59SJoachim Protze #endif
23125e8470afSJim Cownie   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
23135e8470afSJim Cownie }
23145e8470afSJim Cownie /*!
23155e8470afSJim Cownie See @ref __kmpc_dispatch_init_4
23165e8470afSJim Cownie */
23173041982dSJonathan Peyton void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
23183041982dSJonathan Peyton                              enum sched_type schedule, kmp_uint32 lb,
23193041982dSJonathan Peyton                              kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
23205e8470afSJim Cownie   KMP_DEBUG_ASSERT(__kmp_init_serial);
232182e94a59SJoachim Protze #if OMPT_SUPPORT && OMPT_OPTIONAL
232282e94a59SJoachim Protze   OMPT_STORE_RETURN_ADDRESS(gtid);
232382e94a59SJoachim Protze #endif
23245e8470afSJim Cownie   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
23255e8470afSJim Cownie }
23265e8470afSJim Cownie 
23275e8470afSJim Cownie /*!
23285e8470afSJim Cownie See @ref __kmpc_dispatch_init_4
23295e8470afSJim Cownie */
23303041982dSJonathan Peyton void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
23313041982dSJonathan Peyton                             enum sched_type schedule, kmp_int64 lb,
23323041982dSJonathan Peyton                             kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
23335e8470afSJim Cownie   KMP_DEBUG_ASSERT(__kmp_init_serial);
233482e94a59SJoachim Protze #if OMPT_SUPPORT && OMPT_OPTIONAL
233582e94a59SJoachim Protze   OMPT_STORE_RETURN_ADDRESS(gtid);
233682e94a59SJoachim Protze #endif
23375e8470afSJim Cownie   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
23385e8470afSJim Cownie }
23395e8470afSJim Cownie 
23405e8470afSJim Cownie /*!
23415e8470afSJim Cownie See @ref __kmpc_dispatch_init_4
23425e8470afSJim Cownie */
23433041982dSJonathan Peyton void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
23443041982dSJonathan Peyton                              enum sched_type schedule, kmp_uint64 lb,
23453041982dSJonathan Peyton                              kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
23465e8470afSJim Cownie   KMP_DEBUG_ASSERT(__kmp_init_serial);
234782e94a59SJoachim Protze #if OMPT_SUPPORT && OMPT_OPTIONAL
234882e94a59SJoachim Protze   OMPT_STORE_RETURN_ADDRESS(gtid);
234982e94a59SJoachim Protze #endif
23505e8470afSJim Cownie   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
23515e8470afSJim Cownie }
23525e8470afSJim Cownie 
23535e8470afSJim Cownie /*!
23544cc4bb4cSJim Cownie See @ref __kmpc_dispatch_init_4
23554cc4bb4cSJim Cownie 
Unlike the __kmpc_dispatch_init family, these functions are called for the
composite distribute parallel for construct, so the per-team iteration space
must be computed before regular iteration dispatching begins.
23594cc4bb4cSJim Cownie 
23604cc4bb4cSJim Cownie These functions are all identical apart from the types of the arguments.
23614cc4bb4cSJim Cownie */
23623041982dSJonathan Peyton void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
23633041982dSJonathan Peyton                                  enum sched_type schedule, kmp_int32 *p_last,
23643041982dSJonathan Peyton                                  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
23653041982dSJonathan Peyton                                  kmp_int32 chunk) {
23664cc4bb4cSJim Cownie   KMP_DEBUG_ASSERT(__kmp_init_serial);
236782e94a59SJoachim Protze #if OMPT_SUPPORT && OMPT_OPTIONAL
236882e94a59SJoachim Protze   OMPT_STORE_RETURN_ADDRESS(gtid);
236982e94a59SJoachim Protze #endif
23704cc4bb4cSJim Cownie   __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
23714cc4bb4cSJim Cownie   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
23724cc4bb4cSJim Cownie }
23734cc4bb4cSJim Cownie 
23743041982dSJonathan Peyton void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
23753041982dSJonathan Peyton                                   enum sched_type schedule, kmp_int32 *p_last,
23763041982dSJonathan Peyton                                   kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
23773041982dSJonathan Peyton                                   kmp_int32 chunk) {
23784cc4bb4cSJim Cownie   KMP_DEBUG_ASSERT(__kmp_init_serial);
237982e94a59SJoachim Protze #if OMPT_SUPPORT && OMPT_OPTIONAL
238082e94a59SJoachim Protze   OMPT_STORE_RETURN_ADDRESS(gtid);
238182e94a59SJoachim Protze #endif
23824cc4bb4cSJim Cownie   __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
23834cc4bb4cSJim Cownie   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
23844cc4bb4cSJim Cownie }
23854cc4bb4cSJim Cownie 
23863041982dSJonathan Peyton void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
23873041982dSJonathan Peyton                                  enum sched_type schedule, kmp_int32 *p_last,
23883041982dSJonathan Peyton                                  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
23893041982dSJonathan Peyton                                  kmp_int64 chunk) {
23904cc4bb4cSJim Cownie   KMP_DEBUG_ASSERT(__kmp_init_serial);
239182e94a59SJoachim Protze #if OMPT_SUPPORT && OMPT_OPTIONAL
239282e94a59SJoachim Protze   OMPT_STORE_RETURN_ADDRESS(gtid);
239382e94a59SJoachim Protze #endif
23944cc4bb4cSJim Cownie   __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
23954cc4bb4cSJim Cownie   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
23964cc4bb4cSJim Cownie }
23974cc4bb4cSJim Cownie 
23983041982dSJonathan Peyton void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
23993041982dSJonathan Peyton                                   enum sched_type schedule, kmp_int32 *p_last,
24003041982dSJonathan Peyton                                   kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
24013041982dSJonathan Peyton                                   kmp_int64 chunk) {
24024cc4bb4cSJim Cownie   KMP_DEBUG_ASSERT(__kmp_init_serial);
240382e94a59SJoachim Protze #if OMPT_SUPPORT && OMPT_OPTIONAL
240482e94a59SJoachim Protze   OMPT_STORE_RETURN_ADDRESS(gtid);
240582e94a59SJoachim Protze #endif
24064cc4bb4cSJim Cownie   __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
24074cc4bb4cSJim Cownie   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
24084cc4bb4cSJim Cownie }
24094cc4bb4cSJim Cownie 
24104cc4bb4cSJim Cownie /*!
24115e8470afSJim Cownie @param loc Source code location
24125e8470afSJim Cownie @param gtid Global thread id
24133041982dSJonathan Peyton @param p_last Pointer to a flag set to one if this is the last chunk or zero
24143041982dSJonathan Peyton otherwise
24155e8470afSJim Cownie @param p_lb   Pointer to the lower bound for the next chunk of work
24165e8470afSJim Cownie @param p_ub   Pointer to the upper bound for the next chunk of work
24175e8470afSJim Cownie @param p_st   Pointer to the stride for the next chunk of work
24185e8470afSJim Cownie @return one if there is work to be done, zero otherwise
24195e8470afSJim Cownie 
24205e8470afSJim Cownie Get the next dynamically allocated chunk of work for this thread.
24215e8470afSJim Cownie If there is no more work, then the lb,ub and stride need not be modified.
24225e8470afSJim Cownie */
24233041982dSJonathan Peyton int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
24243041982dSJonathan Peyton                            kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
242582e94a59SJoachim Protze #if OMPT_SUPPORT && OMPT_OPTIONAL
242682e94a59SJoachim Protze   OMPT_STORE_RETURN_ADDRESS(gtid);
242782e94a59SJoachim Protze #endif
242882e94a59SJoachim Protze   return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
242982e94a59SJoachim Protze #if OMPT_SUPPORT && OMPT_OPTIONAL
243082e94a59SJoachim Protze                                         ,
243182e94a59SJoachim Protze                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
243282e94a59SJoachim Protze #endif
243382e94a59SJoachim Protze   );
24345e8470afSJim Cownie }
24355e8470afSJim Cownie 
24365e8470afSJim Cownie /*!
24375e8470afSJim Cownie See @ref __kmpc_dispatch_next_4
24385e8470afSJim Cownie */
24393041982dSJonathan Peyton int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
24403041982dSJonathan Peyton                             kmp_uint32 *p_lb, kmp_uint32 *p_ub,
24413041982dSJonathan Peyton                             kmp_int32 *p_st) {
244282e94a59SJoachim Protze #if OMPT_SUPPORT && OMPT_OPTIONAL
244382e94a59SJoachim Protze   OMPT_STORE_RETURN_ADDRESS(gtid);
244482e94a59SJoachim Protze #endif
244582e94a59SJoachim Protze   return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
244682e94a59SJoachim Protze #if OMPT_SUPPORT && OMPT_OPTIONAL
244782e94a59SJoachim Protze                                          ,
244882e94a59SJoachim Protze                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
244982e94a59SJoachim Protze #endif
245082e94a59SJoachim Protze   );
24515e8470afSJim Cownie }
24525e8470afSJim Cownie 
24535e8470afSJim Cownie /*!
24545e8470afSJim Cownie See @ref __kmpc_dispatch_next_4
24555e8470afSJim Cownie */
24563041982dSJonathan Peyton int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
24573041982dSJonathan Peyton                            kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
245882e94a59SJoachim Protze #if OMPT_SUPPORT && OMPT_OPTIONAL
245982e94a59SJoachim Protze   OMPT_STORE_RETURN_ADDRESS(gtid);
246082e94a59SJoachim Protze #endif
246182e94a59SJoachim Protze   return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
246282e94a59SJoachim Protze #if OMPT_SUPPORT && OMPT_OPTIONAL
246382e94a59SJoachim Protze                                         ,
246482e94a59SJoachim Protze                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
246582e94a59SJoachim Protze #endif
246682e94a59SJoachim Protze   );
24675e8470afSJim Cownie }
24685e8470afSJim Cownie 
24695e8470afSJim Cownie /*!
24705e8470afSJim Cownie See @ref __kmpc_dispatch_next_4
24715e8470afSJim Cownie */
24723041982dSJonathan Peyton int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
24733041982dSJonathan Peyton                             kmp_uint64 *p_lb, kmp_uint64 *p_ub,
24743041982dSJonathan Peyton                             kmp_int64 *p_st) {
247582e94a59SJoachim Protze #if OMPT_SUPPORT && OMPT_OPTIONAL
247682e94a59SJoachim Protze   OMPT_STORE_RETURN_ADDRESS(gtid);
247782e94a59SJoachim Protze #endif
247882e94a59SJoachim Protze   return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
247982e94a59SJoachim Protze #if OMPT_SUPPORT && OMPT_OPTIONAL
248082e94a59SJoachim Protze                                          ,
248182e94a59SJoachim Protze                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
248282e94a59SJoachim Protze #endif
248382e94a59SJoachim Protze   );
24845e8470afSJim Cownie }
24855e8470afSJim Cownie 
24865e8470afSJim Cownie /*!
24875e8470afSJim Cownie @param loc Source code location
24885e8470afSJim Cownie @param gtid Global thread id
24895e8470afSJim Cownie 
24905e8470afSJim Cownie Mark the end of a dynamic loop.
24915e8470afSJim Cownie */
24923041982dSJonathan Peyton void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
24935e8470afSJim Cownie   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
24945e8470afSJim Cownie }
24955e8470afSJim Cownie 
24965e8470afSJim Cownie /*!
24975e8470afSJim Cownie See @ref __kmpc_dispatch_fini_4
24985e8470afSJim Cownie */
24993041982dSJonathan Peyton void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
25005e8470afSJim Cownie   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
25015e8470afSJim Cownie }
25025e8470afSJim Cownie 
25035e8470afSJim Cownie /*!
25045e8470afSJim Cownie See @ref __kmpc_dispatch_fini_4
25055e8470afSJim Cownie */
25063041982dSJonathan Peyton void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
25075e8470afSJim Cownie   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
25085e8470afSJim Cownie }
25095e8470afSJim Cownie 
25105e8470afSJim Cownie /*!
25115e8470afSJim Cownie See @ref __kmpc_dispatch_fini_4
25125e8470afSJim Cownie */
25133041982dSJonathan Peyton void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
25145e8470afSJim Cownie   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
25155e8470afSJim Cownie }
25165e8470afSJim Cownie /*! @} */
25175e8470afSJim Cownie 
25183041982dSJonathan Peyton //-----------------------------------------------------------------------------
2519de4749b7SJonathan Peyton // Non-template routines from kmp_dispatch.cpp used in other sources
25205e8470afSJim Cownie 
25215e8470afSJim Cownie kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
25225e8470afSJim Cownie   return value == checker;
25235e8470afSJim Cownie }
25245e8470afSJim Cownie 
25255e8470afSJim Cownie kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
25265e8470afSJim Cownie   return value != checker;
25275e8470afSJim Cownie }
25285e8470afSJim Cownie 
25295e8470afSJim Cownie kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
25305e8470afSJim Cownie   return value < checker;
25315e8470afSJim Cownie }
25325e8470afSJim Cownie 
25335e8470afSJim Cownie kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
25345e8470afSJim Cownie   return value >= checker;
25355e8470afSJim Cownie }
25365e8470afSJim Cownie 
25375e8470afSJim Cownie kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
25385e8470afSJim Cownie   return value <= checker;
25395e8470afSJim Cownie }
25405e8470afSJim Cownie 
25415e8470afSJim Cownie kmp_uint32
2542e47d32f1SJonathan Peyton __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
25433041982dSJonathan Peyton              kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
25443041982dSJonathan Peyton              void *obj // Higher-level synchronization object, or NULL.
25453041982dSJonathan Peyton ) {
25465e8470afSJim Cownie   // note: we may not belong to a team at this point
2547414544c9SEd Maste   volatile kmp_uint32 *spin = spinner;
2548414544c9SEd Maste   kmp_uint32 check = checker;
2549414544c9SEd Maste   kmp_uint32 spins;
2550414544c9SEd Maste   kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2551414544c9SEd Maste   kmp_uint32 r;
25525e8470afSJim Cownie 
2553c47afcd9SAndrey Churbanov   KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
25545e8470afSJim Cownie   KMP_INIT_YIELD(spins);
25555e8470afSJim Cownie   // main wait spin loop
25565e8470afSJim Cownie   while (!f(r = TCR_4(*spin), check)) {
25575e8470afSJim Cownie     KMP_FSYNC_SPIN_PREPARE(obj);
25583041982dSJonathan Peyton     /* GEH - remove this since it was accidentally introduced when kmp_wait was
25593041982dSJonathan Peyton        split. It causes problems with infinite recursion because of exit lock */
25605e8470afSJim Cownie     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
25615e8470afSJim Cownie         __kmp_abort_thread(); */
2562e47d32f1SJonathan Peyton     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
25635e8470afSJim Cownie   }
25645e8470afSJim Cownie   KMP_FSYNC_SPIN_ACQUIRED(obj);
25655e8470afSJim Cownie   return r;
25665e8470afSJim Cownie }
25675e8470afSJim Cownie 
2568e47d32f1SJonathan Peyton void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
2569e47d32f1SJonathan Peyton                       kmp_uint32 (*pred)(void *, kmp_uint32),
2570f7cc6affSPaul Osmialowski                       void *obj // Higher-level synchronization object, or NULL.
25713041982dSJonathan Peyton ) {
2572f7cc6affSPaul Osmialowski   // note: we may not belong to a team at this point
2573414544c9SEd Maste   void *spin = spinner;
2574414544c9SEd Maste   kmp_uint32 check = checker;
2575414544c9SEd Maste   kmp_uint32 spins;
2576414544c9SEd Maste   kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2577f7cc6affSPaul Osmialowski 
2578f7cc6affSPaul Osmialowski   KMP_FSYNC_SPIN_INIT(obj, spin);
2579f7cc6affSPaul Osmialowski   KMP_INIT_YIELD(spins);
2580f7cc6affSPaul Osmialowski   // main wait spin loop
2581f7cc6affSPaul Osmialowski   while (!f(spin, check)) {
2582f7cc6affSPaul Osmialowski     KMP_FSYNC_SPIN_PREPARE(obj);
2583e47d32f1SJonathan Peyton     /* if we have waited a bit, or are noversubscribed, yield */
2584f7cc6affSPaul Osmialowski     /* pause is in the following code */
2585e47d32f1SJonathan Peyton     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2586f7cc6affSPaul Osmialowski   }
2587f7cc6affSPaul Osmialowski   KMP_FSYNC_SPIN_ACQUIRED(obj);
2588f7cc6affSPaul Osmialowski }
2589f7cc6affSPaul Osmialowski 
25905e8470afSJim Cownie } // extern "C"
25915e8470afSJim Cownie 
25925e8470afSJim Cownie #ifdef KMP_GOMP_COMPAT
25935e8470afSJim Cownie 
25943041982dSJonathan Peyton void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
25953041982dSJonathan Peyton                                enum sched_type schedule, kmp_int32 lb,
25963041982dSJonathan Peyton                                kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
25973041982dSJonathan Peyton                                int push_ws) {
25985e8470afSJim Cownie   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
25995e8470afSJim Cownie                                  push_ws);
26005e8470afSJim Cownie }
26015e8470afSJim Cownie 
26023041982dSJonathan Peyton void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
26033041982dSJonathan Peyton                                 enum sched_type schedule, kmp_uint32 lb,
26043041982dSJonathan Peyton                                 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
26053041982dSJonathan Peyton                                 int push_ws) {
26065e8470afSJim Cownie   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
26075e8470afSJim Cownie                                   push_ws);
26085e8470afSJim Cownie }
26095e8470afSJim Cownie 
26103041982dSJonathan Peyton void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
26113041982dSJonathan Peyton                                enum sched_type schedule, kmp_int64 lb,
26123041982dSJonathan Peyton                                kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
26133041982dSJonathan Peyton                                int push_ws) {
26145e8470afSJim Cownie   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
26155e8470afSJim Cownie                                  push_ws);
26165e8470afSJim Cownie }
26175e8470afSJim Cownie 
26183041982dSJonathan Peyton void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
26193041982dSJonathan Peyton                                 enum sched_type schedule, kmp_uint64 lb,
26203041982dSJonathan Peyton                                 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
26213041982dSJonathan Peyton                                 int push_ws) {
26225e8470afSJim Cownie   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
26235e8470afSJim Cownie                                   push_ws);
26245e8470afSJim Cownie }
26255e8470afSJim Cownie 
26263041982dSJonathan Peyton void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
26275e8470afSJim Cownie   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
26285e8470afSJim Cownie }
26295e8470afSJim Cownie 
26303041982dSJonathan Peyton void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
26315e8470afSJim Cownie   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
26325e8470afSJim Cownie }
26335e8470afSJim Cownie 
26343041982dSJonathan Peyton void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
26355e8470afSJim Cownie   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
26365e8470afSJim Cownie }
26375e8470afSJim Cownie 
26383041982dSJonathan Peyton void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
26395e8470afSJim Cownie   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
26405e8470afSJim Cownie }
26415e8470afSJim Cownie 
26425e8470afSJim Cownie #endif /* KMP_GOMP_COMPAT */
26435e8470afSJim Cownie 
26445e8470afSJim Cownie /* ------------------------------------------------------------------------ */
2645