/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

/* Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
 *       it may change values between parallel regions.  __kmp_max_nth
 *       is the largest value __kmp_nth may take, 1 is the smallest.
185e8470afSJim Cownie */ 195e8470afSJim Cownie 205e8470afSJim Cownie #include "kmp.h" 213041982dSJonathan Peyton #include "kmp_error.h" 225e8470afSJim Cownie #include "kmp_i18n.h" 235e8470afSJim Cownie #include "kmp_itt.h" 244cc4bb4cSJim Cownie #include "kmp_stats.h" 253041982dSJonathan Peyton #include "kmp_str.h" 26f700e9edSAndrey Churbanov #if KMP_USE_X87CONTROL 275e8470afSJim Cownie #include <float.h> 285e8470afSJim Cownie #endif 2939ada854SJonathan Peyton #include "kmp_lock.h" 3039ada854SJonathan Peyton #include "kmp_dispatch.h" 31f6399367SJonathan Peyton #if KMP_USE_HIER_SCHED 32f6399367SJonathan Peyton #include "kmp_dispatch_hier.h" 33f6399367SJonathan Peyton #endif 345e8470afSJim Cownie 35d7d088f8SAndrey Churbanov #if OMPT_SUPPORT 36d7d088f8SAndrey Churbanov #include "ompt-specific.h" 37d7d088f8SAndrey Churbanov #endif 38d7d088f8SAndrey Churbanov 395e8470afSJim Cownie /* ------------------------------------------------------------------------ */ 405e8470afSJim Cownie /* ------------------------------------------------------------------------ */ 415e8470afSJim Cownie 4239ada854SJonathan Peyton void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 435e8470afSJim Cownie kmp_info_t *th; 445e8470afSJim Cownie 455e8470afSJim Cownie KMP_DEBUG_ASSERT(gtid_ref); 465e8470afSJim Cownie 475e8470afSJim Cownie if (__kmp_env_consistency_check) { 485e8470afSJim Cownie th = __kmp_threads[*gtid_ref]; 493041982dSJonathan Peyton if (th->th.th_root->r.r_active && 503041982dSJonathan Peyton (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) { 515c56fb55SAndrey Churbanov #if KMP_USE_DYNAMIC_LOCK 525c56fb55SAndrey Churbanov __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0); 535c56fb55SAndrey Churbanov #else 545e8470afSJim Cownie __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL); 555c56fb55SAndrey Churbanov #endif 565e8470afSJim Cownie } 575e8470afSJim Cownie } 585e8470afSJim Cownie } 595e8470afSJim Cownie 
6039ada854SJonathan Peyton void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 615e8470afSJim Cownie kmp_info_t *th; 625e8470afSJim Cownie 635e8470afSJim Cownie if (__kmp_env_consistency_check) { 645e8470afSJim Cownie th = __kmp_threads[*gtid_ref]; 655e8470afSJim Cownie if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) { 665e8470afSJim Cownie __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref); 675e8470afSJim Cownie } 685e8470afSJim Cownie } 695e8470afSJim Cownie } 705e8470afSJim Cownie 7171abe28eSJonathan Peyton // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC 725e348774SPeyton, Jonathan L static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule, 7371abe28eSJonathan Peyton bool use_hier = false) { 7471abe28eSJonathan Peyton // Pick up the nonmonotonic/monotonic bits from the scheduling type 755e348774SPeyton, Jonathan L // TODO: make nonmonotonic when static_steal is fixed 765e348774SPeyton, Jonathan L int monotonicity = SCHEDULE_MONOTONIC; 775e348774SPeyton, Jonathan L 785e348774SPeyton, Jonathan L // Let default be monotonic for executables 795e348774SPeyton, Jonathan L // compiled with OpenMP* 4.5 or less compilers 805e348774SPeyton, Jonathan L if (loc->get_openmp_version() < 50) 8171abe28eSJonathan Peyton monotonicity = SCHEDULE_MONOTONIC; 825e348774SPeyton, Jonathan L 8367773681SJonathan Peyton if (use_hier || __kmp_force_monotonic) 845e348774SPeyton, Jonathan L monotonicity = SCHEDULE_MONOTONIC; 855e348774SPeyton, Jonathan L else if (SCHEDULE_HAS_NONMONOTONIC(schedule)) 8671abe28eSJonathan Peyton monotonicity = SCHEDULE_NONMONOTONIC; 8771abe28eSJonathan Peyton else if (SCHEDULE_HAS_MONOTONIC(schedule)) 8871abe28eSJonathan Peyton monotonicity = SCHEDULE_MONOTONIC; 895e348774SPeyton, Jonathan L 9071abe28eSJonathan Peyton return monotonicity; 9171abe28eSJonathan Peyton } 9271abe28eSJonathan Peyton 9339ada854SJonathan Peyton // Initialize a dispatch_private_info_template<T> 
buffer for a particular 9439ada854SJonathan Peyton // type of schedule,chunk. The loop description is found in lb (lower bound), 9539ada854SJonathan Peyton // ub (upper bound), and st (stride). nproc is the number of threads relevant 9639ada854SJonathan Peyton // to the scheduling (often the number of threads in a team, but not always if 9739ada854SJonathan Peyton // hierarchical scheduling is used). tid is the id of the thread calling 9839ada854SJonathan Peyton // the function within the group of nproc threads. It will have a value 9939ada854SJonathan Peyton // between 0 and nproc - 1. This is often just the thread id within a team, but 10039ada854SJonathan Peyton // is not necessarily the case when using hierarchical scheduling. 10139ada854SJonathan Peyton // loc is the source file location of the corresponding loop 10239ada854SJonathan Peyton // gtid is the global thread id 1035e8470afSJim Cownie template <typename T> 10439ada854SJonathan Peyton void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, 10539ada854SJonathan Peyton dispatch_private_info_template<T> *pr, 10639ada854SJonathan Peyton enum sched_type schedule, T lb, T ub, 10739ada854SJonathan Peyton typename traits_t<T>::signed_t st, 10839ada854SJonathan Peyton #if USE_ITT_BUILD 10939ada854SJonathan Peyton kmp_uint64 *cur_chunk, 11039ada854SJonathan Peyton #endif 11139ada854SJonathan Peyton typename traits_t<T>::signed_t chunk, 11239ada854SJonathan Peyton T nproc, T tid) { 1135e8470afSJim Cownie typedef typename traits_t<T>::unsigned_t UT; 1145e8470afSJim Cownie typedef typename traits_t<T>::floating_t DBL; 1155e8470afSJim Cownie 1165e8470afSJim Cownie int active; 1175e8470afSJim Cownie T tc; 1185e8470afSJim Cownie kmp_info_t *th; 1195e8470afSJim Cownie kmp_team_t *team; 12071abe28eSJonathan Peyton int monotonicity; 12171abe28eSJonathan Peyton bool use_hier; 1225e8470afSJim Cownie 1235e8470afSJim Cownie #ifdef KMP_DEBUG 124baad3f60SJonathan Peyton typedef typename traits_t<T>::signed_t ST; 
1255e8470afSJim Cownie { 126aeb40adaSJonas Hahnfeld char *buff; 1275e8470afSJim Cownie // create format specifiers before the debug output 12839ada854SJonathan Peyton buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called " 12939ada854SJonathan Peyton "pr:%%p lb:%%%s ub:%%%s st:%%%s " 13039ada854SJonathan Peyton "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n", 13139ada854SJonathan Peyton traits_t<T>::spec, traits_t<T>::spec, 13239ada854SJonathan Peyton traits_t<ST>::spec, traits_t<ST>::spec, 13339ada854SJonathan Peyton traits_t<T>::spec, traits_t<T>::spec); 13439ada854SJonathan Peyton KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid)); 1355e8470afSJim Cownie __kmp_str_free(&buff); 1365e8470afSJim Cownie } 1375e8470afSJim Cownie #endif 1385e8470afSJim Cownie /* setup data */ 1395e8470afSJim Cownie th = __kmp_threads[gtid]; 1405e8470afSJim Cownie team = th->th.th_team; 1415e8470afSJim Cownie active = !team->t.t_serialized; 1425e8470afSJim Cownie 1434cc4bb4cSJim Cownie #if USE_ITT_BUILD 144e4b4f994SJonathan Peyton int itt_need_metadata_reporting = 145e4b4f994SJonathan Peyton __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 146e4b4f994SJonathan Peyton KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && 14751aecb82SAndrey Churbanov team->t.t_active_level == 1; 1484cc4bb4cSJim Cownie #endif 14971abe28eSJonathan Peyton 15071abe28eSJonathan Peyton #if KMP_USE_HIER_SCHED 15171abe28eSJonathan Peyton use_hier = pr->flags.use_hier; 15271abe28eSJonathan Peyton #else 15371abe28eSJonathan Peyton use_hier = false; 154429dbc2aSAndrey Churbanov #endif 15571abe28eSJonathan Peyton 15671abe28eSJonathan Peyton /* Pick up the nonmonotonic/monotonic bits from the scheduling type */ 1575e348774SPeyton, Jonathan L monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier); 158ea0fe1dfSJonathan Peyton schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 159ea0fe1dfSJonathan Peyton 1605e8470afSJim Cownie /* Pick up the nomerge/ordered 
bits from the scheduling type */ 1615e8470afSJim Cownie if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) { 16239ada854SJonathan Peyton pr->flags.nomerge = TRUE; 1633041982dSJonathan Peyton schedule = 1643041982dSJonathan Peyton (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); 1655e8470afSJim Cownie } else { 16639ada854SJonathan Peyton pr->flags.nomerge = FALSE; 1675e8470afSJim Cownie } 16812313d44SJonathan Peyton pr->type_size = traits_t<T>::type_size; // remember the size of variables 1695e8470afSJim Cownie if (kmp_ord_lower & schedule) { 17039ada854SJonathan Peyton pr->flags.ordered = TRUE; 1713041982dSJonathan Peyton schedule = 1723041982dSJonathan Peyton (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); 1735e8470afSJim Cownie } else { 17439ada854SJonathan Peyton pr->flags.ordered = FALSE; 1755e8470afSJim Cownie } 17671abe28eSJonathan Peyton // Ordered overrides nonmonotonic 17771abe28eSJonathan Peyton if (pr->flags.ordered) { 17871abe28eSJonathan Peyton monotonicity = SCHEDULE_MONOTONIC; 17971abe28eSJonathan Peyton } 18045be4500SJonathan Peyton 1815e8470afSJim Cownie if (schedule == kmp_sch_static) { 1825e8470afSJim Cownie schedule = __kmp_static; 1835e8470afSJim Cownie } else { 1845e8470afSJim Cownie if (schedule == kmp_sch_runtime) { 1853041982dSJonathan Peyton // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if 1863041982dSJonathan Peyton // not specified) 1875e8470afSJim Cownie schedule = team->t.t_sched.r_sched_type; 1885e348774SPeyton, Jonathan L monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier); 18971abe28eSJonathan Peyton schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 1903041982dSJonathan Peyton // Detail the schedule if needed (global controls are differentiated 1913041982dSJonathan Peyton // appropriately) 1925e8470afSJim Cownie if (schedule == kmp_sch_guided_chunked) { 1935e8470afSJim Cownie schedule = __kmp_guided; 1945e8470afSJim Cownie } else if (schedule == 
kmp_sch_static) { 1955e8470afSJim Cownie schedule = __kmp_static; 1965e8470afSJim Cownie } 1973041982dSJonathan Peyton // Use the chunk size specified by OMP_SCHEDULE (or default if not 1983041982dSJonathan Peyton // specified) 1995e8470afSJim Cownie chunk = team->t.t_sched.chunk; 20000afbd01SJonathan Peyton #if USE_ITT_BUILD 20139ada854SJonathan Peyton if (cur_chunk) 20239ada854SJonathan Peyton *cur_chunk = chunk; 20300afbd01SJonathan Peyton #endif 2045e8470afSJim Cownie #ifdef KMP_DEBUG 2055e8470afSJim Cownie { 206aeb40adaSJonas Hahnfeld char *buff; 2075e8470afSJim Cownie // create format specifiers before the debug output 20839ada854SJonathan Peyton buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: " 20939ada854SJonathan Peyton "schedule:%%d chunk:%%%s\n", 2105e8470afSJim Cownie traits_t<ST>::spec); 2115e8470afSJim Cownie KD_TRACE(10, (buff, gtid, schedule, chunk)); 2125e8470afSJim Cownie __kmp_str_free(&buff); 2135e8470afSJim Cownie } 2145e8470afSJim Cownie #endif 2155e8470afSJim Cownie } else { 2165e8470afSJim Cownie if (schedule == kmp_sch_guided_chunked) { 2175e8470afSJim Cownie schedule = __kmp_guided; 2185e8470afSJim Cownie } 2195e8470afSJim Cownie if (chunk <= 0) { 2205e8470afSJim Cownie chunk = KMP_DEFAULT_CHUNK; 2215e8470afSJim Cownie } 2225e8470afSJim Cownie } 2235e8470afSJim Cownie 2245e8470afSJim Cownie if (schedule == kmp_sch_auto) { 2255e8470afSJim Cownie // mapping and differentiation: in the __kmp_do_serial_initialize() 2265e8470afSJim Cownie schedule = __kmp_auto; 2275e8470afSJim Cownie #ifdef KMP_DEBUG 2285e8470afSJim Cownie { 229aeb40adaSJonas Hahnfeld char *buff; 2305e8470afSJim Cownie // create format specifiers before the debug output 23139ada854SJonathan Peyton buff = __kmp_str_format( 23239ada854SJonathan Peyton "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: " 2333041982dSJonathan Peyton "schedule:%%d chunk:%%%s\n", 2345e8470afSJim Cownie traits_t<ST>::spec); 2355e8470afSJim Cownie KD_TRACE(10, (buff, gtid, 
schedule, chunk)); 2365e8470afSJim Cownie __kmp_str_free(&buff); 2375e8470afSJim Cownie } 2385e8470afSJim Cownie #endif 2395e8470afSJim Cownie } 24071abe28eSJonathan Peyton #if KMP_STATIC_STEAL_ENABLED 24171abe28eSJonathan Peyton // map nonmonotonic:dynamic to static steal 24271abe28eSJonathan Peyton if (schedule == kmp_sch_dynamic_chunked) { 24371abe28eSJonathan Peyton if (monotonicity == SCHEDULE_NONMONOTONIC) 24471abe28eSJonathan Peyton schedule = kmp_sch_static_steal; 24571abe28eSJonathan Peyton } 24671abe28eSJonathan Peyton #endif 2475e8470afSJim Cownie /* guided analytical not safe for too many threads */ 24839ada854SJonathan Peyton if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) { 2495e8470afSJim Cownie schedule = kmp_sch_guided_iterative_chunked; 2505e8470afSJim Cownie KMP_WARNING(DispatchManyThreads); 2515e8470afSJim Cownie } 252d454c73cSAndrey Churbanov if (schedule == kmp_sch_runtime_simd) { 253d454c73cSAndrey Churbanov // compiler provides simd_width in the chunk parameter 254d454c73cSAndrey Churbanov schedule = team->t.t_sched.r_sched_type; 2555e348774SPeyton, Jonathan L monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier); 25671abe28eSJonathan Peyton schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 257d454c73cSAndrey Churbanov // Detail the schedule if needed (global controls are differentiated 258d454c73cSAndrey Churbanov // appropriately) 259d454c73cSAndrey Churbanov if (schedule == kmp_sch_static || schedule == kmp_sch_auto || 260d454c73cSAndrey Churbanov schedule == __kmp_static) { 261d454c73cSAndrey Churbanov schedule = kmp_sch_static_balanced_chunked; 262d454c73cSAndrey Churbanov } else { 263d454c73cSAndrey Churbanov if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) { 264d454c73cSAndrey Churbanov schedule = kmp_sch_guided_simd; 265d454c73cSAndrey Churbanov } 266d454c73cSAndrey Churbanov chunk = team->t.t_sched.chunk * chunk; 267d454c73cSAndrey Churbanov } 268d454c73cSAndrey Churbanov #if 
USE_ITT_BUILD 26939ada854SJonathan Peyton if (cur_chunk) 27039ada854SJonathan Peyton *cur_chunk = chunk; 271d454c73cSAndrey Churbanov #endif 272d454c73cSAndrey Churbanov #ifdef KMP_DEBUG 273d454c73cSAndrey Churbanov { 274aeb40adaSJonas Hahnfeld char *buff; 275d454c73cSAndrey Churbanov // create format specifiers before the debug output 27671abe28eSJonathan Peyton buff = __kmp_str_format( 27771abe28eSJonathan Peyton "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d" 278d454c73cSAndrey Churbanov " chunk:%%%s\n", 279d454c73cSAndrey Churbanov traits_t<ST>::spec); 280d454c73cSAndrey Churbanov KD_TRACE(10, (buff, gtid, schedule, chunk)); 281d454c73cSAndrey Churbanov __kmp_str_free(&buff); 282d454c73cSAndrey Churbanov } 283d454c73cSAndrey Churbanov #endif 284d454c73cSAndrey Churbanov } 2855e8470afSJim Cownie pr->u.p.parm1 = chunk; 2865e8470afSJim Cownie } 2875e8470afSJim Cownie KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper), 2885e8470afSJim Cownie "unknown scheduling type"); 2895e8470afSJim Cownie 2905e8470afSJim Cownie pr->u.p.count = 0; 2915e8470afSJim Cownie 2925e8470afSJim Cownie if (__kmp_env_consistency_check) { 2935e8470afSJim Cownie if (st == 0) { 2943041982dSJonathan Peyton __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, 29539ada854SJonathan Peyton (pr->flags.ordered ? 
ct_pdo_ordered : ct_pdo), loc); 2965e8470afSJim Cownie } 2975e8470afSJim Cownie } 2985235a1b6SJonathan Peyton // compute trip count 2995235a1b6SJonathan Peyton if (st == 1) { // most common case 3005235a1b6SJonathan Peyton if (ub >= lb) { 3015235a1b6SJonathan Peyton tc = ub - lb + 1; 3025235a1b6SJonathan Peyton } else { // ub < lb 3035e8470afSJim Cownie tc = 0; // zero-trip 3045235a1b6SJonathan Peyton } 3055235a1b6SJonathan Peyton } else if (st < 0) { 3065235a1b6SJonathan Peyton if (lb >= ub) { 3075235a1b6SJonathan Peyton // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B), 3085235a1b6SJonathan Peyton // where the division needs to be unsigned regardless of the result type 3095235a1b6SJonathan Peyton tc = (UT)(lb - ub) / (-st) + 1; 3105235a1b6SJonathan Peyton } else { // lb < ub 3115235a1b6SJonathan Peyton tc = 0; // zero-trip 3125e8470afSJim Cownie } 3135e8470afSJim Cownie } else { // st > 0 3145235a1b6SJonathan Peyton if (ub >= lb) { 3155235a1b6SJonathan Peyton // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B), 3165235a1b6SJonathan Peyton // where the division needs to be unsigned regardless of the result type 3175235a1b6SJonathan Peyton tc = (UT)(ub - lb) / st + 1; 3185235a1b6SJonathan Peyton } else { // ub < lb 3195e8470afSJim Cownie tc = 0; // zero-trip 3205e8470afSJim Cownie } 3215e8470afSJim Cownie } 3225e8470afSJim Cownie 323d2b53cadSJonathan Peyton #if KMP_STATS_ENABLED 324d2b53cadSJonathan Peyton if (KMP_MASTER_GTID(gtid)) { 325d2b53cadSJonathan Peyton KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc); 326d2b53cadSJonathan Peyton } 327d2b53cadSJonathan Peyton #endif 328d2b53cadSJonathan Peyton 3295e8470afSJim Cownie pr->u.p.lb = lb; 3305e8470afSJim Cownie pr->u.p.ub = ub; 3315e8470afSJim Cownie pr->u.p.st = st; 3325e8470afSJim Cownie pr->u.p.tc = tc; 3335e8470afSJim Cownie 3345e8470afSJim Cownie #if KMP_OS_WINDOWS 3355e8470afSJim Cownie pr->u.p.last_upper = ub + st; 3365e8470afSJim Cownie #endif /* KMP_OS_WINDOWS 
*/ 3375e8470afSJim Cownie 3385e8470afSJim Cownie /* NOTE: only the active parallel region(s) has active ordered sections */ 3395e8470afSJim Cownie 3405e8470afSJim Cownie if (active) { 34139ada854SJonathan Peyton if (pr->flags.ordered) { 3425e8470afSJim Cownie pr->ordered_bumped = 0; 3435e8470afSJim Cownie pr->u.p.ordered_lower = 1; 3445e8470afSJim Cownie pr->u.p.ordered_upper = 0; 3455e8470afSJim Cownie } 3465e8470afSJim Cownie } 3475e8470afSJim Cownie 3485e8470afSJim Cownie switch (schedule) { 349429dbc2aSAndrey Churbanov #if (KMP_STATIC_STEAL_ENABLED) 3503041982dSJonathan Peyton case kmp_sch_static_steal: { 3515e8470afSJim Cownie T ntc, init; 3525e8470afSJim Cownie 3533041982dSJonathan Peyton KD_TRACE(100, 35439ada854SJonathan Peyton ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n", 35539ada854SJonathan Peyton gtid)); 3565e8470afSJim Cownie 3575e8470afSJim Cownie ntc = (tc % chunk ? 1 : 0) + tc / chunk; 3585e8470afSJim Cownie if (nproc > 1 && ntc >= nproc) { 359f0682ac4SJonathan Peyton KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL); 36039ada854SJonathan Peyton T id = tid; 3615e8470afSJim Cownie T small_chunk, extras; 3625e8470afSJim Cownie 3635e8470afSJim Cownie small_chunk = ntc / nproc; 3645e8470afSJim Cownie extras = ntc % nproc; 3655e8470afSJim Cownie 3665e8470afSJim Cownie init = id * small_chunk + (id < extras ? id : extras); 3675e8470afSJim Cownie pr->u.p.count = init; 3685e8470afSJim Cownie pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0); 3695e8470afSJim Cownie 3705e8470afSJim Cownie pr->u.p.parm2 = lb; 37171abe28eSJonathan Peyton // parm3 is the number of times to attempt stealing which is 37271abe28eSJonathan Peyton // proportional to the number of chunks per thread up until 37371abe28eSJonathan Peyton // the maximum value of nproc. 
37471abe28eSJonathan Peyton pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc); 375429dbc2aSAndrey Churbanov pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid 3765e8470afSJim Cownie pr->u.p.st = st; 37712313d44SJonathan Peyton if (traits_t<T>::type_size > 4) { 378429dbc2aSAndrey Churbanov // AC: TODO: check if 16-byte CAS available and use it to 379429dbc2aSAndrey Churbanov // improve performance (probably wait for explicit request 380429dbc2aSAndrey Churbanov // before spending time on this). 381429dbc2aSAndrey Churbanov // For now use dynamically allocated per-thread lock, 382429dbc2aSAndrey Churbanov // free memory in __kmp_dispatch_next when status==0. 383abe64360SAndreyChurbanov KMP_DEBUG_ASSERT(pr->u.p.th_steal_lock == NULL); 384abe64360SAndreyChurbanov pr->u.p.th_steal_lock = 385429dbc2aSAndrey Churbanov (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t)); 386abe64360SAndreyChurbanov __kmp_init_lock(pr->u.p.th_steal_lock); 387429dbc2aSAndrey Churbanov } 3885e8470afSJim Cownie break; 3895e8470afSJim Cownie } else { 390bd2fb41cSAndreyChurbanov /* too few chunks: switching to kmp_sch_dynamic_chunked */ 391bd2fb41cSAndreyChurbanov schedule = kmp_sch_dynamic_chunked; 392bd2fb41cSAndreyChurbanov KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to " 393bd2fb41cSAndreyChurbanov "kmp_sch_dynamic_chunked\n", 3945e8470afSJim Cownie gtid)); 395bd2fb41cSAndreyChurbanov if (pr->u.p.parm1 <= 0) 396bd2fb41cSAndreyChurbanov pr->u.p.parm1 = KMP_DEFAULT_CHUNK; 397bd2fb41cSAndreyChurbanov break; 3985e8470afSJim Cownie } // if 3995e8470afSJim Cownie } // case 4005e8470afSJim Cownie #endif 4013041982dSJonathan Peyton case kmp_sch_static_balanced: { 4025e8470afSJim Cownie T init, limit; 4035e8470afSJim Cownie 40439ada854SJonathan Peyton KD_TRACE( 40539ada854SJonathan Peyton 100, 40639ada854SJonathan Peyton ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n", 4075e8470afSJim Cownie gtid)); 4085e8470afSJim Cownie 4095e8470afSJim Cownie if 
(nproc > 1) { 41039ada854SJonathan Peyton T id = tid; 4115e8470afSJim Cownie 4125e8470afSJim Cownie if (tc < nproc) { 4135e8470afSJim Cownie if (id < tc) { 4145e8470afSJim Cownie init = id; 4155e8470afSJim Cownie limit = id; 4165e8470afSJim Cownie pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ 4175e8470afSJim Cownie } else { 4185e8470afSJim Cownie pr->u.p.count = 1; /* means no more chunks to execute */ 4195e8470afSJim Cownie pr->u.p.parm1 = FALSE; 4205e8470afSJim Cownie break; 4215e8470afSJim Cownie } 4225e8470afSJim Cownie } else { 4235e8470afSJim Cownie T small_chunk = tc / nproc; 4245e8470afSJim Cownie T extras = tc % nproc; 4255e8470afSJim Cownie init = id * small_chunk + (id < extras ? id : extras); 4265e8470afSJim Cownie limit = init + small_chunk - (id < extras ? 0 : 1); 4275e8470afSJim Cownie pr->u.p.parm1 = (id == nproc - 1); 4285e8470afSJim Cownie } 4295e8470afSJim Cownie } else { 4305e8470afSJim Cownie if (tc > 0) { 4315e8470afSJim Cownie init = 0; 4325e8470afSJim Cownie limit = tc - 1; 4335e8470afSJim Cownie pr->u.p.parm1 = TRUE; 43439ada854SJonathan Peyton } else { 43539ada854SJonathan Peyton // zero trip count 4365e8470afSJim Cownie pr->u.p.count = 1; /* means no more chunks to execute */ 4375e8470afSJim Cownie pr->u.p.parm1 = FALSE; 4385e8470afSJim Cownie break; 4395e8470afSJim Cownie } 4405e8470afSJim Cownie } 4414cc4bb4cSJim Cownie #if USE_ITT_BUILD 4424cc4bb4cSJim Cownie // Calculate chunk for metadata report 44351aecb82SAndrey Churbanov if (itt_need_metadata_reporting) 44439ada854SJonathan Peyton if (cur_chunk) 44539ada854SJonathan Peyton *cur_chunk = limit - init + 1; 4464cc4bb4cSJim Cownie #endif 4475e8470afSJim Cownie if (st == 1) { 4485e8470afSJim Cownie pr->u.p.lb = lb + init; 4495e8470afSJim Cownie pr->u.p.ub = lb + limit; 4505e8470afSJim Cownie } else { 4513041982dSJonathan Peyton // calculated upper bound, "ub" is user-defined upper bound 4523041982dSJonathan Peyton T ub_tmp = lb + limit * st; 4535e8470afSJim Cownie 
pr->u.p.lb = lb + init * st; 4543041982dSJonathan Peyton // adjust upper bound to "ub" if needed, so that MS lastprivate will match 4553041982dSJonathan Peyton // it exactly 4565e8470afSJim Cownie if (st > 0) { 4575e8470afSJim Cownie pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp); 4585e8470afSJim Cownie } else { 4595e8470afSJim Cownie pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp); 4605e8470afSJim Cownie } 4615e8470afSJim Cownie } 46239ada854SJonathan Peyton if (pr->flags.ordered) { 4635e8470afSJim Cownie pr->u.p.ordered_lower = init; 4645e8470afSJim Cownie pr->u.p.ordered_upper = limit; 4655e8470afSJim Cownie } 4665e8470afSJim Cownie break; 4675e8470afSJim Cownie } // case 468d454c73cSAndrey Churbanov case kmp_sch_static_balanced_chunked: { 469d454c73cSAndrey Churbanov // similar to balanced, but chunk adjusted to multiple of simd width 47039ada854SJonathan Peyton T nth = nproc; 47139ada854SJonathan Peyton KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)" 472d454c73cSAndrey Churbanov " -> falling-through to static_greedy\n", 473d454c73cSAndrey Churbanov gtid)); 474d454c73cSAndrey Churbanov schedule = kmp_sch_static_greedy; 475d454c73cSAndrey Churbanov if (nth > 1) 476d454c73cSAndrey Churbanov pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1); 477d454c73cSAndrey Churbanov else 478d454c73cSAndrey Churbanov pr->u.p.parm1 = tc; 479d454c73cSAndrey Churbanov break; 480d454c73cSAndrey Churbanov } // case 48139ada854SJonathan Peyton case kmp_sch_guided_simd: 48239ada854SJonathan Peyton case kmp_sch_guided_iterative_chunked: { 48339ada854SJonathan Peyton KD_TRACE( 48439ada854SJonathan Peyton 100, 48539ada854SJonathan Peyton ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked" 4863041982dSJonathan Peyton " case\n", 4873041982dSJonathan Peyton gtid)); 4885e8470afSJim Cownie 4895e8470afSJim Cownie if (nproc > 1) { 4905e8470afSJim Cownie if ((2L * chunk + 1) * nproc >= tc) { 4915e8470afSJim Cownie /* chunk size too 
large, switch to dynamic */ 4925e8470afSJim Cownie schedule = kmp_sch_dynamic_chunked; 4935e8470afSJim Cownie } else { 4945e8470afSJim Cownie // when remaining iters become less than parm2 - switch to dynamic 4955e8470afSJim Cownie pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1); 4963041982dSJonathan Peyton *(double *)&pr->u.p.parm3 = 4976b316febSTerry Wilmarth guided_flt_param / (double)nproc; // may occupy parm3 and parm4 4985e8470afSJim Cownie } 4995e8470afSJim Cownie } else { 50039ada854SJonathan Peyton KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 5013041982dSJonathan Peyton "kmp_sch_static_greedy\n", 5023041982dSJonathan Peyton gtid)); 5035e8470afSJim Cownie schedule = kmp_sch_static_greedy; 5045e8470afSJim Cownie /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 50539ada854SJonathan Peyton KD_TRACE( 50639ada854SJonathan Peyton 100, 50739ada854SJonathan Peyton ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n", 5083041982dSJonathan Peyton gtid)); 5095e8470afSJim Cownie pr->u.p.parm1 = tc; 5105e8470afSJim Cownie } // if 5115e8470afSJim Cownie } // case 5125e8470afSJim Cownie break; 5133041982dSJonathan Peyton case kmp_sch_guided_analytical_chunked: { 51439ada854SJonathan Peyton KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d " 51539ada854SJonathan Peyton "kmp_sch_guided_analytical_chunked case\n", 5163041982dSJonathan Peyton gtid)); 51739ada854SJonathan Peyton 5185e8470afSJim Cownie if (nproc > 1) { 5195e8470afSJim Cownie if ((2L * chunk + 1) * nproc >= tc) { 5205e8470afSJim Cownie /* chunk size too large, switch to dynamic */ 5215e8470afSJim Cownie schedule = kmp_sch_dynamic_chunked; 5225e8470afSJim Cownie } else { 5235e8470afSJim Cownie /* commonly used term: (2 nproc - 1)/(2 nproc) */ 5245e8470afSJim Cownie DBL x; 5255e8470afSJim Cownie 526f700e9edSAndrey Churbanov #if KMP_USE_X87CONTROL 5273041982dSJonathan Peyton /* Linux* OS already has 64-bit computation by default for long double, 
5283041982dSJonathan Peyton and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On 5293041982dSJonathan Peyton Windows* OS on IA-32 architecture, we need to set precision to 64-bit 5303041982dSJonathan Peyton instead of the default 53-bit. Even though long double doesn't work 5313041982dSJonathan Peyton on Windows* OS on Intel(R) 64, the resulting lack of precision is not 5323041982dSJonathan Peyton expected to impact the correctness of the algorithm, but this has not 5333041982dSJonathan Peyton been mathematically proven. */ 5345e8470afSJim Cownie // save original FPCW and set precision to 64-bit, as 5355e8470afSJim Cownie // Windows* OS on IA-32 architecture defaults to 53-bit 536181b4bb3SJim Cownie unsigned int oldFpcw = _control87(0, 0); 537181b4bb3SJim Cownie _control87(_PC_64, _MCW_PC); // 0,0x30000 5385e8470afSJim Cownie #endif 5395e8470afSJim Cownie /* value used for comparison in solver for cross-over point */ 5405e8470afSJim Cownie long double target = ((long double)chunk * 2 + 1) * nproc / tc; 5415e8470afSJim Cownie 5425e8470afSJim Cownie /* crossover point--chunk indexes equal to or greater than 5435e8470afSJim Cownie this point switch to dynamic-style scheduling */ 5445e8470afSJim Cownie UT cross; 5455e8470afSJim Cownie 5465e8470afSJim Cownie /* commonly used term: (2 nproc - 1)/(2 nproc) */ 5476b316febSTerry Wilmarth x = 1.0 - 0.5 / (double)nproc; 5485e8470afSJim Cownie 5495e8470afSJim Cownie #ifdef KMP_DEBUG 5505e8470afSJim Cownie { // test natural alignment 5515e8470afSJim Cownie struct _test_a { 5525e8470afSJim Cownie char a; 5535e8470afSJim Cownie union { 5545e8470afSJim Cownie char b; 5555e8470afSJim Cownie DBL d; 5565e8470afSJim Cownie }; 5575e8470afSJim Cownie } t; 5583041982dSJonathan Peyton ptrdiff_t natural_alignment = 5593041982dSJonathan Peyton (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; 5603041982dSJonathan Peyton //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long 5613041982dSJonathan Peyton // 
long)natural_alignment ); 5623041982dSJonathan Peyton KMP_DEBUG_ASSERT( 5633041982dSJonathan Peyton (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0); 5645e8470afSJim Cownie } 5655e8470afSJim Cownie #endif // KMP_DEBUG 5665e8470afSJim Cownie 5675e8470afSJim Cownie /* save the term in thread private dispatch structure */ 5685e8470afSJim Cownie *(DBL *)&pr->u.p.parm3 = x; 5695e8470afSJim Cownie 5703041982dSJonathan Peyton /* solve for the crossover point to the nearest integer i for which C_i 5713041982dSJonathan Peyton <= chunk */ 5725e8470afSJim Cownie { 5735e8470afSJim Cownie UT left, right, mid; 5745e8470afSJim Cownie long double p; 5755e8470afSJim Cownie 5765e8470afSJim Cownie /* estimate initial upper and lower bound */ 5775e8470afSJim Cownie 5785e8470afSJim Cownie /* doesn't matter what value right is as long as it is positive, but 5793041982dSJonathan Peyton it affects performance of the solver */ 5805e8470afSJim Cownie right = 229; 5815e8470afSJim Cownie p = __kmp_pow<UT>(x, right); 5825e8470afSJim Cownie if (p > target) { 5835e8470afSJim Cownie do { 5845e8470afSJim Cownie p *= p; 5855e8470afSJim Cownie right <<= 1; 5865e8470afSJim Cownie } while (p > target && right < (1 << 27)); 5873041982dSJonathan Peyton /* lower bound is previous (failed) estimate of upper bound */ 5883041982dSJonathan Peyton left = right >> 1; 5895e8470afSJim Cownie } else { 5905e8470afSJim Cownie left = 0; 5915e8470afSJim Cownie } 5925e8470afSJim Cownie 5935e8470afSJim Cownie /* bisection root-finding method */ 5945e8470afSJim Cownie while (left + 1 < right) { 5955e8470afSJim Cownie mid = (left + right) / 2; 5965e8470afSJim Cownie if (__kmp_pow<UT>(x, mid) > target) { 5975e8470afSJim Cownie left = mid; 5985e8470afSJim Cownie } else { 5995e8470afSJim Cownie right = mid; 6005e8470afSJim Cownie } 6015e8470afSJim Cownie } // while 6025e8470afSJim Cownie cross = right; 6035e8470afSJim Cownie } 6045e8470afSJim Cownie /* assert sanity of computed crossover point */ 
6053041982dSJonathan Peyton KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target && 6063041982dSJonathan Peyton __kmp_pow<UT>(x, cross) <= target); 6075e8470afSJim Cownie 6085e8470afSJim Cownie /* save the crossover point in thread private dispatch structure */ 6095e8470afSJim Cownie pr->u.p.parm2 = cross; 6105e8470afSJim Cownie 6115e8470afSJim Cownie // C75803 6125e8470afSJim Cownie #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8)) 6135e8470afSJim Cownie #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3) 6145e8470afSJim Cownie #else 6155e8470afSJim Cownie #define GUIDED_ANALYTICAL_WORKAROUND (x) 6165e8470afSJim Cownie #endif 6175e8470afSJim Cownie /* dynamic-style scheduling offset */ 618*309b00a4SShilei Tian pr->u.p.count = tc - 619*309b00a4SShilei Tian __kmp_dispatch_guided_remaining( 6203041982dSJonathan Peyton tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - 6213041982dSJonathan Peyton cross * chunk; 622f700e9edSAndrey Churbanov #if KMP_USE_X87CONTROL 6235e8470afSJim Cownie // restore FPCW 624181b4bb3SJim Cownie _control87(oldFpcw, _MCW_PC); 6255e8470afSJim Cownie #endif 6265e8470afSJim Cownie } // if 6275e8470afSJim Cownie } else { 62839ada854SJonathan Peyton KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 6293041982dSJonathan Peyton "kmp_sch_static_greedy\n", 6305e8470afSJim Cownie gtid)); 6315e8470afSJim Cownie schedule = kmp_sch_static_greedy; 6325e8470afSJim Cownie /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 6335e8470afSJim Cownie pr->u.p.parm1 = tc; 6345e8470afSJim Cownie } // if 6355e8470afSJim Cownie } // case 6365e8470afSJim Cownie break; 6375e8470afSJim Cownie case kmp_sch_static_greedy: 63839ada854SJonathan Peyton KD_TRACE( 63939ada854SJonathan Peyton 100, 64039ada854SJonathan Peyton ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n", 64139ada854SJonathan Peyton gtid)); 64239ada854SJonathan Peyton pr->u.p.parm1 = (nproc > 1) ? 
(tc + nproc - 1) / nproc : tc; 6435e8470afSJim Cownie break; 6445e8470afSJim Cownie case kmp_sch_static_chunked: 6455e8470afSJim Cownie case kmp_sch_dynamic_chunked: 64670bda912SJonathan Peyton if (pr->u.p.parm1 <= 0) { 64770bda912SJonathan Peyton pr->u.p.parm1 = KMP_DEFAULT_CHUNK; 64870bda912SJonathan Peyton } 64939ada854SJonathan Peyton KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d " 6503041982dSJonathan Peyton "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", 6513041982dSJonathan Peyton gtid)); 6525e8470afSJim Cownie break; 6533041982dSJonathan Peyton case kmp_sch_trapezoidal: { 6545e8470afSJim Cownie /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ 6555e8470afSJim Cownie 6565e8470afSJim Cownie T parm1, parm2, parm3, parm4; 6573041982dSJonathan Peyton KD_TRACE(100, 65839ada854SJonathan Peyton ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n", 65939ada854SJonathan Peyton gtid)); 6605e8470afSJim Cownie 6615e8470afSJim Cownie parm1 = chunk; 6625e8470afSJim Cownie 6635e8470afSJim Cownie /* F : size of the first cycle */ 66439ada854SJonathan Peyton parm2 = (tc / (2 * nproc)); 6655e8470afSJim Cownie 6665e8470afSJim Cownie if (parm2 < 1) { 6675e8470afSJim Cownie parm2 = 1; 6685e8470afSJim Cownie } 6695e8470afSJim Cownie 6703041982dSJonathan Peyton /* L : size of the last cycle. Make sure the last cycle is not larger 6713041982dSJonathan Peyton than the first cycle. 
*/ 6725e8470afSJim Cownie if (parm1 < 1) { 6735e8470afSJim Cownie parm1 = 1; 6745e8470afSJim Cownie } else if (parm1 > parm2) { 6755e8470afSJim Cownie parm1 = parm2; 6765e8470afSJim Cownie } 6775e8470afSJim Cownie 6785e8470afSJim Cownie /* N : number of cycles */ 6795e8470afSJim Cownie parm3 = (parm2 + parm1); 6805e8470afSJim Cownie parm3 = (2 * tc + parm3 - 1) / parm3; 6815e8470afSJim Cownie 6825e8470afSJim Cownie if (parm3 < 2) { 6835e8470afSJim Cownie parm3 = 2; 6845e8470afSJim Cownie } 6855e8470afSJim Cownie 6865e8470afSJim Cownie /* sigma : decreasing incr of the trapezoid */ 6875e8470afSJim Cownie parm4 = (parm3 - 1); 6885e8470afSJim Cownie parm4 = (parm2 - parm1) / parm4; 6895e8470afSJim Cownie 6905e8470afSJim Cownie // pointless check, because parm4 >= 0 always 6915e8470afSJim Cownie // if ( parm4 < 0 ) { 6925e8470afSJim Cownie // parm4 = 0; 6935e8470afSJim Cownie //} 6945e8470afSJim Cownie 6955e8470afSJim Cownie pr->u.p.parm1 = parm1; 6965e8470afSJim Cownie pr->u.p.parm2 = parm2; 6975e8470afSJim Cownie pr->u.p.parm3 = parm3; 6985e8470afSJim Cownie pr->u.p.parm4 = parm4; 6995e8470afSJim Cownie } // case 7005e8470afSJim Cownie break; 7015e8470afSJim Cownie 7023041982dSJonathan Peyton default: { 7036a393f75SJonathan Peyton __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 7045e8470afSJim Cownie KMP_HNT(GetNewerLibrary), // Hint 7055e8470afSJim Cownie __kmp_msg_null // Variadic argument list terminator 7065e8470afSJim Cownie ); 7073041982dSJonathan Peyton } break; 7085e8470afSJim Cownie } // switch 7095e8470afSJim Cownie pr->schedule = schedule; 71039ada854SJonathan Peyton } 71139ada854SJonathan Peyton 712f6399367SJonathan Peyton #if KMP_USE_HIER_SCHED 713f6399367SJonathan Peyton template <typename T> 714f6399367SJonathan Peyton inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub, 715f6399367SJonathan Peyton typename traits_t<T>::signed_t st); 716f6399367SJonathan Peyton template <> 717f6399367SJonathan Peyton inline void 
718f6399367SJonathan Peyton __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb, 719f6399367SJonathan Peyton kmp_int32 ub, kmp_int32 st) { 720f6399367SJonathan Peyton __kmp_dispatch_init_hierarchy<kmp_int32>( 721f6399367SJonathan Peyton loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 722f6399367SJonathan Peyton __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st); 723f6399367SJonathan Peyton } 724f6399367SJonathan Peyton template <> 725f6399367SJonathan Peyton inline void 726f6399367SJonathan Peyton __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb, 727f6399367SJonathan Peyton kmp_uint32 ub, kmp_int32 st) { 728f6399367SJonathan Peyton __kmp_dispatch_init_hierarchy<kmp_uint32>( 729f6399367SJonathan Peyton loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 730f6399367SJonathan Peyton __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st); 731f6399367SJonathan Peyton } 732f6399367SJonathan Peyton template <> 733f6399367SJonathan Peyton inline void 734f6399367SJonathan Peyton __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb, 735f6399367SJonathan Peyton kmp_int64 ub, kmp_int64 st) { 736f6399367SJonathan Peyton __kmp_dispatch_init_hierarchy<kmp_int64>( 737f6399367SJonathan Peyton loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 738f6399367SJonathan Peyton __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st); 739f6399367SJonathan Peyton } 740f6399367SJonathan Peyton template <> 741f6399367SJonathan Peyton inline void 742f6399367SJonathan Peyton __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb, 743f6399367SJonathan Peyton kmp_uint64 ub, kmp_int64 st) { 744f6399367SJonathan Peyton __kmp_dispatch_init_hierarchy<kmp_uint64>( 745f6399367SJonathan Peyton loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 746f6399367SJonathan Peyton __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st); 747f6399367SJonathan 
Peyton } 748f6399367SJonathan Peyton 749f6399367SJonathan Peyton // free all the hierarchy scheduling memory associated with the team 750f6399367SJonathan Peyton void __kmp_dispatch_free_hierarchies(kmp_team_t *team) { 751f6399367SJonathan Peyton int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2; 752f6399367SJonathan Peyton for (int i = 0; i < num_disp_buff; ++i) { 753f6399367SJonathan Peyton // type does not matter here so use kmp_int32 754f6399367SJonathan Peyton auto sh = 755f6399367SJonathan Peyton reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>( 756f6399367SJonathan Peyton &team->t.t_disp_buffer[i]); 757f6399367SJonathan Peyton if (sh->hier) { 758f6399367SJonathan Peyton sh->hier->deallocate(); 759f6399367SJonathan Peyton __kmp_free(sh->hier); 760f6399367SJonathan Peyton } 761f6399367SJonathan Peyton } 762f6399367SJonathan Peyton } 763f6399367SJonathan Peyton #endif 764f6399367SJonathan Peyton 76539ada854SJonathan Peyton // UT - unsigned flavor of T, ST - signed flavor of T, 76639ada854SJonathan Peyton // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8 76739ada854SJonathan Peyton template <typename T> 76839ada854SJonathan Peyton static void 76939ada854SJonathan Peyton __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb, 77039ada854SJonathan Peyton T ub, typename traits_t<T>::signed_t st, 77139ada854SJonathan Peyton typename traits_t<T>::signed_t chunk, int push_ws) { 77239ada854SJonathan Peyton typedef typename traits_t<T>::unsigned_t UT; 77339ada854SJonathan Peyton 77439ada854SJonathan Peyton int active; 77539ada854SJonathan Peyton kmp_info_t *th; 77639ada854SJonathan Peyton kmp_team_t *team; 77739ada854SJonathan Peyton kmp_uint32 my_buffer_index; 77839ada854SJonathan Peyton dispatch_private_info_template<T> *pr; 77939ada854SJonathan Peyton dispatch_shared_info_template<T> volatile *sh; 78039ada854SJonathan Peyton 78139ada854SJonathan Peyton 
KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) == 78239ada854SJonathan Peyton sizeof(dispatch_private_info)); 78339ada854SJonathan Peyton KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) == 78439ada854SJonathan Peyton sizeof(dispatch_shared_info)); 785787eb0c6SAndreyChurbanov __kmp_assert_valid_gtid(gtid); 78639ada854SJonathan Peyton 78739ada854SJonathan Peyton if (!TCR_4(__kmp_init_parallel)) 78839ada854SJonathan Peyton __kmp_parallel_initialize(); 78939ada854SJonathan Peyton 7909b8bb323SJonathan Peyton __kmp_resume_if_soft_paused(); 7919b8bb323SJonathan Peyton 79239ada854SJonathan Peyton #if INCLUDE_SSC_MARKS 79339ada854SJonathan Peyton SSC_MARK_DISPATCH_INIT(); 79439ada854SJonathan Peyton #endif 79539ada854SJonathan Peyton #ifdef KMP_DEBUG 796baad3f60SJonathan Peyton typedef typename traits_t<T>::signed_t ST; 79739ada854SJonathan Peyton { 79839ada854SJonathan Peyton char *buff; 79939ada854SJonathan Peyton // create format specifiers before the debug output 80039ada854SJonathan Peyton buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d " 80139ada854SJonathan Peyton "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n", 80239ada854SJonathan Peyton traits_t<ST>::spec, traits_t<T>::spec, 80339ada854SJonathan Peyton traits_t<T>::spec, traits_t<ST>::spec); 80439ada854SJonathan Peyton KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st)); 80539ada854SJonathan Peyton __kmp_str_free(&buff); 80639ada854SJonathan Peyton } 80739ada854SJonathan Peyton #endif 80839ada854SJonathan Peyton /* setup data */ 80939ada854SJonathan Peyton th = __kmp_threads[gtid]; 81039ada854SJonathan Peyton team = th->th.th_team; 81139ada854SJonathan Peyton active = !team->t.t_serialized; 81239ada854SJonathan Peyton th->th.th_ident = loc; 81339ada854SJonathan Peyton 814f0682ac4SJonathan Peyton // Any half-decent optimizer will remove this test when the blocks are empty 815f0682ac4SJonathan Peyton // since the macros expand to nothing 816f0682ac4SJonathan Peyton // when 
statistics are disabled. 817f0682ac4SJonathan Peyton if (schedule == __kmp_static) { 818f0682ac4SJonathan Peyton KMP_COUNT_BLOCK(OMP_LOOP_STATIC); 819f0682ac4SJonathan Peyton } else { 820f0682ac4SJonathan Peyton KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC); 821f0682ac4SJonathan Peyton } 822f0682ac4SJonathan Peyton 823f6399367SJonathan Peyton #if KMP_USE_HIER_SCHED 824f6399367SJonathan Peyton // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable 825f6399367SJonathan Peyton // Hierarchical scheduling does not work with ordered, so if ordered is 826f6399367SJonathan Peyton // detected, then revert back to threaded scheduling. 827f6399367SJonathan Peyton bool ordered; 828f6399367SJonathan Peyton enum sched_type my_sched = schedule; 829f6399367SJonathan Peyton my_buffer_index = th->th.th_dispatch->th_disp_index; 830f6399367SJonathan Peyton pr = reinterpret_cast<dispatch_private_info_template<T> *>( 831f6399367SJonathan Peyton &th->th.th_dispatch 832f6399367SJonathan Peyton ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 833f6399367SJonathan Peyton my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched); 834f6399367SJonathan Peyton if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper)) 835f6399367SJonathan Peyton my_sched = 836f6399367SJonathan Peyton (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower)); 837f6399367SJonathan Peyton ordered = (kmp_ord_lower & my_sched); 838f6399367SJonathan Peyton if (pr->flags.use_hier) { 839f6399367SJonathan Peyton if (ordered) { 840f6399367SJonathan Peyton KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. 
" 841f6399367SJonathan Peyton "Disabling hierarchical scheduling.\n", 842f6399367SJonathan Peyton gtid)); 843f6399367SJonathan Peyton pr->flags.use_hier = FALSE; 844f6399367SJonathan Peyton } 845f6399367SJonathan Peyton } 846f6399367SJonathan Peyton if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) { 847f6399367SJonathan Peyton // Don't use hierarchical for ordered parallel loops and don't 848f6399367SJonathan Peyton // use the runtime hierarchy if one was specified in the program 849f6399367SJonathan Peyton if (!ordered && !pr->flags.use_hier) 850f6399367SJonathan Peyton __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st); 851f6399367SJonathan Peyton } 852f6399367SJonathan Peyton #endif // KMP_USE_HIER_SCHED 853f6399367SJonathan Peyton 85439ada854SJonathan Peyton #if USE_ITT_BUILD 85539ada854SJonathan Peyton kmp_uint64 cur_chunk = chunk; 856e4b4f994SJonathan Peyton int itt_need_metadata_reporting = 857e4b4f994SJonathan Peyton __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 858e4b4f994SJonathan Peyton KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && 85939ada854SJonathan Peyton team->t.t_active_level == 1; 86039ada854SJonathan Peyton #endif 86139ada854SJonathan Peyton if (!active) { 86239ada854SJonathan Peyton pr = reinterpret_cast<dispatch_private_info_template<T> *>( 86339ada854SJonathan Peyton th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 86439ada854SJonathan Peyton } else { 86539ada854SJonathan Peyton KMP_DEBUG_ASSERT(th->th.th_dispatch == 86639ada854SJonathan Peyton &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 86739ada854SJonathan Peyton 86839ada854SJonathan Peyton my_buffer_index = th->th.th_dispatch->th_disp_index++; 86939ada854SJonathan Peyton 87039ada854SJonathan Peyton /* What happens when number of threads changes, need to resize buffer? 
*/ 87139ada854SJonathan Peyton pr = reinterpret_cast<dispatch_private_info_template<T> *>( 87239ada854SJonathan Peyton &th->th.th_dispatch 87339ada854SJonathan Peyton ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 87439ada854SJonathan Peyton sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 87539ada854SJonathan Peyton &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 87639ada854SJonathan Peyton KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid, 87739ada854SJonathan Peyton my_buffer_index)); 87839ada854SJonathan Peyton } 87939ada854SJonathan Peyton 88039ada854SJonathan Peyton __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st, 88139ada854SJonathan Peyton #if USE_ITT_BUILD 88239ada854SJonathan Peyton &cur_chunk, 88339ada854SJonathan Peyton #endif 88439ada854SJonathan Peyton chunk, (T)th->th.th_team_nproc, 88539ada854SJonathan Peyton (T)th->th.th_info.ds.ds_tid); 88639ada854SJonathan Peyton if (active) { 88739ada854SJonathan Peyton if (pr->flags.ordered == 0) { 88839ada854SJonathan Peyton th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error; 88939ada854SJonathan Peyton th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error; 89039ada854SJonathan Peyton } else { 89139ada854SJonathan Peyton th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>; 89239ada854SJonathan Peyton th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>; 89339ada854SJonathan Peyton } 89439ada854SJonathan Peyton } 89539ada854SJonathan Peyton 8965e8470afSJim Cownie if (active) { 8973041982dSJonathan Peyton /* The name of this buffer should be my_buffer_index when it's free to use 8983041982dSJonathan Peyton * it */ 8995e8470afSJim Cownie 9003041982dSJonathan Peyton KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d " 9013041982dSJonathan Peyton "sh->buffer_index:%d\n", 9025e8470afSJim Cownie gtid, my_buffer_index, sh->buffer_index)); 903e47d32f1SJonathan Peyton 
__kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index, 9043041982dSJonathan Peyton __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL)); 905e47d32f1SJonathan Peyton // Note: KMP_WAIT() cannot be used there: buffer index and 9063041982dSJonathan Peyton // my_buffer_index are *always* 32-bit integers. 9075e8470afSJim Cownie KMP_MB(); /* is this necessary? */ 9083041982dSJonathan Peyton KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d " 9093041982dSJonathan Peyton "sh->buffer_index:%d\n", 9105e8470afSJim Cownie gtid, my_buffer_index, sh->buffer_index)); 9115e8470afSJim Cownie 9125e8470afSJim Cownie th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr; 913c47afcd9SAndrey Churbanov th->th.th_dispatch->th_dispatch_sh_current = 9145ba90c79SAndrey Churbanov CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh); 9155e8470afSJim Cownie #if USE_ITT_BUILD 91639ada854SJonathan Peyton if (pr->flags.ordered) { 9175e8470afSJim Cownie __kmp_itt_ordered_init(gtid); 918bd3a7633SJonathan Peyton } 9194cc4bb4cSJim Cownie // Report loop metadata 92051aecb82SAndrey Churbanov if (itt_need_metadata_reporting) { 92151aecb82SAndrey Churbanov // Only report metadata by master of active team at level 1 9224cc4bb4cSJim Cownie kmp_uint64 schedtype = 0; 9234cc4bb4cSJim Cownie switch (schedule) { 9244cc4bb4cSJim Cownie case kmp_sch_static_chunked: 9254cc4bb4cSJim Cownie case kmp_sch_static_balanced: // Chunk is calculated in the switch above 9264cc4bb4cSJim Cownie break; 9274cc4bb4cSJim Cownie case kmp_sch_static_greedy: 9284cc4bb4cSJim Cownie cur_chunk = pr->u.p.parm1; 9294cc4bb4cSJim Cownie break; 9304cc4bb4cSJim Cownie case kmp_sch_dynamic_chunked: 9314cc4bb4cSJim Cownie schedtype = 1; 9324cc4bb4cSJim Cownie break; 9334cc4bb4cSJim Cownie case kmp_sch_guided_iterative_chunked: 9344cc4bb4cSJim Cownie case kmp_sch_guided_analytical_chunked: 935d454c73cSAndrey Churbanov case kmp_sch_guided_simd: 9364cc4bb4cSJim Cownie schedtype = 2; 
9374cc4bb4cSJim Cownie break; 9384cc4bb4cSJim Cownie default: 9394cc4bb4cSJim Cownie // Should we put this case under "static"? 9404cc4bb4cSJim Cownie // case kmp_sch_static_steal: 9414cc4bb4cSJim Cownie schedtype = 3; 9424cc4bb4cSJim Cownie break; 9434cc4bb4cSJim Cownie } 94439ada854SJonathan Peyton __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk); 9454cc4bb4cSJim Cownie } 946f6399367SJonathan Peyton #if KMP_USE_HIER_SCHED 947f6399367SJonathan Peyton if (pr->flags.use_hier) { 948f6399367SJonathan Peyton pr->u.p.count = 0; 949f6399367SJonathan Peyton pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0; 950f6399367SJonathan Peyton } 951f6399367SJonathan Peyton #endif // KMP_USER_HIER_SCHED 9524cc4bb4cSJim Cownie #endif /* USE_ITT_BUILD */ 953bd3a7633SJonathan Peyton } 9544cc4bb4cSJim Cownie 9555e8470afSJim Cownie #ifdef KMP_DEBUG 9565e8470afSJim Cownie { 957aeb40adaSJonas Hahnfeld char *buff; 9585e8470afSJim Cownie // create format specifiers before the debug output 9595e8470afSJim Cownie buff = __kmp_str_format( 9603041982dSJonathan Peyton "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s " 9613041982dSJonathan Peyton "lb:%%%s ub:%%%s" 9623041982dSJonathan Peyton " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" 9635e8470afSJim Cownie " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n", 9645e8470afSJim Cownie traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec, 9655e8470afSJim Cownie traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec, 9665e8470afSJim Cownie traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec, 9675e8470afSJim Cownie traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec); 96839ada854SJonathan Peyton KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb, 96939ada854SJonathan Peyton pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count, 97039ada854SJonathan Peyton pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1, 97139ada854SJonathan Peyton pr->u.p.parm2, 
pr->u.p.parm3, pr->u.p.parm4)); 9725e8470afSJim Cownie __kmp_str_free(&buff); 9735e8470afSJim Cownie } 9745e8470afSJim Cownie #endif 9755e8470afSJim Cownie #if (KMP_STATIC_STEAL_ENABLED) 9763041982dSJonathan Peyton // It cannot be guaranteed that after execution of a loop with some other 9773041982dSJonathan Peyton // schedule kind all the parm3 variables will contain the same value. Even if 9783041982dSJonathan Peyton // all parm3 will be the same, it still exists a bad case like using 0 and 1 9793041982dSJonathan Peyton // rather than program life-time increment. So the dedicated variable is 9803041982dSJonathan Peyton // required. The 'static_steal_counter' is used. 981abe64360SAndreyChurbanov if (pr->schedule == kmp_sch_static_steal) { 9825e8470afSJim Cownie // Other threads will inspect this variable when searching for a victim. 9833041982dSJonathan Peyton // This is a flag showing that other threads may steal from this thread 9843041982dSJonathan Peyton // since then. 9855e8470afSJim Cownie volatile T *p = &pr->u.p.static_steal_counter; 9865e8470afSJim Cownie *p = *p + 1; 9875e8470afSJim Cownie } 988429dbc2aSAndrey Churbanov #endif // ( KMP_STATIC_STEAL_ENABLED ) 989d7d088f8SAndrey Churbanov 99082e94a59SJoachim Protze #if OMPT_SUPPORT && OMPT_OPTIONAL 99182e94a59SJoachim Protze if (ompt_enabled.ompt_callback_work) { 992d7d088f8SAndrey Churbanov ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 99382e94a59SJoachim Protze ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 99482e94a59SJoachim Protze ompt_callbacks.ompt_callback(ompt_callback_work)( 99582e94a59SJoachim Protze ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data), 99639ada854SJonathan Peyton &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid)); 997d7d088f8SAndrey Churbanov } 998d7d088f8SAndrey Churbanov #endif 999f0682ac4SJonathan Peyton KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic); 10005e8470afSJim Cownie } 10015e8470afSJim Cownie 10023041982dSJonathan 
Peyton /* For ordered loops, either __kmp_dispatch_finish() should be called after 10035e8470afSJim Cownie * every iteration, or __kmp_dispatch_finish_chunk() should be called after 10045e8470afSJim Cownie * every chunk of iterations. If the ordered section(s) were not executed 10055e8470afSJim Cownie * for this iteration (or every iteration in this chunk), we need to set the 10063041982dSJonathan Peyton * ordered iteration counters so that the next thread can proceed. */ 10075e8470afSJim Cownie template <typename UT> 10083041982dSJonathan Peyton static void __kmp_dispatch_finish(int gtid, ident_t *loc) { 10095e8470afSJim Cownie typedef typename traits_t<UT>::signed_t ST; 1010787eb0c6SAndreyChurbanov __kmp_assert_valid_gtid(gtid); 10115e8470afSJim Cownie kmp_info_t *th = __kmp_threads[gtid]; 10125e8470afSJim Cownie 10135e8470afSJim Cownie KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid)); 10145e8470afSJim Cownie if (!th->th.th_team->t.t_serialized) { 10155e8470afSJim Cownie 10165e8470afSJim Cownie dispatch_private_info_template<UT> *pr = 10173041982dSJonathan Peyton reinterpret_cast<dispatch_private_info_template<UT> *>( 10183041982dSJonathan Peyton th->th.th_dispatch->th_dispatch_pr_current); 10195e8470afSJim Cownie dispatch_shared_info_template<UT> volatile *sh = 10203041982dSJonathan Peyton reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 10213041982dSJonathan Peyton th->th.th_dispatch->th_dispatch_sh_current); 10225e8470afSJim Cownie KMP_DEBUG_ASSERT(pr); 10235e8470afSJim Cownie KMP_DEBUG_ASSERT(sh); 10245e8470afSJim Cownie KMP_DEBUG_ASSERT(th->th.th_dispatch == 10255e8470afSJim Cownie &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 10265e8470afSJim Cownie 10275e8470afSJim Cownie if (pr->ordered_bumped) { 10283041982dSJonathan Peyton KD_TRACE( 10293041982dSJonathan Peyton 1000, 10303041982dSJonathan Peyton ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 10315e8470afSJim Cownie gtid)); 10325e8470afSJim 
Cownie pr->ordered_bumped = 0; 10335e8470afSJim Cownie } else { 10345e8470afSJim Cownie UT lower = pr->u.p.ordered_lower; 10355e8470afSJim Cownie 10365e8470afSJim Cownie #ifdef KMP_DEBUG 10375e8470afSJim Cownie { 1038aeb40adaSJonas Hahnfeld char *buff; 10395e8470afSJim Cownie // create format specifiers before the debug output 10403041982dSJonathan Peyton buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: " 10413041982dSJonathan Peyton "ordered_iteration:%%%s lower:%%%s\n", 10425e8470afSJim Cownie traits_t<UT>::spec, traits_t<UT>::spec); 10435e8470afSJim Cownie KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 10445e8470afSJim Cownie __kmp_str_free(&buff); 10455e8470afSJim Cownie } 10465e8470afSJim Cownie #endif 10475e8470afSJim Cownie 1048e47d32f1SJonathan Peyton __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower, 10493041982dSJonathan Peyton __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 10505e8470afSJim Cownie KMP_MB(); /* is this necessary? */ 10515e8470afSJim Cownie #ifdef KMP_DEBUG 10525e8470afSJim Cownie { 1053aeb40adaSJonas Hahnfeld char *buff; 10545e8470afSJim Cownie // create format specifiers before the debug output 10553041982dSJonathan Peyton buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: " 10563041982dSJonathan Peyton "ordered_iteration:%%%s lower:%%%s\n", 10575e8470afSJim Cownie traits_t<UT>::spec, traits_t<UT>::spec); 10585e8470afSJim Cownie KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 10595e8470afSJim Cownie __kmp_str_free(&buff); 10605e8470afSJim Cownie } 10615e8470afSJim Cownie #endif 10625e8470afSJim Cownie 10635e8470afSJim Cownie test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration); 10645e8470afSJim Cownie } // if 10655e8470afSJim Cownie } // if 10665e8470afSJim Cownie KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid)); 10675e8470afSJim Cownie } 10685e8470afSJim Cownie 10695e8470afSJim Cownie #ifdef KMP_GOMP_COMPAT 10705e8470afSJim Cownie 10715e8470afSJim Cownie 
template <typename UT> 10723041982dSJonathan Peyton static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) { 10735e8470afSJim Cownie typedef typename traits_t<UT>::signed_t ST; 1074787eb0c6SAndreyChurbanov __kmp_assert_valid_gtid(gtid); 10755e8470afSJim Cownie kmp_info_t *th = __kmp_threads[gtid]; 10765e8470afSJim Cownie 10775e8470afSJim Cownie KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid)); 10785e8470afSJim Cownie if (!th->th.th_team->t.t_serialized) { 10795e8470afSJim Cownie // int cid; 10805e8470afSJim Cownie dispatch_private_info_template<UT> *pr = 10813041982dSJonathan Peyton reinterpret_cast<dispatch_private_info_template<UT> *>( 10823041982dSJonathan Peyton th->th.th_dispatch->th_dispatch_pr_current); 10835e8470afSJim Cownie dispatch_shared_info_template<UT> volatile *sh = 10843041982dSJonathan Peyton reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 10853041982dSJonathan Peyton th->th.th_dispatch->th_dispatch_sh_current); 10865e8470afSJim Cownie KMP_DEBUG_ASSERT(pr); 10875e8470afSJim Cownie KMP_DEBUG_ASSERT(sh); 10885e8470afSJim Cownie KMP_DEBUG_ASSERT(th->th.th_dispatch == 10895e8470afSJim Cownie &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 10905e8470afSJim Cownie 10915e8470afSJim Cownie // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) { 10925e8470afSJim Cownie UT lower = pr->u.p.ordered_lower; 10935e8470afSJim Cownie UT upper = pr->u.p.ordered_upper; 10945e8470afSJim Cownie UT inc = upper - lower + 1; 10955e8470afSJim Cownie 10965e8470afSJim Cownie if (pr->ordered_bumped == inc) { 10973041982dSJonathan Peyton KD_TRACE( 10983041982dSJonathan Peyton 1000, 10993041982dSJonathan Peyton ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 11005e8470afSJim Cownie gtid)); 11015e8470afSJim Cownie pr->ordered_bumped = 0; 11025e8470afSJim Cownie } else { 11035e8470afSJim Cownie inc -= pr->ordered_bumped; 11045e8470afSJim Cownie 11055e8470afSJim Cownie #ifdef KMP_DEBUG 11065e8470afSJim Cownie { 
1107aeb40adaSJonas Hahnfeld char *buff; 11085e8470afSJim Cownie // create format specifiers before the debug output 11095e8470afSJim Cownie buff = __kmp_str_format( 11103041982dSJonathan Peyton "__kmp_dispatch_finish_chunk: T#%%d before wait: " 11115e8470afSJim Cownie "ordered_iteration:%%%s lower:%%%s upper:%%%s\n", 11125e8470afSJim Cownie traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec); 11135e8470afSJim Cownie KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper)); 11145e8470afSJim Cownie __kmp_str_free(&buff); 11155e8470afSJim Cownie } 11165e8470afSJim Cownie #endif 11175e8470afSJim Cownie 1118e47d32f1SJonathan Peyton __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower, 11193041982dSJonathan Peyton __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 11205e8470afSJim Cownie 11215e8470afSJim Cownie KMP_MB(); /* is this necessary? */ 11223041982dSJonathan Peyton KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting " 11233041982dSJonathan Peyton "ordered_bumped to zero\n", 11245e8470afSJim Cownie gtid)); 11255e8470afSJim Cownie pr->ordered_bumped = 0; 11265e8470afSJim Cownie //!!!!! TODO check if the inc should be unsigned, or signed??? 
11275e8470afSJim Cownie #ifdef KMP_DEBUG 11285e8470afSJim Cownie { 1129aeb40adaSJonas Hahnfeld char *buff; 11305e8470afSJim Cownie // create format specifiers before the debug output 11315e8470afSJim Cownie buff = __kmp_str_format( 11323041982dSJonathan Peyton "__kmp_dispatch_finish_chunk: T#%%d after wait: " 11335e8470afSJim Cownie "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n", 11343041982dSJonathan Peyton traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec, 11353041982dSJonathan Peyton traits_t<UT>::spec); 11363041982dSJonathan Peyton KD_TRACE(1000, 11373041982dSJonathan Peyton (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper)); 11385e8470afSJim Cownie __kmp_str_free(&buff); 11395e8470afSJim Cownie } 11405e8470afSJim Cownie #endif 11415e8470afSJim Cownie 11425e8470afSJim Cownie test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc); 11435e8470afSJim Cownie } 11445e8470afSJim Cownie // } 11455e8470afSJim Cownie } 11465e8470afSJim Cownie KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid)); 11475e8470afSJim Cownie } 11485e8470afSJim Cownie 11495e8470afSJim Cownie #endif /* KMP_GOMP_COMPAT */ 11505e8470afSJim Cownie 11515e8470afSJim Cownie template <typename T> 115239ada854SJonathan Peyton int __kmp_dispatch_next_algorithm(int gtid, 115339ada854SJonathan Peyton dispatch_private_info_template<T> *pr, 115439ada854SJonathan Peyton dispatch_shared_info_template<T> volatile *sh, 115539ada854SJonathan Peyton kmp_int32 *p_last, T *p_lb, T *p_ub, 115639ada854SJonathan Peyton typename traits_t<T>::signed_t *p_st, T nproc, 115739ada854SJonathan Peyton T tid) { 11585e8470afSJim Cownie typedef typename traits_t<T>::unsigned_t UT; 11595e8470afSJim Cownie typedef typename traits_t<T>::signed_t ST; 11605e8470afSJim Cownie typedef typename traits_t<T>::floating_t DBL; 116139ada854SJonathan Peyton int status = 0; 11626b316febSTerry Wilmarth bool last = false; 116339ada854SJonathan Peyton T start; 116439ada854SJonathan Peyton ST 
incr; 116539ada854SJonathan Peyton UT limit, trip, init; 11665e8470afSJim Cownie kmp_info_t *th = __kmp_threads[gtid]; 11675e8470afSJim Cownie kmp_team_t *team = th->th.th_team; 11685e8470afSJim Cownie 11695e8470afSJim Cownie KMP_DEBUG_ASSERT(th->th.th_dispatch == 11705e8470afSJim Cownie &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 11715e8470afSJim Cownie KMP_DEBUG_ASSERT(pr); 11725e8470afSJim Cownie KMP_DEBUG_ASSERT(sh); 117339ada854SJonathan Peyton KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc); 117439ada854SJonathan Peyton #ifdef KMP_DEBUG 117539ada854SJonathan Peyton { 117639ada854SJonathan Peyton char *buff; 117739ada854SJonathan Peyton // create format specifiers before the debug output 117839ada854SJonathan Peyton buff = 117939ada854SJonathan Peyton __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p " 118039ada854SJonathan Peyton "sh:%%p nproc:%%%s tid:%%%s\n", 118139ada854SJonathan Peyton traits_t<T>::spec, traits_t<T>::spec); 118239ada854SJonathan Peyton KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid)); 118339ada854SJonathan Peyton __kmp_str_free(&buff); 118439ada854SJonathan Peyton } 118539ada854SJonathan Peyton #endif 11865e8470afSJim Cownie 11875e8470afSJim Cownie // zero trip count 118839ada854SJonathan Peyton if (pr->u.p.tc == 0) { 118939ada854SJonathan Peyton KD_TRACE(10, 119039ada854SJonathan Peyton ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is " 119139ada854SJonathan Peyton "zero status:%d\n", 119239ada854SJonathan Peyton gtid, status)); 119339ada854SJonathan Peyton return 0; 119439ada854SJonathan Peyton } 119539ada854SJonathan Peyton 11965e8470afSJim Cownie switch (pr->schedule) { 1197429dbc2aSAndrey Churbanov #if (KMP_STATIC_STEAL_ENABLED) 11983041982dSJonathan Peyton case kmp_sch_static_steal: { 11995e8470afSJim Cownie T chunk = pr->u.p.parm1; 12005e8470afSJim Cownie 120139ada854SJonathan Peyton KD_TRACE(100, 120239ada854SJonathan Peyton ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n", 
12033041982dSJonathan Peyton gtid)); 12045e8470afSJim Cownie 12055e8470afSJim Cownie trip = pr->u.p.tc - 1; 12065e8470afSJim Cownie 120712313d44SJonathan Peyton if (traits_t<T>::type_size > 4) { 1208429dbc2aSAndrey Churbanov // use lock for 8-byte and CAS for 4-byte induction 1209429dbc2aSAndrey Churbanov // variable. TODO (optional): check and use 16-byte CAS 1210abe64360SAndreyChurbanov kmp_lock_t *lck = pr->u.p.th_steal_lock; 1211429dbc2aSAndrey Churbanov KMP_DEBUG_ASSERT(lck != NULL); 1212429dbc2aSAndrey Churbanov if (pr->u.p.count < (UT)pr->u.p.ub) { 1213429dbc2aSAndrey Churbanov __kmp_acquire_lock(lck, gtid); 1214429dbc2aSAndrey Churbanov // try to get own chunk of iterations 12155e8470afSJim Cownie init = (pr->u.p.count)++; 12165e8470afSJim Cownie status = (init < (UT)pr->u.p.ub); 1217429dbc2aSAndrey Churbanov __kmp_release_lock(lck, gtid); 12185e8470afSJim Cownie } else { 1219429dbc2aSAndrey Churbanov status = 0; // no own chunks 1220429dbc2aSAndrey Churbanov } 1221429dbc2aSAndrey Churbanov if (!status) { // try to steal 1222429dbc2aSAndrey Churbanov kmp_info_t **other_threads = team->t.t_threads; 12236b316febSTerry Wilmarth T while_limit = pr->u.p.parm3; 12246b316febSTerry Wilmarth T while_index = 0; 1225abe64360SAndreyChurbanov T id = pr->u.p.static_steal_counter; // loop id 1226abe64360SAndreyChurbanov int idx = (th->th.th_dispatch->th_disp_index - 1) % 1227abe64360SAndreyChurbanov __kmp_dispatch_num_buffers; // current loop index 1228abe64360SAndreyChurbanov // note: victim thread can potentially execute another loop 1229429dbc2aSAndrey Churbanov // TODO: algorithm of searching for a victim 1230429dbc2aSAndrey Churbanov // should be cleaned up and measured 1231429dbc2aSAndrey Churbanov while ((!status) && (while_limit != ++while_index)) { 1232abe64360SAndreyChurbanov dispatch_private_info_template<T> *victim; 1233429dbc2aSAndrey Churbanov T remaining; 1234429dbc2aSAndrey Churbanov T victimIdx = pr->u.p.parm4; 1235429dbc2aSAndrey Churbanov T oldVictimIdx 
= victimIdx ? victimIdx - 1 : nproc - 1; 1236abe64360SAndreyChurbanov victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1237abe64360SAndreyChurbanov &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); 1238abe64360SAndreyChurbanov KMP_DEBUG_ASSERT(victim); 1239abe64360SAndreyChurbanov while ((victim == pr || id != victim->u.p.static_steal_counter) && 12403041982dSJonathan Peyton oldVictimIdx != victimIdx) { 1241429dbc2aSAndrey Churbanov victimIdx = (victimIdx + 1) % nproc; 12423041982dSJonathan Peyton victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1243abe64360SAndreyChurbanov &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); 1244abe64360SAndreyChurbanov KMP_DEBUG_ASSERT(victim); 1245bd3a7633SJonathan Peyton } 1246abe64360SAndreyChurbanov if (victim == pr || id != victim->u.p.static_steal_counter) { 1247429dbc2aSAndrey Churbanov continue; // try once more (nproc attempts in total) 1248429dbc2aSAndrey Churbanov // no victim is ready yet to participate in stealing 1249abe64360SAndreyChurbanov // because no victim passed kmp_init_dispatch yet 1250429dbc2aSAndrey Churbanov } 1251429dbc2aSAndrey Churbanov if (victim->u.p.count + 2 > (UT)victim->u.p.ub) { 1252429dbc2aSAndrey Churbanov pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid 1253429dbc2aSAndrey Churbanov continue; // not enough chunks to steal, goto next victim 1254429dbc2aSAndrey Churbanov } 1255429dbc2aSAndrey Churbanov 1256abe64360SAndreyChurbanov lck = victim->u.p.th_steal_lock; 1257429dbc2aSAndrey Churbanov KMP_ASSERT(lck != NULL); 1258429dbc2aSAndrey Churbanov __kmp_acquire_lock(lck, gtid); 1259429dbc2aSAndrey Churbanov limit = victim->u.p.ub; // keep initial ub 1260429dbc2aSAndrey Churbanov if (victim->u.p.count >= limit || 12613041982dSJonathan Peyton (remaining = limit - victim->u.p.count) < 2) { 1262429dbc2aSAndrey Churbanov __kmp_release_lock(lck, gtid); 1263429dbc2aSAndrey Churbanov pr->u.p.parm4 = (victimIdx + 1) % nproc; // next 
victim 1264429dbc2aSAndrey Churbanov continue; // not enough chunks to steal 1265429dbc2aSAndrey Churbanov } 126642016791SKazuaki Ishizaki // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or 126739ada854SJonathan Peyton // by 1 1268429dbc2aSAndrey Churbanov if (remaining > 3) { 126939ada854SJonathan Peyton // steal 1/4 of remaining 1270f0682ac4SJonathan Peyton KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2); 127139ada854SJonathan Peyton init = (victim->u.p.ub -= (remaining >> 2)); 1272429dbc2aSAndrey Churbanov } else { 127339ada854SJonathan Peyton // steal 1 chunk of 2 or 3 remaining 1274f0682ac4SJonathan Peyton KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1); 127539ada854SJonathan Peyton init = (victim->u.p.ub -= 1); 1276429dbc2aSAndrey Churbanov } 1277429dbc2aSAndrey Churbanov __kmp_release_lock(lck, gtid); 1278429dbc2aSAndrey Churbanov 1279429dbc2aSAndrey Churbanov KMP_DEBUG_ASSERT(init + 1 <= limit); 1280429dbc2aSAndrey Churbanov pr->u.p.parm4 = victimIdx; // remember victim to steal from 1281429dbc2aSAndrey Churbanov status = 1; 1282429dbc2aSAndrey Churbanov while_index = 0; 1283429dbc2aSAndrey Churbanov // now update own count and ub with stolen range but init chunk 1284abe64360SAndreyChurbanov __kmp_acquire_lock(pr->u.p.th_steal_lock, gtid); 1285429dbc2aSAndrey Churbanov pr->u.p.count = init + 1; 1286429dbc2aSAndrey Churbanov pr->u.p.ub = limit; 1287abe64360SAndreyChurbanov __kmp_release_lock(pr->u.p.th_steal_lock, gtid); 1288429dbc2aSAndrey Churbanov } // while (search for victim) 1289429dbc2aSAndrey Churbanov } // if (try to find victim and steal) 1290429dbc2aSAndrey Churbanov } else { 1291429dbc2aSAndrey Churbanov // 4-byte induction variable, use 8-byte CAS for pair (count, ub) 12925e8470afSJim Cownie typedef union { 12935e8470afSJim Cownie struct { 12945e8470afSJim Cownie UT count; 12955e8470afSJim Cownie T ub; 12965e8470afSJim Cownie } p; 12975e8470afSJim Cownie kmp_int64 b; 12985e8470afSJim Cownie } union_i4; 
12993041982dSJonathan Peyton // All operations on 'count' or 'ub' must be combined atomically 13003041982dSJonathan Peyton // together. 13015e8470afSJim Cownie { 13025e8470afSJim Cownie union_i4 vold, vnew; 13035e8470afSJim Cownie vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 13045e8470afSJim Cownie vnew = vold; 13055e8470afSJim Cownie vnew.p.count++; 13065e8470afSJim Cownie while (!KMP_COMPARE_AND_STORE_ACQ64( 13075e8470afSJim Cownie (volatile kmp_int64 *)&pr->u.p.count, 13085e8470afSJim Cownie *VOLATILE_CAST(kmp_int64 *) & vold.b, 13095e8470afSJim Cownie *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 13105e8470afSJim Cownie KMP_CPU_PAUSE(); 13115e8470afSJim Cownie vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 13125e8470afSJim Cownie vnew = vold; 13135e8470afSJim Cownie vnew.p.count++; 13145e8470afSJim Cownie } 13155e8470afSJim Cownie vnew = vold; 13165e8470afSJim Cownie init = vnew.p.count; 13175e8470afSJim Cownie status = (init < (UT)vnew.p.ub); 13185e8470afSJim Cownie } 13195e8470afSJim Cownie 13205e8470afSJim Cownie if (!status) { 13215e8470afSJim Cownie kmp_info_t **other_threads = team->t.t_threads; 13226b316febSTerry Wilmarth T while_limit = pr->u.p.parm3; 13236b316febSTerry Wilmarth T while_index = 0; 1324abe64360SAndreyChurbanov T id = pr->u.p.static_steal_counter; // loop id 1325abe64360SAndreyChurbanov int idx = (th->th.th_dispatch->th_disp_index - 1) % 1326abe64360SAndreyChurbanov __kmp_dispatch_num_buffers; // current loop index 1327abe64360SAndreyChurbanov // note: victim thread can potentially execute another loop 13285e8470afSJim Cownie // TODO: algorithm of searching for a victim 13295e8470afSJim Cownie // should be cleaned up and measured 13305e8470afSJim Cownie while ((!status) && (while_limit != ++while_index)) { 1331abe64360SAndreyChurbanov dispatch_private_info_template<T> *victim; 13325e8470afSJim Cownie union_i4 vold, vnew; 13336b316febSTerry Wilmarth T remaining; 13345e8470afSJim Cownie T victimIdx = pr->u.p.parm4; 1335429dbc2aSAndrey 
Churbanov T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1; 1336abe64360SAndreyChurbanov victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1337abe64360SAndreyChurbanov &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); 1338abe64360SAndreyChurbanov KMP_DEBUG_ASSERT(victim); 1339abe64360SAndreyChurbanov while ((victim == pr || id != victim->u.p.static_steal_counter) && 13403041982dSJonathan Peyton oldVictimIdx != victimIdx) { 1341429dbc2aSAndrey Churbanov victimIdx = (victimIdx + 1) % nproc; 13423041982dSJonathan Peyton victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1343abe64360SAndreyChurbanov &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); 1344abe64360SAndreyChurbanov KMP_DEBUG_ASSERT(victim); 1345bd3a7633SJonathan Peyton } 1346abe64360SAndreyChurbanov if (victim == pr || id != victim->u.p.static_steal_counter) { 1347429dbc2aSAndrey Churbanov continue; // try once more (nproc attempts in total) 1348429dbc2aSAndrey Churbanov // no victim is ready yet to participate in stealing 1349abe64360SAndreyChurbanov // because no victim passed kmp_init_dispatch yet 13505e8470afSJim Cownie } 1351429dbc2aSAndrey Churbanov pr->u.p.parm4 = victimIdx; // new victim found 1352429dbc2aSAndrey Churbanov while (1) { // CAS loop if victim has enough chunks to steal 13535e8470afSJim Cownie vold.b = *(volatile kmp_int64 *)(&victim->u.p.count); 13545e8470afSJim Cownie vnew = vold; 13555e8470afSJim Cownie 13565e8470afSJim Cownie KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1357429dbc2aSAndrey Churbanov if (vnew.p.count >= (UT)vnew.p.ub || 13583041982dSJonathan Peyton (remaining = vnew.p.ub - vnew.p.count) < 2) { 135939ada854SJonathan Peyton pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id 1360429dbc2aSAndrey Churbanov break; // not enough chunks to steal, goto next victim 13615e8470afSJim Cownie } 1362429dbc2aSAndrey Churbanov if (remaining > 3) { 13636b316febSTerry Wilmarth // try to steal 1/4 of 
remaining 13646b316febSTerry Wilmarth vnew.p.ub -= remaining >> 2; 1365429dbc2aSAndrey Churbanov } else { 1366429dbc2aSAndrey Churbanov vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining 1367429dbc2aSAndrey Churbanov } 13685e8470afSJim Cownie KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 13695e8470afSJim Cownie // TODO: Should this be acquire or release? 13705e8470afSJim Cownie if (KMP_COMPARE_AND_STORE_ACQ64( 13715e8470afSJim Cownie (volatile kmp_int64 *)&victim->u.p.count, 13725e8470afSJim Cownie *VOLATILE_CAST(kmp_int64 *) & vold.b, 13735e8470afSJim Cownie *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 137442016791SKazuaki Ishizaki // stealing succeeded 1375f0682ac4SJonathan Peyton KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1376f0682ac4SJonathan Peyton vold.p.ub - vnew.p.ub); 13775e8470afSJim Cownie status = 1; 13785e8470afSJim Cownie while_index = 0; 13795e8470afSJim Cownie // now update own count and ub 13805e8470afSJim Cownie init = vnew.p.ub; 13815e8470afSJim Cownie vold.p.count = init + 1; 1382429dbc2aSAndrey Churbanov #if KMP_ARCH_X86 138339ada854SJonathan Peyton KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b); 1384429dbc2aSAndrey Churbanov #else 13855e8470afSJim Cownie *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b; 1386429dbc2aSAndrey Churbanov #endif 13875e8470afSJim Cownie break; 1388429dbc2aSAndrey Churbanov } // if (check CAS result) 138942016791SKazuaki Ishizaki KMP_CPU_PAUSE(); // CAS failed, repeatedly attempt 1390429dbc2aSAndrey Churbanov } // while (try to steal from particular victim) 1391429dbc2aSAndrey Churbanov } // while (search for victim) 1392429dbc2aSAndrey Churbanov } // if (try to find victim and steal) 1393429dbc2aSAndrey Churbanov } // if (4-byte induction variable) 13945e8470afSJim Cownie if (!status) { 13955e8470afSJim Cownie *p_lb = 0; 13965e8470afSJim Cownie *p_ub = 0; 13973041982dSJonathan Peyton if (p_st != NULL) 13983041982dSJonathan Peyton *p_st = 0; 13995e8470afSJim Cownie } else { 
14005e8470afSJim Cownie start = pr->u.p.parm2; 14015e8470afSJim Cownie init *= chunk; 14025e8470afSJim Cownie limit = chunk + init - 1; 14035e8470afSJim Cownie incr = pr->u.p.st; 1404f0682ac4SJonathan Peyton KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1); 14055e8470afSJim Cownie 14065e8470afSJim Cownie KMP_DEBUG_ASSERT(init <= trip); 14075e8470afSJim Cownie if ((last = (limit >= trip)) != 0) 14085e8470afSJim Cownie limit = trip; 14093041982dSJonathan Peyton if (p_st != NULL) 14103041982dSJonathan Peyton *p_st = incr; 14115e8470afSJim Cownie 14125e8470afSJim Cownie if (incr == 1) { 14135e8470afSJim Cownie *p_lb = start + init; 14145e8470afSJim Cownie *p_ub = start + limit; 14155e8470afSJim Cownie } else { 14165e8470afSJim Cownie *p_lb = start + init * incr; 14175e8470afSJim Cownie *p_ub = start + limit * incr; 14185e8470afSJim Cownie } 14195e8470afSJim Cownie 142039ada854SJonathan Peyton if (pr->flags.ordered) { 14215e8470afSJim Cownie pr->u.p.ordered_lower = init; 14225e8470afSJim Cownie pr->u.p.ordered_upper = limit; 14235e8470afSJim Cownie } // if 14245e8470afSJim Cownie } // if 14255e8470afSJim Cownie break; 14265e8470afSJim Cownie } // case 1427429dbc2aSAndrey Churbanov #endif // ( KMP_STATIC_STEAL_ENABLED ) 14283041982dSJonathan Peyton case kmp_sch_static_balanced: { 14293041982dSJonathan Peyton KD_TRACE( 143039ada854SJonathan Peyton 10, 143139ada854SJonathan Peyton ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n", 143239ada854SJonathan Peyton gtid)); 143339ada854SJonathan Peyton /* check if thread has any iteration to do */ 143439ada854SJonathan Peyton if ((status = !pr->u.p.count) != 0) { 14355e8470afSJim Cownie pr->u.p.count = 1; 14365e8470afSJim Cownie *p_lb = pr->u.p.lb; 14375e8470afSJim Cownie *p_ub = pr->u.p.ub; 14386b316febSTerry Wilmarth last = (pr->u.p.parm1 != 0); 14394cc4bb4cSJim Cownie if (p_st != NULL) 14405e8470afSJim Cownie *p_st = pr->u.p.st; 14415e8470afSJim Cownie } else { /* no iterations to do */ 
14425e8470afSJim Cownie pr->u.p.lb = pr->u.p.ub + pr->u.p.st; 14435e8470afSJim Cownie } 14445e8470afSJim Cownie } // case 14455e8470afSJim Cownie break; 14463041982dSJonathan Peyton case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was 14473041982dSJonathan Peyton merged here */ 14483041982dSJonathan Peyton case kmp_sch_static_chunked: { 14495e8470afSJim Cownie T parm1; 14505e8470afSJim Cownie 145139ada854SJonathan Peyton KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d " 14523041982dSJonathan Peyton "kmp_sch_static_[affinity|chunked] case\n", 14535e8470afSJim Cownie gtid)); 14545e8470afSJim Cownie parm1 = pr->u.p.parm1; 14555e8470afSJim Cownie 14565e8470afSJim Cownie trip = pr->u.p.tc - 1; 145739ada854SJonathan Peyton init = parm1 * (pr->u.p.count + tid); 14585e8470afSJim Cownie 14595e8470afSJim Cownie if ((status = (init <= trip)) != 0) { 14605e8470afSJim Cownie start = pr->u.p.lb; 14615e8470afSJim Cownie incr = pr->u.p.st; 14625e8470afSJim Cownie limit = parm1 + init - 1; 14635e8470afSJim Cownie 14645e8470afSJim Cownie if ((last = (limit >= trip)) != 0) 14655e8470afSJim Cownie limit = trip; 14665e8470afSJim Cownie 14673041982dSJonathan Peyton if (p_st != NULL) 14683041982dSJonathan Peyton *p_st = incr; 14695e8470afSJim Cownie 147039ada854SJonathan Peyton pr->u.p.count += nproc; 14715e8470afSJim Cownie 14725e8470afSJim Cownie if (incr == 1) { 14735e8470afSJim Cownie *p_lb = start + init; 14745e8470afSJim Cownie *p_ub = start + limit; 14753041982dSJonathan Peyton } else { 14765e8470afSJim Cownie *p_lb = start + init * incr; 14775e8470afSJim Cownie *p_ub = start + limit * incr; 14785e8470afSJim Cownie } 14795e8470afSJim Cownie 148039ada854SJonathan Peyton if (pr->flags.ordered) { 14815e8470afSJim Cownie pr->u.p.ordered_lower = init; 14825e8470afSJim Cownie pr->u.p.ordered_upper = limit; 14835e8470afSJim Cownie } // if 14845e8470afSJim Cownie } // if 14855e8470afSJim Cownie } // case 14865e8470afSJim Cownie break; 14875e8470afSJim Cownie 
14883041982dSJonathan Peyton case kmp_sch_dynamic_chunked: { 14895e8470afSJim Cownie T chunk = pr->u.p.parm1; 14905e8470afSJim Cownie 14913041982dSJonathan Peyton KD_TRACE( 14923041982dSJonathan Peyton 100, 149339ada854SJonathan Peyton ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n", 149439ada854SJonathan Peyton gtid)); 14955e8470afSJim Cownie 14965e8470afSJim Cownie init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 14975e8470afSJim Cownie trip = pr->u.p.tc - 1; 14985e8470afSJim Cownie 14995e8470afSJim Cownie if ((status = (init <= trip)) == 0) { 15005e8470afSJim Cownie *p_lb = 0; 15015e8470afSJim Cownie *p_ub = 0; 15023041982dSJonathan Peyton if (p_st != NULL) 15033041982dSJonathan Peyton *p_st = 0; 15045e8470afSJim Cownie } else { 15055e8470afSJim Cownie start = pr->u.p.lb; 15065e8470afSJim Cownie limit = chunk + init - 1; 15075e8470afSJim Cownie incr = pr->u.p.st; 15085e8470afSJim Cownie 15095e8470afSJim Cownie if ((last = (limit >= trip)) != 0) 15105e8470afSJim Cownie limit = trip; 15114cc4bb4cSJim Cownie 15123041982dSJonathan Peyton if (p_st != NULL) 15133041982dSJonathan Peyton *p_st = incr; 15145e8470afSJim Cownie 15155e8470afSJim Cownie if (incr == 1) { 15165e8470afSJim Cownie *p_lb = start + init; 15175e8470afSJim Cownie *p_ub = start + limit; 15185e8470afSJim Cownie } else { 15195e8470afSJim Cownie *p_lb = start + init * incr; 15205e8470afSJim Cownie *p_ub = start + limit * incr; 15215e8470afSJim Cownie } 15225e8470afSJim Cownie 152339ada854SJonathan Peyton if (pr->flags.ordered) { 15245e8470afSJim Cownie pr->u.p.ordered_lower = init; 15255e8470afSJim Cownie pr->u.p.ordered_upper = limit; 15265e8470afSJim Cownie } // if 15275e8470afSJim Cownie } // if 15285e8470afSJim Cownie } // case 15295e8470afSJim Cownie break; 15305e8470afSJim Cownie 15313041982dSJonathan Peyton case kmp_sch_guided_iterative_chunked: { 15325e8470afSJim Cownie T chunkspec = pr->u.p.parm1; 153339ada854SJonathan Peyton KD_TRACE(100, 
("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked " 15343041982dSJonathan Peyton "iterative case\n", 15353041982dSJonathan Peyton gtid)); 15365e8470afSJim Cownie trip = pr->u.p.tc; 15375e8470afSJim Cownie // Start atomic part of calculations 15385e8470afSJim Cownie while (1) { 15395e8470afSJim Cownie ST remaining; // signed, because can be < 0 15405e8470afSJim Cownie init = sh->u.s.iteration; // shared value 15415e8470afSJim Cownie remaining = trip - init; 15425e8470afSJim Cownie if (remaining <= 0) { // AC: need to compare with 0 first 15435e8470afSJim Cownie // nothing to do, don't try atomic op 15445e8470afSJim Cownie status = 0; 15455e8470afSJim Cownie break; 15465e8470afSJim Cownie } 15473041982dSJonathan Peyton if ((T)remaining < 15483041982dSJonathan Peyton pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default 154942016791SKazuaki Ishizaki // use dynamic-style schedule 15504c6a098aSKazuaki Ishizaki // atomically increment iterations, get old value 155194a114fcSJonathan Peyton init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 155294a114fcSJonathan Peyton (ST)chunkspec); 15535e8470afSJim Cownie remaining = trip - init; 15545e8470afSJim Cownie if (remaining <= 0) { 15555e8470afSJim Cownie status = 0; // all iterations got by other threads 155639ada854SJonathan Peyton } else { 155739ada854SJonathan Peyton // got some iterations to work on 15585e8470afSJim Cownie status = 1; 15595e8470afSJim Cownie if ((T)remaining > chunkspec) { 15605e8470afSJim Cownie limit = init + chunkspec - 1; 15615e8470afSJim Cownie } else { 15626b316febSTerry Wilmarth last = true; // the last chunk 15635e8470afSJim Cownie limit = init + remaining - 1; 15645e8470afSJim Cownie } // if 15655e8470afSJim Cownie } // if 15665e8470afSJim Cownie break; 15675e8470afSJim Cownie } // if 15686b316febSTerry Wilmarth limit = init + (UT)((double)remaining * 15696b316febSTerry Wilmarth *(double *)&pr->u.p.parm3); // divide by K*nproc 15705ba90c79SAndrey Churbanov 
if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1571c47afcd9SAndrey Churbanov (ST)init, (ST)limit)) { 15725e8470afSJim Cownie // CAS was successful, chunk obtained 15735e8470afSJim Cownie status = 1; 15745e8470afSJim Cownie --limit; 15755e8470afSJim Cownie break; 15765e8470afSJim Cownie } // if 15775e8470afSJim Cownie } // while 15785e8470afSJim Cownie if (status != 0) { 15795e8470afSJim Cownie start = pr->u.p.lb; 15805e8470afSJim Cownie incr = pr->u.p.st; 15815e8470afSJim Cownie if (p_st != NULL) 15825e8470afSJim Cownie *p_st = incr; 15835e8470afSJim Cownie *p_lb = start + init * incr; 15845e8470afSJim Cownie *p_ub = start + limit * incr; 158539ada854SJonathan Peyton if (pr->flags.ordered) { 15865e8470afSJim Cownie pr->u.p.ordered_lower = init; 15875e8470afSJim Cownie pr->u.p.ordered_upper = limit; 15885e8470afSJim Cownie } // if 15895e8470afSJim Cownie } else { 15905e8470afSJim Cownie *p_lb = 0; 15915e8470afSJim Cownie *p_ub = 0; 15925e8470afSJim Cownie if (p_st != NULL) 15935e8470afSJim Cownie *p_st = 0; 15945e8470afSJim Cownie } // if 15955e8470afSJim Cownie } // case 15965e8470afSJim Cownie break; 15975e8470afSJim Cownie 1598d454c73cSAndrey Churbanov case kmp_sch_guided_simd: { 1599d454c73cSAndrey Churbanov // same as iterative but curr-chunk adjusted to be multiple of given 1600d454c73cSAndrey Churbanov // chunk 1601d454c73cSAndrey Churbanov T chunk = pr->u.p.parm1; 160239ada854SJonathan Peyton KD_TRACE(100, 160339ada854SJonathan Peyton ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n", 1604d454c73cSAndrey Churbanov gtid)); 1605d454c73cSAndrey Churbanov trip = pr->u.p.tc; 1606d454c73cSAndrey Churbanov // Start atomic part of calculations 1607d454c73cSAndrey Churbanov while (1) { 1608d454c73cSAndrey Churbanov ST remaining; // signed, because can be < 0 1609d454c73cSAndrey Churbanov init = sh->u.s.iteration; // shared value 1610d454c73cSAndrey Churbanov remaining = trip - init; 1611d454c73cSAndrey Churbanov if (remaining <= 0) 
{ // AC: need to compare with 0 first 1612d454c73cSAndrey Churbanov status = 0; // nothing to do, don't try atomic op 1613d454c73cSAndrey Churbanov break; 1614d454c73cSAndrey Churbanov } 1615d454c73cSAndrey Churbanov KMP_DEBUG_ASSERT(init % chunk == 0); 1616d454c73cSAndrey Churbanov // compare with K*nproc*(chunk+1), K=2 by default 1617d454c73cSAndrey Churbanov if ((T)remaining < pr->u.p.parm2) { 161842016791SKazuaki Ishizaki // use dynamic-style schedule 16194c6a098aSKazuaki Ishizaki // atomically increment iterations, get old value 162094a114fcSJonathan Peyton init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 162194a114fcSJonathan Peyton (ST)chunk); 1622d454c73cSAndrey Churbanov remaining = trip - init; 1623d454c73cSAndrey Churbanov if (remaining <= 0) { 1624d454c73cSAndrey Churbanov status = 0; // all iterations got by other threads 1625d454c73cSAndrey Churbanov } else { 1626d454c73cSAndrey Churbanov // got some iterations to work on 1627d454c73cSAndrey Churbanov status = 1; 1628d454c73cSAndrey Churbanov if ((T)remaining > chunk) { 1629d454c73cSAndrey Churbanov limit = init + chunk - 1; 1630d454c73cSAndrey Churbanov } else { 16316b316febSTerry Wilmarth last = true; // the last chunk 1632d454c73cSAndrey Churbanov limit = init + remaining - 1; 1633d454c73cSAndrey Churbanov } // if 1634d454c73cSAndrey Churbanov } // if 1635d454c73cSAndrey Churbanov break; 1636d454c73cSAndrey Churbanov } // if 1637d454c73cSAndrey Churbanov // divide by K*nproc 16386b316febSTerry Wilmarth UT span; 16396b316febSTerry Wilmarth __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3), 16406b316febSTerry Wilmarth &span); 1641d454c73cSAndrey Churbanov UT rem = span % chunk; 1642d454c73cSAndrey Churbanov if (rem) // adjust so that span%chunk == 0 1643d454c73cSAndrey Churbanov span += chunk - rem; 1644d454c73cSAndrey Churbanov limit = init + span; 16455ba90c79SAndrey Churbanov if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1646c47afcd9SAndrey 
Churbanov (ST)init, (ST)limit)) { 1647d454c73cSAndrey Churbanov // CAS was successful, chunk obtained 1648d454c73cSAndrey Churbanov status = 1; 1649d454c73cSAndrey Churbanov --limit; 1650d454c73cSAndrey Churbanov break; 1651d454c73cSAndrey Churbanov } // if 1652d454c73cSAndrey Churbanov } // while 1653d454c73cSAndrey Churbanov if (status != 0) { 1654d454c73cSAndrey Churbanov start = pr->u.p.lb; 1655d454c73cSAndrey Churbanov incr = pr->u.p.st; 1656d454c73cSAndrey Churbanov if (p_st != NULL) 1657d454c73cSAndrey Churbanov *p_st = incr; 1658d454c73cSAndrey Churbanov *p_lb = start + init * incr; 1659d454c73cSAndrey Churbanov *p_ub = start + limit * incr; 166039ada854SJonathan Peyton if (pr->flags.ordered) { 1661d454c73cSAndrey Churbanov pr->u.p.ordered_lower = init; 1662d454c73cSAndrey Churbanov pr->u.p.ordered_upper = limit; 1663d454c73cSAndrey Churbanov } // if 1664d454c73cSAndrey Churbanov } else { 1665d454c73cSAndrey Churbanov *p_lb = 0; 1666d454c73cSAndrey Churbanov *p_ub = 0; 1667d454c73cSAndrey Churbanov if (p_st != NULL) 1668d454c73cSAndrey Churbanov *p_st = 0; 1669d454c73cSAndrey Churbanov } // if 1670d454c73cSAndrey Churbanov } // case 1671d454c73cSAndrey Churbanov break; 1672d454c73cSAndrey Churbanov 16733041982dSJonathan Peyton case kmp_sch_guided_analytical_chunked: { 16745e8470afSJim Cownie T chunkspec = pr->u.p.parm1; 16755e8470afSJim Cownie UT chunkIdx; 1676f700e9edSAndrey Churbanov #if KMP_USE_X87CONTROL 16775e8470afSJim Cownie /* for storing original FPCW value for Windows* OS on 16785e8470afSJim Cownie IA-32 architecture 8-byte version */ 16795e8470afSJim Cownie unsigned int oldFpcw; 1680181b4bb3SJim Cownie unsigned int fpcwSet = 0; 16815e8470afSJim Cownie #endif 168239ada854SJonathan Peyton KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d " 168339ada854SJonathan Peyton "kmp_sch_guided_analytical_chunked case\n", 16845e8470afSJim Cownie gtid)); 16855e8470afSJim Cownie 16865e8470afSJim Cownie trip = pr->u.p.tc; 16875e8470afSJim Cownie 
168839ada854SJonathan Peyton KMP_DEBUG_ASSERT(nproc > 1); 168939ada854SJonathan Peyton KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip); 16905e8470afSJim Cownie 16913041982dSJonathan Peyton while (1) { /* this while loop is a safeguard against unexpected zero 16923041982dSJonathan Peyton chunk sizes */ 16935e8470afSJim Cownie chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 16945e8470afSJim Cownie if (chunkIdx >= (UT)pr->u.p.parm2) { 16955e8470afSJim Cownie --trip; 16965e8470afSJim Cownie /* use dynamic-style scheduling */ 16975e8470afSJim Cownie init = chunkIdx * chunkspec + pr->u.p.count; 16983041982dSJonathan Peyton /* need to verify init > 0 in case of overflow in the above 16993041982dSJonathan Peyton * calculation */ 17005e8470afSJim Cownie if ((status = (init > 0 && init <= trip)) != 0) { 17015e8470afSJim Cownie limit = init + chunkspec - 1; 17025e8470afSJim Cownie 17035e8470afSJim Cownie if ((last = (limit >= trip)) != 0) 17045e8470afSJim Cownie limit = trip; 17055e8470afSJim Cownie } 17065e8470afSJim Cownie break; 17075e8470afSJim Cownie } else { 17085e8470afSJim Cownie /* use exponential-style scheduling */ 17093041982dSJonathan Peyton /* The following check is to workaround the lack of long double precision on 17103041982dSJonathan Peyton Windows* OS. 17115e8470afSJim Cownie This check works around the possible effect that init != 0 for chunkIdx == 0. 
17125e8470afSJim Cownie */ 1713f700e9edSAndrey Churbanov #if KMP_USE_X87CONTROL 171439ada854SJonathan Peyton /* If we haven't already done so, save original 171539ada854SJonathan Peyton FPCW and set precision to 64-bit, as Windows* OS 171639ada854SJonathan Peyton on IA-32 architecture defaults to 53-bit */ 17175e8470afSJim Cownie if (!fpcwSet) { 1718181b4bb3SJim Cownie oldFpcw = _control87(0, 0); 1719181b4bb3SJim Cownie _control87(_PC_64, _MCW_PC); 17205e8470afSJim Cownie fpcwSet = 0x30000; 17215e8470afSJim Cownie } 17225e8470afSJim Cownie #endif 17235e8470afSJim Cownie if (chunkIdx) { 17245e8470afSJim Cownie init = __kmp_dispatch_guided_remaining<T>( 17255e8470afSJim Cownie trip, *(DBL *)&pr->u.p.parm3, chunkIdx); 17265e8470afSJim Cownie KMP_DEBUG_ASSERT(init); 17275e8470afSJim Cownie init = trip - init; 17285e8470afSJim Cownie } else 17295e8470afSJim Cownie init = 0; 17305e8470afSJim Cownie limit = trip - __kmp_dispatch_guided_remaining<T>( 17315e8470afSJim Cownie trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1); 17325e8470afSJim Cownie KMP_ASSERT(init <= limit); 17335e8470afSJim Cownie if (init < limit) { 17345e8470afSJim Cownie KMP_DEBUG_ASSERT(limit <= trip); 17355e8470afSJim Cownie --limit; 17365e8470afSJim Cownie status = 1; 17375e8470afSJim Cownie break; 17385e8470afSJim Cownie } // if 17395e8470afSJim Cownie } // if 17405e8470afSJim Cownie } // while (1) 1741f700e9edSAndrey Churbanov #if KMP_USE_X87CONTROL 1742181b4bb3SJim Cownie /* restore FPCW if necessary 174339ada854SJonathan Peyton AC: check fpcwSet flag first because oldFpcw can be uninitialized here 174439ada854SJonathan Peyton */ 1745181b4bb3SJim Cownie if (fpcwSet && (oldFpcw & fpcwSet)) 1746181b4bb3SJim Cownie _control87(oldFpcw, _MCW_PC); 17475e8470afSJim Cownie #endif 17485e8470afSJim Cownie if (status != 0) { 17495e8470afSJim Cownie start = pr->u.p.lb; 17505e8470afSJim Cownie incr = pr->u.p.st; 17515e8470afSJim Cownie if (p_st != NULL) 17525e8470afSJim Cownie *p_st = incr; 17535e8470afSJim Cownie 
*p_lb = start + init * incr; 17545e8470afSJim Cownie *p_ub = start + limit * incr; 175539ada854SJonathan Peyton if (pr->flags.ordered) { 17565e8470afSJim Cownie pr->u.p.ordered_lower = init; 17575e8470afSJim Cownie pr->u.p.ordered_upper = limit; 17585e8470afSJim Cownie } 17595e8470afSJim Cownie } else { 17605e8470afSJim Cownie *p_lb = 0; 17615e8470afSJim Cownie *p_ub = 0; 17625e8470afSJim Cownie if (p_st != NULL) 17635e8470afSJim Cownie *p_st = 0; 17645e8470afSJim Cownie } 17655e8470afSJim Cownie } // case 17665e8470afSJim Cownie break; 17675e8470afSJim Cownie 17683041982dSJonathan Peyton case kmp_sch_trapezoidal: { 17695e8470afSJim Cownie UT index; 17705e8470afSJim Cownie T parm2 = pr->u.p.parm2; 17715e8470afSJim Cownie T parm3 = pr->u.p.parm3; 17725e8470afSJim Cownie T parm4 = pr->u.p.parm4; 177339ada854SJonathan Peyton KD_TRACE(100, 177439ada854SJonathan Peyton ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n", 17755e8470afSJim Cownie gtid)); 17765e8470afSJim Cownie 17775e8470afSJim Cownie index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration); 17785e8470afSJim Cownie 17795e8470afSJim Cownie init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2; 17805e8470afSJim Cownie trip = pr->u.p.tc - 1; 17815e8470afSJim Cownie 17825e8470afSJim Cownie if ((status = ((T)index < parm3 && init <= trip)) == 0) { 17835e8470afSJim Cownie *p_lb = 0; 17845e8470afSJim Cownie *p_ub = 0; 17853041982dSJonathan Peyton if (p_st != NULL) 17863041982dSJonathan Peyton *p_st = 0; 17875e8470afSJim Cownie } else { 17885e8470afSJim Cownie start = pr->u.p.lb; 17895e8470afSJim Cownie limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1; 17905e8470afSJim Cownie incr = pr->u.p.st; 17915e8470afSJim Cownie 17925e8470afSJim Cownie if ((last = (limit >= trip)) != 0) 17935e8470afSJim Cownie limit = trip; 17945e8470afSJim Cownie 17953041982dSJonathan Peyton if (p_st != NULL) 17963041982dSJonathan Peyton *p_st = incr; 17975e8470afSJim Cownie 17985e8470afSJim Cownie if (incr 
== 1) { 17995e8470afSJim Cownie *p_lb = start + init; 18005e8470afSJim Cownie *p_ub = start + limit; 18015e8470afSJim Cownie } else { 18025e8470afSJim Cownie *p_lb = start + init * incr; 18035e8470afSJim Cownie *p_ub = start + limit * incr; 18045e8470afSJim Cownie } 18055e8470afSJim Cownie 180639ada854SJonathan Peyton if (pr->flags.ordered) { 180739ada854SJonathan Peyton pr->u.p.ordered_lower = init; 180839ada854SJonathan Peyton pr->u.p.ordered_upper = limit; 180939ada854SJonathan Peyton } // if 181039ada854SJonathan Peyton } // if 181139ada854SJonathan Peyton } // case 181239ada854SJonathan Peyton break; 181339ada854SJonathan Peyton default: { 181439ada854SJonathan Peyton status = 0; // to avoid complaints on uninitialized variable use 181539ada854SJonathan Peyton __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 181639ada854SJonathan Peyton KMP_HNT(GetNewerLibrary), // Hint 181739ada854SJonathan Peyton __kmp_msg_null // Variadic argument list terminator 181839ada854SJonathan Peyton ); 181939ada854SJonathan Peyton } break; 182039ada854SJonathan Peyton } // switch 182139ada854SJonathan Peyton if (p_last) 182239ada854SJonathan Peyton *p_last = last; 182339ada854SJonathan Peyton #ifdef KMP_DEBUG 182439ada854SJonathan Peyton if (pr->flags.ordered) { 182539ada854SJonathan Peyton char *buff; 182639ada854SJonathan Peyton // create format specifiers before the debug output 182739ada854SJonathan Peyton buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d " 182839ada854SJonathan Peyton "ordered_lower:%%%s ordered_upper:%%%s\n", 182939ada854SJonathan Peyton traits_t<UT>::spec, traits_t<UT>::spec); 183039ada854SJonathan Peyton KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper)); 183139ada854SJonathan Peyton __kmp_str_free(&buff); 183239ada854SJonathan Peyton } 183339ada854SJonathan Peyton { 183439ada854SJonathan Peyton char *buff; 183539ada854SJonathan Peyton // create format specifiers before the debug output 
183639ada854SJonathan Peyton buff = __kmp_str_format( 183739ada854SJonathan Peyton "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d " 183839ada854SJonathan Peyton "p_lb:%%%s p_ub:%%%s p_st:%%%s\n", 183939ada854SJonathan Peyton traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 184039ada854SJonathan Peyton KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st)); 184139ada854SJonathan Peyton __kmp_str_free(&buff); 184239ada854SJonathan Peyton } 184339ada854SJonathan Peyton #endif 184439ada854SJonathan Peyton return status; 184539ada854SJonathan Peyton } 184639ada854SJonathan Peyton 184739ada854SJonathan Peyton /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more 184839ada854SJonathan Peyton work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini() 184939ada854SJonathan Peyton is not called. */ 185039ada854SJonathan Peyton #if OMPT_SUPPORT && OMPT_OPTIONAL 185139ada854SJonathan Peyton #define OMPT_LOOP_END \ 185239ada854SJonathan Peyton if (status == 0) { \ 185339ada854SJonathan Peyton if (ompt_enabled.ompt_callback_work) { \ 185439ada854SJonathan Peyton ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \ 185539ada854SJonathan Peyton ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \ 185639ada854SJonathan Peyton ompt_callbacks.ompt_callback(ompt_callback_work)( \ 185739ada854SJonathan Peyton ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \ 185839ada854SJonathan Peyton &(task_info->task_data), 0, codeptr); \ 185939ada854SJonathan Peyton } \ 186039ada854SJonathan Peyton } 186139ada854SJonathan Peyton // TODO: implement count 186239ada854SJonathan Peyton #else 186339ada854SJonathan Peyton #define OMPT_LOOP_END // no-op 186439ada854SJonathan Peyton #endif 186539ada854SJonathan Peyton 1866f0682ac4SJonathan Peyton #if KMP_STATS_ENABLED 1867f0682ac4SJonathan Peyton #define KMP_STATS_LOOP_END \ 1868f0682ac4SJonathan Peyton { \ 1869f0682ac4SJonathan Peyton kmp_int64 u, 
l, t, i; \ 1870f0682ac4SJonathan Peyton l = (kmp_int64)(*p_lb); \ 1871f0682ac4SJonathan Peyton u = (kmp_int64)(*p_ub); \ 1872f0682ac4SJonathan Peyton i = (kmp_int64)(pr->u.p.st); \ 1873f0682ac4SJonathan Peyton if (status == 0) { \ 1874f0682ac4SJonathan Peyton t = 0; \ 1875f0682ac4SJonathan Peyton KMP_POP_PARTITIONED_TIMER(); \ 1876f0682ac4SJonathan Peyton } else if (i == 1) { \ 1877f0682ac4SJonathan Peyton if (u >= l) \ 1878f0682ac4SJonathan Peyton t = u - l + 1; \ 1879f0682ac4SJonathan Peyton else \ 1880f0682ac4SJonathan Peyton t = 0; \ 1881f0682ac4SJonathan Peyton } else if (i < 0) { \ 1882f0682ac4SJonathan Peyton if (l >= u) \ 1883f0682ac4SJonathan Peyton t = (l - u) / (-i) + 1; \ 1884f0682ac4SJonathan Peyton else \ 1885f0682ac4SJonathan Peyton t = 0; \ 1886f0682ac4SJonathan Peyton } else { \ 1887f0682ac4SJonathan Peyton if (u >= l) \ 1888f0682ac4SJonathan Peyton t = (u - l) / i + 1; \ 1889f0682ac4SJonathan Peyton else \ 1890f0682ac4SJonathan Peyton t = 0; \ 1891f0682ac4SJonathan Peyton } \ 1892f0682ac4SJonathan Peyton KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \ 1893f0682ac4SJonathan Peyton } 1894f0682ac4SJonathan Peyton #else 1895f0682ac4SJonathan Peyton #define KMP_STATS_LOOP_END /* Nothing */ 1896f0682ac4SJonathan Peyton #endif 1897f0682ac4SJonathan Peyton 189839ada854SJonathan Peyton template <typename T> 189939ada854SJonathan Peyton static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, 190039ada854SJonathan Peyton T *p_lb, T *p_ub, 190139ada854SJonathan Peyton typename traits_t<T>::signed_t *p_st 190239ada854SJonathan Peyton #if OMPT_SUPPORT && OMPT_OPTIONAL 190339ada854SJonathan Peyton , 190439ada854SJonathan Peyton void *codeptr 190539ada854SJonathan Peyton #endif 190639ada854SJonathan Peyton ) { 190739ada854SJonathan Peyton 190839ada854SJonathan Peyton typedef typename traits_t<T>::unsigned_t UT; 190939ada854SJonathan Peyton typedef typename traits_t<T>::signed_t ST; 191039ada854SJonathan Peyton // This is potentially slightly 
misleading, schedule(runtime) will appear here 191142016791SKazuaki Ishizaki // even if the actual runtime schedule is static. (Which points out a 19124c6a098aSKazuaki Ishizaki // disadvantage of schedule(runtime): even when static scheduling is used it 191339ada854SJonathan Peyton // costs more than a compile time choice to use static scheduling would.) 1914f0682ac4SJonathan Peyton KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling); 191539ada854SJonathan Peyton 191639ada854SJonathan Peyton int status; 191739ada854SJonathan Peyton dispatch_private_info_template<T> *pr; 1918787eb0c6SAndreyChurbanov __kmp_assert_valid_gtid(gtid); 191939ada854SJonathan Peyton kmp_info_t *th = __kmp_threads[gtid]; 192039ada854SJonathan Peyton kmp_team_t *team = th->th.th_team; 192139ada854SJonathan Peyton 192239ada854SJonathan Peyton KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL 192339ada854SJonathan Peyton KD_TRACE( 192439ada854SJonathan Peyton 1000, 192539ada854SJonathan Peyton ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n", 192639ada854SJonathan Peyton gtid, p_lb, p_ub, p_st, p_last)); 192739ada854SJonathan Peyton 192839ada854SJonathan Peyton if (team->t.t_serialized) { 192942016791SKazuaki Ishizaki /* NOTE: serialize this dispatch because we are not at the active level */ 193039ada854SJonathan Peyton pr = reinterpret_cast<dispatch_private_info_template<T> *>( 193139ada854SJonathan Peyton th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 193239ada854SJonathan Peyton KMP_DEBUG_ASSERT(pr); 193339ada854SJonathan Peyton 193439ada854SJonathan Peyton if ((status = (pr->u.p.tc != 0)) == 0) { 193539ada854SJonathan Peyton *p_lb = 0; 193639ada854SJonathan Peyton *p_ub = 0; 193739ada854SJonathan Peyton // if ( p_last != NULL ) 193839ada854SJonathan Peyton // *p_last = 0; 193939ada854SJonathan Peyton if (p_st != NULL) 194039ada854SJonathan Peyton *p_st = 0; 194139ada854SJonathan Peyton if (__kmp_env_consistency_check) { 
194239ada854SJonathan Peyton if (pr->pushed_ws != ct_none) { 194339ada854SJonathan Peyton pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 194439ada854SJonathan Peyton } 194539ada854SJonathan Peyton } 194639ada854SJonathan Peyton } else if (pr->flags.nomerge) { 194739ada854SJonathan Peyton kmp_int32 last; 194839ada854SJonathan Peyton T start; 194939ada854SJonathan Peyton UT limit, trip, init; 195039ada854SJonathan Peyton ST incr; 195139ada854SJonathan Peyton T chunk = pr->u.p.parm1; 195239ada854SJonathan Peyton 195339ada854SJonathan Peyton KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", 195439ada854SJonathan Peyton gtid)); 195539ada854SJonathan Peyton 195639ada854SJonathan Peyton init = chunk * pr->u.p.count++; 195739ada854SJonathan Peyton trip = pr->u.p.tc - 1; 195839ada854SJonathan Peyton 195939ada854SJonathan Peyton if ((status = (init <= trip)) == 0) { 196039ada854SJonathan Peyton *p_lb = 0; 196139ada854SJonathan Peyton *p_ub = 0; 196239ada854SJonathan Peyton // if ( p_last != NULL ) 196339ada854SJonathan Peyton // *p_last = 0; 196439ada854SJonathan Peyton if (p_st != NULL) 196539ada854SJonathan Peyton *p_st = 0; 196639ada854SJonathan Peyton if (__kmp_env_consistency_check) { 196739ada854SJonathan Peyton if (pr->pushed_ws != ct_none) { 196839ada854SJonathan Peyton pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 196939ada854SJonathan Peyton } 197039ada854SJonathan Peyton } 197139ada854SJonathan Peyton } else { 197239ada854SJonathan Peyton start = pr->u.p.lb; 197339ada854SJonathan Peyton limit = chunk + init - 1; 197439ada854SJonathan Peyton incr = pr->u.p.st; 197539ada854SJonathan Peyton 197639ada854SJonathan Peyton if ((last = (limit >= trip)) != 0) { 197739ada854SJonathan Peyton limit = trip; 197839ada854SJonathan Peyton #if KMP_OS_WINDOWS 197939ada854SJonathan Peyton pr->u.p.last_upper = pr->u.p.ub; 198039ada854SJonathan Peyton #endif /* KMP_OS_WINDOWS */ 198139ada854SJonathan Peyton } 
198239ada854SJonathan Peyton if (p_last != NULL) 198339ada854SJonathan Peyton *p_last = last; 198439ada854SJonathan Peyton if (p_st != NULL) 198539ada854SJonathan Peyton *p_st = incr; 198639ada854SJonathan Peyton if (incr == 1) { 198739ada854SJonathan Peyton *p_lb = start + init; 198839ada854SJonathan Peyton *p_ub = start + limit; 198939ada854SJonathan Peyton } else { 199039ada854SJonathan Peyton *p_lb = start + init * incr; 199139ada854SJonathan Peyton *p_ub = start + limit * incr; 199239ada854SJonathan Peyton } 199339ada854SJonathan Peyton 199439ada854SJonathan Peyton if (pr->flags.ordered) { 19955e8470afSJim Cownie pr->u.p.ordered_lower = init; 19965e8470afSJim Cownie pr->u.p.ordered_upper = limit; 19975e8470afSJim Cownie #ifdef KMP_DEBUG 19985e8470afSJim Cownie { 1999aeb40adaSJonas Hahnfeld char *buff; 20005e8470afSJim Cownie // create format specifiers before the debug output 20013041982dSJonathan Peyton buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 20023041982dSJonathan Peyton "ordered_lower:%%%s ordered_upper:%%%s\n", 20035e8470afSJim Cownie traits_t<UT>::spec, traits_t<UT>::spec); 20043041982dSJonathan Peyton KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 20053041982dSJonathan Peyton pr->u.p.ordered_upper)); 20065e8470afSJim Cownie __kmp_str_free(&buff); 20075e8470afSJim Cownie } 20085e8470afSJim Cownie #endif 20095e8470afSJim Cownie } // if 20105e8470afSJim Cownie } // if 201139ada854SJonathan Peyton } else { 201239ada854SJonathan Peyton pr->u.p.tc = 0; 201339ada854SJonathan Peyton *p_lb = pr->u.p.lb; 201439ada854SJonathan Peyton *p_ub = pr->u.p.ub; 201539ada854SJonathan Peyton #if KMP_OS_WINDOWS 201639ada854SJonathan Peyton pr->u.p.last_upper = *p_ub; 201739ada854SJonathan Peyton #endif /* KMP_OS_WINDOWS */ 201839ada854SJonathan Peyton if (p_last != NULL) 201939ada854SJonathan Peyton *p_last = TRUE; 202039ada854SJonathan Peyton if (p_st != NULL) 202139ada854SJonathan Peyton *p_st = pr->u.p.st; 202239ada854SJonathan Peyton } // if 
202339ada854SJonathan Peyton #ifdef KMP_DEBUG 202439ada854SJonathan Peyton { 202539ada854SJonathan Peyton char *buff; 202639ada854SJonathan Peyton // create format specifiers before the debug output 202739ada854SJonathan Peyton buff = __kmp_str_format( 202839ada854SJonathan Peyton "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " 202939ada854SJonathan Peyton "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", 203039ada854SJonathan Peyton traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 2031771f0fb9SPeyton, Jonathan L KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, 2032771f0fb9SPeyton, Jonathan L (p_last ? *p_last : 0), status)); 203339ada854SJonathan Peyton __kmp_str_free(&buff); 203439ada854SJonathan Peyton } 203539ada854SJonathan Peyton #endif 203639ada854SJonathan Peyton #if INCLUDE_SSC_MARKS 203739ada854SJonathan Peyton SSC_MARK_DISPATCH_NEXT(); 203839ada854SJonathan Peyton #endif 203939ada854SJonathan Peyton OMPT_LOOP_END; 2040f0682ac4SJonathan Peyton KMP_STATS_LOOP_END; 204139ada854SJonathan Peyton return status; 204239ada854SJonathan Peyton } else { 204339ada854SJonathan Peyton kmp_int32 last = 0; 204439ada854SJonathan Peyton dispatch_shared_info_template<T> volatile *sh; 20455e8470afSJim Cownie 204639ada854SJonathan Peyton KMP_DEBUG_ASSERT(th->th.th_dispatch == 204739ada854SJonathan Peyton &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 204839ada854SJonathan Peyton 204939ada854SJonathan Peyton pr = reinterpret_cast<dispatch_private_info_template<T> *>( 205039ada854SJonathan Peyton th->th.th_dispatch->th_dispatch_pr_current); 205139ada854SJonathan Peyton KMP_DEBUG_ASSERT(pr); 205239ada854SJonathan Peyton sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 205339ada854SJonathan Peyton th->th.th_dispatch->th_dispatch_sh_current); 205439ada854SJonathan Peyton KMP_DEBUG_ASSERT(sh); 205539ada854SJonathan Peyton 2056f6399367SJonathan Peyton #if KMP_USE_HIER_SCHED 2057f6399367SJonathan Peyton if 
(pr->flags.use_hier) 2058f6399367SJonathan Peyton status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st); 2059f6399367SJonathan Peyton else 2060f6399367SJonathan Peyton #endif // KMP_USE_HIER_SCHED 206139ada854SJonathan Peyton status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub, 206239ada854SJonathan Peyton p_st, th->th.th_team_nproc, 206339ada854SJonathan Peyton th->th.th_info.ds.ds_tid); 206439ada854SJonathan Peyton // status == 0: no more iterations to execute 20655e8470afSJim Cownie if (status == 0) { 20665e8470afSJim Cownie UT num_done; 20675e8470afSJim Cownie 20685e8470afSJim Cownie num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done); 20695e8470afSJim Cownie #ifdef KMP_DEBUG 20705e8470afSJim Cownie { 2071aeb40adaSJonas Hahnfeld char *buff; 20725e8470afSJim Cownie // create format specifiers before the debug output 20735e8470afSJim Cownie buff = __kmp_str_format( 20745e8470afSJim Cownie "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", 20755e8470afSJim Cownie traits_t<UT>::spec); 207639ada854SJonathan Peyton KD_TRACE(10, (buff, gtid, sh->u.s.num_done)); 20775e8470afSJim Cownie __kmp_str_free(&buff); 20785e8470afSJim Cownie } 20795e8470afSJim Cownie #endif 20805e8470afSJim Cownie 2081f6399367SJonathan Peyton #if KMP_USE_HIER_SCHED 2082f6399367SJonathan Peyton pr->flags.use_hier = FALSE; 2083f6399367SJonathan Peyton #endif 2084ff5ca8b4SJonathan Peyton if ((ST)num_done == th->th.th_team_nproc - 1) { 2085429dbc2aSAndrey Churbanov #if (KMP_STATIC_STEAL_ENABLED) 20863041982dSJonathan Peyton if (pr->schedule == kmp_sch_static_steal && 20873041982dSJonathan Peyton traits_t<T>::type_size > 4) { 2088429dbc2aSAndrey Churbanov int i; 2089abe64360SAndreyChurbanov int idx = (th->th.th_dispatch->th_disp_index - 1) % 2090abe64360SAndreyChurbanov __kmp_dispatch_num_buffers; // current loop index 2091429dbc2aSAndrey Churbanov kmp_info_t **other_threads = team->t.t_threads; 2092429dbc2aSAndrey Churbanov // loop complete, safe to 
destroy locks used for stealing 2093429dbc2aSAndrey Churbanov for (i = 0; i < th->th.th_team_nproc; ++i) { 2094abe64360SAndreyChurbanov dispatch_private_info_template<T> *buf = 2095abe64360SAndreyChurbanov reinterpret_cast<dispatch_private_info_template<T> *>( 2096abe64360SAndreyChurbanov &other_threads[i]->th.th_dispatch->th_disp_buffer[idx]); 2097abe64360SAndreyChurbanov kmp_lock_t *lck = buf->u.p.th_steal_lock; 2098429dbc2aSAndrey Churbanov KMP_ASSERT(lck != NULL); 2099429dbc2aSAndrey Churbanov __kmp_destroy_lock(lck); 2100429dbc2aSAndrey Churbanov __kmp_free(lck); 2101abe64360SAndreyChurbanov buf->u.p.th_steal_lock = NULL; 2102429dbc2aSAndrey Churbanov } 2103429dbc2aSAndrey Churbanov } 2104429dbc2aSAndrey Churbanov #endif 21055e8470afSJim Cownie /* NOTE: release this buffer to be reused */ 21065e8470afSJim Cownie 21075e8470afSJim Cownie KMP_MB(); /* Flush all pending memory write invalidates. */ 21085e8470afSJim Cownie 21095e8470afSJim Cownie sh->u.s.num_done = 0; 21105e8470afSJim Cownie sh->u.s.iteration = 0; 21115e8470afSJim Cownie 21125e8470afSJim Cownie /* TODO replace with general release procedure? */ 211339ada854SJonathan Peyton if (pr->flags.ordered) { 21145e8470afSJim Cownie sh->u.s.ordered_iteration = 0; 21155e8470afSJim Cownie } 21165e8470afSJim Cownie 21175e8470afSJim Cownie KMP_MB(); /* Flush all pending memory write invalidates. */ 21185e8470afSJim Cownie 2119067325f9SJonathan Peyton sh->buffer_index += __kmp_dispatch_num_buffers; 21205e8470afSJim Cownie KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", 21215e8470afSJim Cownie gtid, sh->buffer_index)); 21225e8470afSJim Cownie 21235e8470afSJim Cownie KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 21245e8470afSJim Cownie 21255e8470afSJim Cownie } // if 21265e8470afSJim Cownie if (__kmp_env_consistency_check) { 21275e8470afSJim Cownie if (pr->pushed_ws != ct_none) { 21285e8470afSJim Cownie pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 21295e8470afSJim Cownie } 21305e8470afSJim Cownie } 21315e8470afSJim Cownie 21325e8470afSJim Cownie th->th.th_dispatch->th_deo_fcn = NULL; 21335e8470afSJim Cownie th->th.th_dispatch->th_dxo_fcn = NULL; 21345e8470afSJim Cownie th->th.th_dispatch->th_dispatch_sh_current = NULL; 21355e8470afSJim Cownie th->th.th_dispatch->th_dispatch_pr_current = NULL; 21365e8470afSJim Cownie } // if (status == 0) 21375e8470afSJim Cownie #if KMP_OS_WINDOWS 21385e8470afSJim Cownie else if (last) { 21395e8470afSJim Cownie pr->u.p.last_upper = pr->u.p.ub; 21405e8470afSJim Cownie } 21415e8470afSJim Cownie #endif /* KMP_OS_WINDOWS */ 21424cc4bb4cSJim Cownie if (p_last != NULL && status != 0) 21434cc4bb4cSJim Cownie *p_last = last; 21445e8470afSJim Cownie } // if 21455e8470afSJim Cownie 21465e8470afSJim Cownie #ifdef KMP_DEBUG 21475e8470afSJim Cownie { 2148aeb40adaSJonas Hahnfeld char *buff; 21495e8470afSJim Cownie // create format specifiers before the debug output 21505e8470afSJim Cownie buff = __kmp_str_format( 21513041982dSJonathan Peyton "__kmp_dispatch_next: T#%%d normal case: " 215239ada854SJonathan Peyton "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n", 21535e8470afSJim Cownie traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 215439ada854SJonathan Peyton KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, 215539ada854SJonathan Peyton (p_last ? 
*p_last : 0), status)); 21565e8470afSJim Cownie __kmp_str_free(&buff); 21575e8470afSJim Cownie } 21585e8470afSJim Cownie #endif 21594cc4bb4cSJim Cownie #if INCLUDE_SSC_MARKS 21604cc4bb4cSJim Cownie SSC_MARK_DISPATCH_NEXT(); 21614cc4bb4cSJim Cownie #endif 2162d7d088f8SAndrey Churbanov OMPT_LOOP_END; 2163f0682ac4SJonathan Peyton KMP_STATS_LOOP_END; 21645e8470afSJim Cownie return status; 21655e8470afSJim Cownie } 21665e8470afSJim Cownie 21674cc4bb4cSJim Cownie template <typename T> 21683041982dSJonathan Peyton static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid, 21693041982dSJonathan Peyton kmp_int32 *plastiter, T *plower, T *pupper, 21703041982dSJonathan Peyton typename traits_t<T>::signed_t incr) { 21714cc4bb4cSJim Cownie typedef typename traits_t<T>::unsigned_t UT; 2172414544c9SEd Maste kmp_uint32 team_id; 2173414544c9SEd Maste kmp_uint32 nteams; 2174414544c9SEd Maste UT trip_count; 2175414544c9SEd Maste kmp_team_t *team; 21764cc4bb4cSJim Cownie kmp_info_t *th; 21774cc4bb4cSJim Cownie 21784cc4bb4cSJim Cownie KMP_DEBUG_ASSERT(plastiter && plower && pupper); 21794cc4bb4cSJim Cownie KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid)); 21804cc4bb4cSJim Cownie #ifdef KMP_DEBUG 2181baad3f60SJonathan Peyton typedef typename traits_t<T>::signed_t ST; 21824cc4bb4cSJim Cownie { 2183aeb40adaSJonas Hahnfeld char *buff; 21844cc4bb4cSJim Cownie // create format specifiers before the debug output 21853041982dSJonathan Peyton buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d " 21864cc4bb4cSJim Cownie "iter=(%%%s, %%%s, %%%s) signed?<%s>\n", 21873041982dSJonathan Peyton traits_t<T>::spec, traits_t<T>::spec, 21883041982dSJonathan Peyton traits_t<ST>::spec, traits_t<T>::spec); 21894cc4bb4cSJim Cownie KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr)); 21904cc4bb4cSJim Cownie __kmp_str_free(&buff); 21914cc4bb4cSJim Cownie } 21924cc4bb4cSJim Cownie #endif 21934cc4bb4cSJim Cownie 21944cc4bb4cSJim Cownie if (__kmp_env_consistency_check) { 
21954cc4bb4cSJim Cownie if (incr == 0) { 21963041982dSJonathan Peyton __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, 21973041982dSJonathan Peyton loc); 21984cc4bb4cSJim Cownie } 21994cc4bb4cSJim Cownie if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) { 22004cc4bb4cSJim Cownie // The loop is illegal. 22014cc4bb4cSJim Cownie // Some zero-trip loops maintained by compiler, e.g.: 22024cc4bb4cSJim Cownie // for(i=10;i<0;++i) // lower >= upper - run-time check 22034cc4bb4cSJim Cownie // for(i=0;i>10;--i) // lower <= upper - run-time check 22044cc4bb4cSJim Cownie // for(i=0;i>10;++i) // incr > 0 - compile-time check 22054cc4bb4cSJim Cownie // for(i=10;i<0;--i) // incr < 0 - compile-time check 22064cc4bb4cSJim Cownie // Compiler does not check the following illegal loops: 22074cc4bb4cSJim Cownie // for(i=0;i<10;i+=incr) // where incr<0 22084cc4bb4cSJim Cownie // for(i=10;i>0;i-=incr) // where incr<0 22094cc4bb4cSJim Cownie __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc); 22104cc4bb4cSJim Cownie } 22114cc4bb4cSJim Cownie } 2212787eb0c6SAndreyChurbanov __kmp_assert_valid_gtid(gtid); 22134cc4bb4cSJim Cownie th = __kmp_threads[gtid]; 22144cc4bb4cSJim Cownie team = th->th.th_team; 2215441f3376SJonathan Peyton KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct 22164cc4bb4cSJim Cownie nteams = th->th.th_teams_size.nteams; 22174cc4bb4cSJim Cownie team_id = team->t.t_master_tid; 2218baad3f60SJonathan Peyton KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc); 22194cc4bb4cSJim Cownie 22204cc4bb4cSJim Cownie // compute global trip count 22214cc4bb4cSJim Cownie if (incr == 1) { 22224cc4bb4cSJim Cownie trip_count = *pupper - *plower + 1; 22234cc4bb4cSJim Cownie } else if (incr == -1) { 22244cc4bb4cSJim Cownie trip_count = *plower - *pupper + 1; 22255235a1b6SJonathan Peyton } else if (incr > 0) { 22265235a1b6SJonathan Peyton // upper-lower can exceed the limit of signed type 
22275235a1b6SJonathan Peyton trip_count = (UT)(*pupper - *plower) / incr + 1; 22284cc4bb4cSJim Cownie } else { 22295235a1b6SJonathan Peyton trip_count = (UT)(*plower - *pupper) / (-incr) + 1; 22304cc4bb4cSJim Cownie } 223145be4500SJonathan Peyton 22324cc4bb4cSJim Cownie if (trip_count <= nteams) { 22334cc4bb4cSJim Cownie KMP_DEBUG_ASSERT( 22343041982dSJonathan Peyton __kmp_static == kmp_sch_static_greedy || 22353041982dSJonathan Peyton __kmp_static == 22363041982dSJonathan Peyton kmp_sch_static_balanced); // Unknown static scheduling type. 22374cc4bb4cSJim Cownie // only some teams get single iteration, others get nothing 22384cc4bb4cSJim Cownie if (team_id < trip_count) { 22394cc4bb4cSJim Cownie *pupper = *plower = *plower + team_id * incr; 22404cc4bb4cSJim Cownie } else { 22414cc4bb4cSJim Cownie *plower = *pupper + incr; // zero-trip loop 22424cc4bb4cSJim Cownie } 22434cc4bb4cSJim Cownie if (plastiter != NULL) 22444cc4bb4cSJim Cownie *plastiter = (team_id == trip_count - 1); 22454cc4bb4cSJim Cownie } else { 22464cc4bb4cSJim Cownie if (__kmp_static == kmp_sch_static_balanced) { 2247414544c9SEd Maste UT chunk = trip_count / nteams; 2248414544c9SEd Maste UT extras = trip_count % nteams; 22493041982dSJonathan Peyton *plower += 22503041982dSJonathan Peyton incr * (team_id * chunk + (team_id < extras ? team_id : extras)); 22514cc4bb4cSJim Cownie *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr); 22524cc4bb4cSJim Cownie if (plastiter != NULL) 22534cc4bb4cSJim Cownie *plastiter = (team_id == nteams - 1); 22544cc4bb4cSJim Cownie } else { 2255414544c9SEd Maste T chunk_inc_count = 22564cc4bb4cSJim Cownie (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr; 2257414544c9SEd Maste T upper = *pupper; 22584cc4bb4cSJim Cownie KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy); 22594cc4bb4cSJim Cownie // Unknown static scheduling type. 
22604cc4bb4cSJim Cownie *plower += team_id * chunk_inc_count; 22614cc4bb4cSJim Cownie *pupper = *plower + chunk_inc_count - incr; 22624cc4bb4cSJim Cownie // Check/correct bounds if needed 22634cc4bb4cSJim Cownie if (incr > 0) { 22644cc4bb4cSJim Cownie if (*pupper < *plower) 226512313d44SJonathan Peyton *pupper = traits_t<T>::max_value; 22664cc4bb4cSJim Cownie if (plastiter != NULL) 22674cc4bb4cSJim Cownie *plastiter = *plower <= upper && *pupper > upper - incr; 22684cc4bb4cSJim Cownie if (*pupper > upper) 22694cc4bb4cSJim Cownie *pupper = upper; // tracker C73258 22704cc4bb4cSJim Cownie } else { 22714cc4bb4cSJim Cownie if (*pupper > *plower) 227212313d44SJonathan Peyton *pupper = traits_t<T>::min_value; 22734cc4bb4cSJim Cownie if (plastiter != NULL) 22744cc4bb4cSJim Cownie *plastiter = *plower >= upper && *pupper < upper - incr; 22754cc4bb4cSJim Cownie if (*pupper < upper) 22764cc4bb4cSJim Cownie *pupper = upper; // tracker C73258 22774cc4bb4cSJim Cownie } 22784cc4bb4cSJim Cownie } 22794cc4bb4cSJim Cownie } 22804cc4bb4cSJim Cownie } 22814cc4bb4cSJim Cownie 22823041982dSJonathan Peyton //----------------------------------------------------------------------------- 22835e8470afSJim Cownie // Dispatch routines 22845e8470afSJim Cownie // Transfer call to template< type T > 22855e8470afSJim Cownie // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule, 22865e8470afSJim Cownie // T lb, T ub, ST st, ST chunk ) 22875e8470afSJim Cownie extern "C" { 22885e8470afSJim Cownie 22895e8470afSJim Cownie /*! 
22905e8470afSJim Cownie @ingroup WORK_SHARING 22915e8470afSJim Cownie @{ 22925e8470afSJim Cownie @param loc Source location 22935e8470afSJim Cownie @param gtid Global thread id 22945e8470afSJim Cownie @param schedule Schedule type 22955e8470afSJim Cownie @param lb Lower bound 22965e8470afSJim Cownie @param ub Upper bound 22975e8470afSJim Cownie @param st Step (or increment if you prefer) 22985e8470afSJim Cownie @param chunk The chunk size to block with 22995e8470afSJim Cownie 23003041982dSJonathan Peyton This function prepares the runtime to start a dynamically scheduled for loop, 23013041982dSJonathan Peyton saving the loop arguments. 23025e8470afSJim Cownie These functions are all identical apart from the types of the arguments. 23035e8470afSJim Cownie */ 23045e8470afSJim Cownie 23053041982dSJonathan Peyton void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 23063041982dSJonathan Peyton enum sched_type schedule, kmp_int32 lb, 23073041982dSJonathan Peyton kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) { 23085e8470afSJim Cownie KMP_DEBUG_ASSERT(__kmp_init_serial); 230982e94a59SJoachim Protze #if OMPT_SUPPORT && OMPT_OPTIONAL 231082e94a59SJoachim Protze OMPT_STORE_RETURN_ADDRESS(gtid); 231182e94a59SJoachim Protze #endif 23125e8470afSJim Cownie __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); 23135e8470afSJim Cownie } 23145e8470afSJim Cownie /*! 
23155e8470afSJim Cownie See @ref __kmpc_dispatch_init_4 23165e8470afSJim Cownie */ 23173041982dSJonathan Peyton void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 23183041982dSJonathan Peyton enum sched_type schedule, kmp_uint32 lb, 23193041982dSJonathan Peyton kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) { 23205e8470afSJim Cownie KMP_DEBUG_ASSERT(__kmp_init_serial); 232182e94a59SJoachim Protze #if OMPT_SUPPORT && OMPT_OPTIONAL 232282e94a59SJoachim Protze OMPT_STORE_RETURN_ADDRESS(gtid); 232382e94a59SJoachim Protze #endif 23245e8470afSJim Cownie __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); 23255e8470afSJim Cownie } 23265e8470afSJim Cownie 23275e8470afSJim Cownie /*! 23285e8470afSJim Cownie See @ref __kmpc_dispatch_init_4 23295e8470afSJim Cownie */ 23303041982dSJonathan Peyton void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 23313041982dSJonathan Peyton enum sched_type schedule, kmp_int64 lb, 23323041982dSJonathan Peyton kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) { 23335e8470afSJim Cownie KMP_DEBUG_ASSERT(__kmp_init_serial); 233482e94a59SJoachim Protze #if OMPT_SUPPORT && OMPT_OPTIONAL 233582e94a59SJoachim Protze OMPT_STORE_RETURN_ADDRESS(gtid); 233682e94a59SJoachim Protze #endif 23375e8470afSJim Cownie __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); 23385e8470afSJim Cownie } 23395e8470afSJim Cownie 23405e8470afSJim Cownie /*! 
23415e8470afSJim Cownie See @ref __kmpc_dispatch_init_4 23425e8470afSJim Cownie */ 23433041982dSJonathan Peyton void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 23443041982dSJonathan Peyton enum sched_type schedule, kmp_uint64 lb, 23453041982dSJonathan Peyton kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) { 23465e8470afSJim Cownie KMP_DEBUG_ASSERT(__kmp_init_serial); 234782e94a59SJoachim Protze #if OMPT_SUPPORT && OMPT_OPTIONAL 234882e94a59SJoachim Protze OMPT_STORE_RETURN_ADDRESS(gtid); 234982e94a59SJoachim Protze #endif 23505e8470afSJim Cownie __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); 23515e8470afSJim Cownie } 23525e8470afSJim Cownie 23535e8470afSJim Cownie /*! 23544cc4bb4cSJim Cownie See @ref __kmpc_dispatch_init_4 23554cc4bb4cSJim Cownie 23564cc4bb4cSJim Cownie Difference from __kmpc_dispatch_init set of functions is these functions 23574cc4bb4cSJim Cownie are called for composite distribute parallel for construct. Thus before 23584cc4bb4cSJim Cownie regular iterations dispatching we need to calc per-team iteration space. 23594cc4bb4cSJim Cownie 23604cc4bb4cSJim Cownie These functions are all identical apart from the types of the arguments. 
23614cc4bb4cSJim Cownie */ 23623041982dSJonathan Peyton void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 23633041982dSJonathan Peyton enum sched_type schedule, kmp_int32 *p_last, 23643041982dSJonathan Peyton kmp_int32 lb, kmp_int32 ub, kmp_int32 st, 23653041982dSJonathan Peyton kmp_int32 chunk) { 23664cc4bb4cSJim Cownie KMP_DEBUG_ASSERT(__kmp_init_serial); 236782e94a59SJoachim Protze #if OMPT_SUPPORT && OMPT_OPTIONAL 236882e94a59SJoachim Protze OMPT_STORE_RETURN_ADDRESS(gtid); 236982e94a59SJoachim Protze #endif 23704cc4bb4cSJim Cownie __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st); 23714cc4bb4cSJim Cownie __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); 23724cc4bb4cSJim Cownie } 23734cc4bb4cSJim Cownie 23743041982dSJonathan Peyton void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 23753041982dSJonathan Peyton enum sched_type schedule, kmp_int32 *p_last, 23763041982dSJonathan Peyton kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, 23773041982dSJonathan Peyton kmp_int32 chunk) { 23784cc4bb4cSJim Cownie KMP_DEBUG_ASSERT(__kmp_init_serial); 237982e94a59SJoachim Protze #if OMPT_SUPPORT && OMPT_OPTIONAL 238082e94a59SJoachim Protze OMPT_STORE_RETURN_ADDRESS(gtid); 238182e94a59SJoachim Protze #endif 23824cc4bb4cSJim Cownie __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st); 23834cc4bb4cSJim Cownie __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); 23844cc4bb4cSJim Cownie } 23854cc4bb4cSJim Cownie 23863041982dSJonathan Peyton void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 23873041982dSJonathan Peyton enum sched_type schedule, kmp_int32 *p_last, 23883041982dSJonathan Peyton kmp_int64 lb, kmp_int64 ub, kmp_int64 st, 23893041982dSJonathan Peyton kmp_int64 chunk) { 23904cc4bb4cSJim Cownie KMP_DEBUG_ASSERT(__kmp_init_serial); 239182e94a59SJoachim Protze #if OMPT_SUPPORT && OMPT_OPTIONAL 239282e94a59SJoachim Protze OMPT_STORE_RETURN_ADDRESS(gtid); 
239382e94a59SJoachim Protze #endif 23944cc4bb4cSJim Cownie __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st); 23954cc4bb4cSJim Cownie __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); 23964cc4bb4cSJim Cownie } 23974cc4bb4cSJim Cownie 23983041982dSJonathan Peyton void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 23993041982dSJonathan Peyton enum sched_type schedule, kmp_int32 *p_last, 24003041982dSJonathan Peyton kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, 24013041982dSJonathan Peyton kmp_int64 chunk) { 24024cc4bb4cSJim Cownie KMP_DEBUG_ASSERT(__kmp_init_serial); 240382e94a59SJoachim Protze #if OMPT_SUPPORT && OMPT_OPTIONAL 240482e94a59SJoachim Protze OMPT_STORE_RETURN_ADDRESS(gtid); 240582e94a59SJoachim Protze #endif 24064cc4bb4cSJim Cownie __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st); 24074cc4bb4cSJim Cownie __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); 24084cc4bb4cSJim Cownie } 24094cc4bb4cSJim Cownie 24104cc4bb4cSJim Cownie /*! 24115e8470afSJim Cownie @param loc Source code location 24125e8470afSJim Cownie @param gtid Global thread id 24133041982dSJonathan Peyton @param p_last Pointer to a flag set to one if this is the last chunk or zero 24143041982dSJonathan Peyton otherwise 24155e8470afSJim Cownie @param p_lb Pointer to the lower bound for the next chunk of work 24165e8470afSJim Cownie @param p_ub Pointer to the upper bound for the next chunk of work 24175e8470afSJim Cownie @param p_st Pointer to the stride for the next chunk of work 24185e8470afSJim Cownie @return one if there is work to be done, zero otherwise 24195e8470afSJim Cownie 24205e8470afSJim Cownie Get the next dynamically allocated chunk of work for this thread. 24215e8470afSJim Cownie If there is no more work, then the lb,ub and stride need not be modified. 
24225e8470afSJim Cownie */ 24233041982dSJonathan Peyton int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 24243041982dSJonathan Peyton kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) { 242582e94a59SJoachim Protze #if OMPT_SUPPORT && OMPT_OPTIONAL 242682e94a59SJoachim Protze OMPT_STORE_RETURN_ADDRESS(gtid); 242782e94a59SJoachim Protze #endif 242882e94a59SJoachim Protze return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st 242982e94a59SJoachim Protze #if OMPT_SUPPORT && OMPT_OPTIONAL 243082e94a59SJoachim Protze , 243182e94a59SJoachim Protze OMPT_LOAD_RETURN_ADDRESS(gtid) 243282e94a59SJoachim Protze #endif 243382e94a59SJoachim Protze ); 24345e8470afSJim Cownie } 24355e8470afSJim Cownie 24365e8470afSJim Cownie /*! 24375e8470afSJim Cownie See @ref __kmpc_dispatch_next_4 24385e8470afSJim Cownie */ 24393041982dSJonathan Peyton int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 24403041982dSJonathan Peyton kmp_uint32 *p_lb, kmp_uint32 *p_ub, 24413041982dSJonathan Peyton kmp_int32 *p_st) { 244282e94a59SJoachim Protze #if OMPT_SUPPORT && OMPT_OPTIONAL 244382e94a59SJoachim Protze OMPT_STORE_RETURN_ADDRESS(gtid); 244482e94a59SJoachim Protze #endif 244582e94a59SJoachim Protze return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st 244682e94a59SJoachim Protze #if OMPT_SUPPORT && OMPT_OPTIONAL 244782e94a59SJoachim Protze , 244882e94a59SJoachim Protze OMPT_LOAD_RETURN_ADDRESS(gtid) 244982e94a59SJoachim Protze #endif 245082e94a59SJoachim Protze ); 24515e8470afSJim Cownie } 24525e8470afSJim Cownie 24535e8470afSJim Cownie /*! 
24545e8470afSJim Cownie See @ref __kmpc_dispatch_next_4 24555e8470afSJim Cownie */ 24563041982dSJonathan Peyton int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 24573041982dSJonathan Peyton kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) { 245882e94a59SJoachim Protze #if OMPT_SUPPORT && OMPT_OPTIONAL 245982e94a59SJoachim Protze OMPT_STORE_RETURN_ADDRESS(gtid); 246082e94a59SJoachim Protze #endif 246182e94a59SJoachim Protze return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st 246282e94a59SJoachim Protze #if OMPT_SUPPORT && OMPT_OPTIONAL 246382e94a59SJoachim Protze , 246482e94a59SJoachim Protze OMPT_LOAD_RETURN_ADDRESS(gtid) 246582e94a59SJoachim Protze #endif 246682e94a59SJoachim Protze ); 24675e8470afSJim Cownie } 24685e8470afSJim Cownie 24695e8470afSJim Cownie /*! 24705e8470afSJim Cownie See @ref __kmpc_dispatch_next_4 24715e8470afSJim Cownie */ 24723041982dSJonathan Peyton int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 24733041982dSJonathan Peyton kmp_uint64 *p_lb, kmp_uint64 *p_ub, 24743041982dSJonathan Peyton kmp_int64 *p_st) { 247582e94a59SJoachim Protze #if OMPT_SUPPORT && OMPT_OPTIONAL 247682e94a59SJoachim Protze OMPT_STORE_RETURN_ADDRESS(gtid); 247782e94a59SJoachim Protze #endif 247882e94a59SJoachim Protze return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st 247982e94a59SJoachim Protze #if OMPT_SUPPORT && OMPT_OPTIONAL 248082e94a59SJoachim Protze , 248182e94a59SJoachim Protze OMPT_LOAD_RETURN_ADDRESS(gtid) 248282e94a59SJoachim Protze #endif 248382e94a59SJoachim Protze ); 24845e8470afSJim Cownie } 24855e8470afSJim Cownie 24865e8470afSJim Cownie /*! 24875e8470afSJim Cownie @param loc Source code location 24885e8470afSJim Cownie @param gtid Global thread id 24895e8470afSJim Cownie 24905e8470afSJim Cownie Mark the end of a dynamic loop. 
24915e8470afSJim Cownie */ 24923041982dSJonathan Peyton void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) { 24935e8470afSJim Cownie __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 24945e8470afSJim Cownie } 24955e8470afSJim Cownie 24965e8470afSJim Cownie /*! 24975e8470afSJim Cownie See @ref __kmpc_dispatch_fini_4 24985e8470afSJim Cownie */ 24993041982dSJonathan Peyton void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) { 25005e8470afSJim Cownie __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 25015e8470afSJim Cownie } 25025e8470afSJim Cownie 25035e8470afSJim Cownie /*! 25045e8470afSJim Cownie See @ref __kmpc_dispatch_fini_4 25055e8470afSJim Cownie */ 25063041982dSJonathan Peyton void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) { 25075e8470afSJim Cownie __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 25085e8470afSJim Cownie } 25095e8470afSJim Cownie 25105e8470afSJim Cownie /*! 25115e8470afSJim Cownie See @ref __kmpc_dispatch_fini_4 25125e8470afSJim Cownie */ 25133041982dSJonathan Peyton void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) { 25145e8470afSJim Cownie __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 25155e8470afSJim Cownie } 25165e8470afSJim Cownie /*! 
@} */ 25175e8470afSJim Cownie 25183041982dSJonathan Peyton //----------------------------------------------------------------------------- 2519de4749b7SJonathan Peyton // Non-template routines from kmp_dispatch.cpp used in other sources 25205e8470afSJim Cownie 25215e8470afSJim Cownie kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) { 25225e8470afSJim Cownie return value == checker; 25235e8470afSJim Cownie } 25245e8470afSJim Cownie 25255e8470afSJim Cownie kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) { 25265e8470afSJim Cownie return value != checker; 25275e8470afSJim Cownie } 25285e8470afSJim Cownie 25295e8470afSJim Cownie kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) { 25305e8470afSJim Cownie return value < checker; 25315e8470afSJim Cownie } 25325e8470afSJim Cownie 25335e8470afSJim Cownie kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) { 25345e8470afSJim Cownie return value >= checker; 25355e8470afSJim Cownie } 25365e8470afSJim Cownie 25375e8470afSJim Cownie kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) { 25385e8470afSJim Cownie return value <= checker; 25395e8470afSJim Cownie } 25405e8470afSJim Cownie 25415e8470afSJim Cownie kmp_uint32 2542e47d32f1SJonathan Peyton __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker, 25433041982dSJonathan Peyton kmp_uint32 (*pred)(kmp_uint32, kmp_uint32), 25443041982dSJonathan Peyton void *obj // Higher-level synchronization object, or NULL. 
25453041982dSJonathan Peyton ) { 25465e8470afSJim Cownie // note: we may not belong to a team at this point 2547414544c9SEd Maste volatile kmp_uint32 *spin = spinner; 2548414544c9SEd Maste kmp_uint32 check = checker; 2549414544c9SEd Maste kmp_uint32 spins; 2550414544c9SEd Maste kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred; 2551414544c9SEd Maste kmp_uint32 r; 25525e8470afSJim Cownie 2553c47afcd9SAndrey Churbanov KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin)); 25545e8470afSJim Cownie KMP_INIT_YIELD(spins); 25555e8470afSJim Cownie // main wait spin loop 25565e8470afSJim Cownie while (!f(r = TCR_4(*spin), check)) { 25575e8470afSJim Cownie KMP_FSYNC_SPIN_PREPARE(obj); 25583041982dSJonathan Peyton /* GEH - remove this since it was accidentally introduced when kmp_wait was 25593041982dSJonathan Peyton split. It causes problems with infinite recursion because of exit lock */ 25605e8470afSJim Cownie /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 25615e8470afSJim Cownie __kmp_abort_thread(); */ 2562e47d32f1SJonathan Peyton KMP_YIELD_OVERSUB_ELSE_SPIN(spins); 25635e8470afSJim Cownie } 25645e8470afSJim Cownie KMP_FSYNC_SPIN_ACQUIRED(obj); 25655e8470afSJim Cownie return r; 25665e8470afSJim Cownie } 25675e8470afSJim Cownie 2568e47d32f1SJonathan Peyton void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker, 2569e47d32f1SJonathan Peyton kmp_uint32 (*pred)(void *, kmp_uint32), 2570f7cc6affSPaul Osmialowski void *obj // Higher-level synchronization object, or NULL. 
25713041982dSJonathan Peyton ) { 2572f7cc6affSPaul Osmialowski // note: we may not belong to a team at this point 2573414544c9SEd Maste void *spin = spinner; 2574414544c9SEd Maste kmp_uint32 check = checker; 2575414544c9SEd Maste kmp_uint32 spins; 2576414544c9SEd Maste kmp_uint32 (*f)(void *, kmp_uint32) = pred; 2577f7cc6affSPaul Osmialowski 2578f7cc6affSPaul Osmialowski KMP_FSYNC_SPIN_INIT(obj, spin); 2579f7cc6affSPaul Osmialowski KMP_INIT_YIELD(spins); 2580f7cc6affSPaul Osmialowski // main wait spin loop 2581f7cc6affSPaul Osmialowski while (!f(spin, check)) { 2582f7cc6affSPaul Osmialowski KMP_FSYNC_SPIN_PREPARE(obj); 2583e47d32f1SJonathan Peyton /* if we have waited a bit, or are noversubscribed, yield */ 2584f7cc6affSPaul Osmialowski /* pause is in the following code */ 2585e47d32f1SJonathan Peyton KMP_YIELD_OVERSUB_ELSE_SPIN(spins); 2586f7cc6affSPaul Osmialowski } 2587f7cc6affSPaul Osmialowski KMP_FSYNC_SPIN_ACQUIRED(obj); 2588f7cc6affSPaul Osmialowski } 2589f7cc6affSPaul Osmialowski 25905e8470afSJim Cownie } // extern "C" 25915e8470afSJim Cownie 25925e8470afSJim Cownie #ifdef KMP_GOMP_COMPAT 25935e8470afSJim Cownie 25943041982dSJonathan Peyton void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 25953041982dSJonathan Peyton enum sched_type schedule, kmp_int32 lb, 25963041982dSJonathan Peyton kmp_int32 ub, kmp_int32 st, kmp_int32 chunk, 25973041982dSJonathan Peyton int push_ws) { 25985e8470afSJim Cownie __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, 25995e8470afSJim Cownie push_ws); 26005e8470afSJim Cownie } 26015e8470afSJim Cownie 26023041982dSJonathan Peyton void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 26033041982dSJonathan Peyton enum sched_type schedule, kmp_uint32 lb, 26043041982dSJonathan Peyton kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk, 26053041982dSJonathan Peyton int push_ws) { 26065e8470afSJim Cownie __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, 26075e8470afSJim Cownie 
push_ws); 26085e8470afSJim Cownie } 26095e8470afSJim Cownie 26103041982dSJonathan Peyton void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 26113041982dSJonathan Peyton enum sched_type schedule, kmp_int64 lb, 26123041982dSJonathan Peyton kmp_int64 ub, kmp_int64 st, kmp_int64 chunk, 26133041982dSJonathan Peyton int push_ws) { 26145e8470afSJim Cownie __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, 26155e8470afSJim Cownie push_ws); 26165e8470afSJim Cownie } 26175e8470afSJim Cownie 26183041982dSJonathan Peyton void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 26193041982dSJonathan Peyton enum sched_type schedule, kmp_uint64 lb, 26203041982dSJonathan Peyton kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk, 26213041982dSJonathan Peyton int push_ws) { 26225e8470afSJim Cownie __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, 26235e8470afSJim Cownie push_ws); 26245e8470afSJim Cownie } 26255e8470afSJim Cownie 26263041982dSJonathan Peyton void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) { 26275e8470afSJim Cownie __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc); 26285e8470afSJim Cownie } 26295e8470afSJim Cownie 26303041982dSJonathan Peyton void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) { 26315e8470afSJim Cownie __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc); 26325e8470afSJim Cownie } 26335e8470afSJim Cownie 26343041982dSJonathan Peyton void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) { 26355e8470afSJim Cownie __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc); 26365e8470afSJim Cownie } 26375e8470afSJim Cownie 26383041982dSJonathan Peyton void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) { 26395e8470afSJim Cownie __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc); 26405e8470afSJim Cownie } 26415e8470afSJim Cownie 26425e8470afSJim Cownie #endif /* KMP_GOMP_COMPAT */ 26435e8470afSJim Cownie 26445e8470afSJim Cownie /* 
------------------------------------------------------------------------ */ 2645