1 /* 2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch. 3 */ 4 5 //===----------------------------------------------------------------------===// 6 // 7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 8 // See https://llvm.org/LICENSE.txt for license information. 9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 10 // 11 //===----------------------------------------------------------------------===// 12 13 /* Dynamic scheduling initialization and dispatch. 14 * 15 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however 16 * it may change values between parallel regions. __kmp_max_nth 17 * is the largest value __kmp_nth may take, 1 is the smallest. 18 */ 19 20 #include "kmp.h" 21 #include "kmp_error.h" 22 #include "kmp_i18n.h" 23 #include "kmp_itt.h" 24 #include "kmp_stats.h" 25 #include "kmp_str.h" 26 #if KMP_USE_X87CONTROL 27 #include <float.h> 28 #endif 29 #include "kmp_lock.h" 30 #include "kmp_dispatch.h" 31 #if KMP_USE_HIER_SCHED 32 #include "kmp_dispatch_hier.h" 33 #endif 34 35 #if OMPT_SUPPORT 36 #include "ompt-specific.h" 37 #endif 38 39 /* ------------------------------------------------------------------------ */ 40 /* ------------------------------------------------------------------------ */ 41 42 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 43 kmp_info_t *th; 44 45 KMP_DEBUG_ASSERT(gtid_ref); 46 47 if (__kmp_env_consistency_check) { 48 th = __kmp_threads[*gtid_ref]; 49 if (th->th.th_root->r.r_active && 50 (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) { 51 #if KMP_USE_DYNAMIC_LOCK 52 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0); 53 #else 54 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL); 55 #endif 56 } 57 } 58 } 59 60 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 61 kmp_info_t *th; 62 63 if (__kmp_env_consistency_check) { 64 th = __kmp_threads[*gtid_ref]; 65 if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) { 66 __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref); 67 } 68 } 69 } 70 71 // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC 72 static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule, 73 bool use_hier = false) { 74 // Pick up the nonmonotonic/monotonic bits from the scheduling type 75 // TODO: make nonmonotonic when static_steal is fixed 76 int monotonicity = SCHEDULE_MONOTONIC; 77 78 // Let default be monotonic for executables 79 // compiled with OpenMP* 4.5 or less compilers 80 if (loc != NULL && loc->get_openmp_version() < 50) 81 monotonicity = SCHEDULE_MONOTONIC; 82 83 if (use_hier || __kmp_force_monotonic) 84 monotonicity = SCHEDULE_MONOTONIC; 85 else if (SCHEDULE_HAS_NONMONOTONIC(schedule)) 86 monotonicity = SCHEDULE_NONMONOTONIC; 87 else if (SCHEDULE_HAS_MONOTONIC(schedule)) 88 monotonicity = SCHEDULE_MONOTONIC; 89 90 return monotonicity; 91 } 92 93 #if KMP_STATIC_STEAL_ENABLED 94 enum { // values for steal_flag (possible states of private per-loop buffer) 95 UNUSED = 0, 96 CLAIMED = 1, // owner thread started initialization 97 READY = 2, // available for stealing 98 THIEF = 3 // finished by owner, or claimed by thief 99 // possible state changes: 100 // 0 -> 1 owner only, sync 101 // 0 -> 3 thief only, sync 102 // 1 -> 2 owner only, async 103 // 2 -> 3 owner only, async 104 // 3 -> 2 owner only, async 105 // 3 -> 0 last thread finishing the loop, async 106 }; 107 #endif 108 109 // Initialize a 
dispatch_private_info_template<T> buffer for a particular 110 // type of schedule,chunk. The loop description is found in lb (lower bound), 111 // ub (upper bound), and st (stride). nproc is the number of threads relevant 112 // to the scheduling (often the number of threads in a team, but not always if 113 // hierarchical scheduling is used). tid is the id of the thread calling 114 // the function within the group of nproc threads. It will have a value 115 // between 0 and nproc - 1. This is often just the thread id within a team, but 116 // is not necessarily the case when using hierarchical scheduling. 117 // loc is the source file location of the corresponding loop 118 // gtid is the global thread id 119 template <typename T> 120 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, 121 dispatch_private_info_template<T> *pr, 122 enum sched_type schedule, T lb, T ub, 123 typename traits_t<T>::signed_t st, 124 #if USE_ITT_BUILD 125 kmp_uint64 *cur_chunk, 126 #endif 127 typename traits_t<T>::signed_t chunk, 128 T nproc, T tid) { 129 typedef typename traits_t<T>::unsigned_t UT; 130 typedef typename traits_t<T>::floating_t DBL; 131 132 int active; 133 T tc; 134 kmp_info_t *th; 135 kmp_team_t *team; 136 int monotonicity; 137 bool use_hier; 138 139 #ifdef KMP_DEBUG 140 typedef typename traits_t<T>::signed_t ST; 141 { 142 char *buff; 143 // create format specifiers before the debug output 144 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called " 145 "pr:%%p lb:%%%s ub:%%%s st:%%%s " 146 "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n", 147 traits_t<T>::spec, traits_t<T>::spec, 148 traits_t<ST>::spec, traits_t<ST>::spec, 149 traits_t<T>::spec, traits_t<T>::spec); 150 KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid)); 151 __kmp_str_free(&buff); 152 } 153 #endif 154 /* setup data */ 155 th = __kmp_threads[gtid]; 156 team = th->th.th_team; 157 active = !team->t.t_serialized; 158 159 #if USE_ITT_BUILD 160 int itt_need_metadata_reporting = 161 __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 162 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && 163 team->t.t_active_level == 1; 164 #endif 165 166 #if KMP_USE_HIER_SCHED 167 use_hier = pr->flags.use_hier; 168 #else 169 use_hier = false; 170 #endif 171 172 /* Pick up the nonmonotonic/monotonic bits from the scheduling type */ 173 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier); 174 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 175 176 /* Pick up the nomerge/ordered bits from the scheduling type */ 177 if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) { 178 pr->flags.nomerge = TRUE; 179 schedule = 180 (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); 181 } else { 182 pr->flags.nomerge = FALSE; 183 } 184 pr->type_size = traits_t<T>::type_size; // remember the size of variables 185 if (kmp_ord_lower & schedule) { 186 pr->flags.ordered = TRUE; 187 schedule = 188 (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); 189 } else { 190 pr->flags.ordered = FALSE; 191 } 192 // Ordered overrides nonmonotonic 193 if (pr->flags.ordered) { 194 monotonicity = SCHEDULE_MONOTONIC; 195 } 196 197 if (schedule == kmp_sch_static) { 198 schedule = __kmp_static; 199 } else { 200 if (schedule == kmp_sch_runtime) { 201 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if 202 // not specified) 203 schedule = team->t.t_sched.r_sched_type; 204 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier); 205 schedule = 
SCHEDULE_WITHOUT_MODIFIERS(schedule); 206 if (pr->flags.ordered) // correct monotonicity for ordered loop if needed 207 monotonicity = SCHEDULE_MONOTONIC; 208 // Detail the schedule if needed (global controls are differentiated 209 // appropriately) 210 if (schedule == kmp_sch_guided_chunked) { 211 schedule = __kmp_guided; 212 } else if (schedule == kmp_sch_static) { 213 schedule = __kmp_static; 214 } 215 // Use the chunk size specified by OMP_SCHEDULE (or default if not 216 // specified) 217 chunk = team->t.t_sched.chunk; 218 #if USE_ITT_BUILD 219 if (cur_chunk) 220 *cur_chunk = chunk; 221 #endif 222 #ifdef KMP_DEBUG 223 { 224 char *buff; 225 // create format specifiers before the debug output 226 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: " 227 "schedule:%%d chunk:%%%s\n", 228 traits_t<ST>::spec); 229 KD_TRACE(10, (buff, gtid, schedule, chunk)); 230 __kmp_str_free(&buff); 231 } 232 #endif 233 } else { 234 if (schedule == kmp_sch_guided_chunked) { 235 schedule = __kmp_guided; 236 } 237 if (chunk <= 0) { 238 chunk = KMP_DEFAULT_CHUNK; 239 } 240 } 241 242 if (schedule == kmp_sch_auto) { 243 // mapping and differentiation: in the __kmp_do_serial_initialize() 244 schedule = __kmp_auto; 245 #ifdef KMP_DEBUG 246 { 247 char *buff; 248 // create format specifiers before the debug output 249 buff = __kmp_str_format( 250 "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: " 251 "schedule:%%d chunk:%%%s\n", 252 traits_t<ST>::spec); 253 KD_TRACE(10, (buff, gtid, schedule, chunk)); 254 __kmp_str_free(&buff); 255 } 256 #endif 257 } 258 #if KMP_STATIC_STEAL_ENABLED 259 // map nonmonotonic:dynamic to static steal 260 if (schedule == kmp_sch_dynamic_chunked) { 261 if (monotonicity == SCHEDULE_NONMONOTONIC) 262 schedule = kmp_sch_static_steal; 263 } 264 #endif 265 /* guided analytical not safe for too many threads */ 266 if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) { 267 schedule = kmp_sch_guided_iterative_chunked; 268 KMP_WARNING(DispatchManyThreads); 269 } 270 if (schedule == kmp_sch_runtime_simd) { 271 // compiler provides simd_width in the chunk parameter 272 schedule = team->t.t_sched.r_sched_type; 273 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier); 274 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 275 // Detail the schedule if needed (global controls are differentiated 276 // appropriately) 277 if (schedule == kmp_sch_static || schedule == kmp_sch_auto || 278 schedule == __kmp_static) { 279 schedule = kmp_sch_static_balanced_chunked; 280 } else { 281 if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) { 282 schedule = kmp_sch_guided_simd; 283 } 284 chunk = team->t.t_sched.chunk * chunk; 285 } 286 #if USE_ITT_BUILD 287 if (cur_chunk) 288 *cur_chunk = chunk; 289 #endif 290 #ifdef KMP_DEBUG 291 { 292 char *buff; 293 // create format specifiers before the debug output 294 buff = __kmp_str_format( 295 "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d" 296 " chunk:%%%s\n", 297 traits_t<ST>::spec); 298 KD_TRACE(10, (buff, gtid, schedule, chunk)); 299 __kmp_str_free(&buff); 300 } 301 #endif 302 } 303 pr->u.p.parm1 = chunk; 304 } 305 KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper), 306 "unknown scheduling type"); 307 308 pr->u.p.count = 0; 309 310 if (__kmp_env_consistency_check) { 311 if (st == 0) { 312 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, 313 (pr->flags.ordered ? 
ct_pdo_ordered : ct_pdo), loc);
    }
  }
  // compute trip count
  if (st == 1) { // most common case
    if (ub >= lb) {
      tc = ub - lb + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  } else if (st < 0) {
    if (lb >= ub) {
      // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(lb - ub) / (-st) + 1;
    } else { // lb < ub
      tc = 0; // zero-trip
    }
  } else { // st > 0
    if (ub >= lb) {
      // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(ub - lb) / st + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  }

#if KMP_STATS_ENABLED
  if (KMP_MASTER_GTID(gtid)) {
    KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
  }
#endif

  pr->u.p.lb = lb;
  pr->u.p.ub = ub;
  pr->u.p.st = st;
  pr->u.p.tc = tc;

#if KMP_OS_WINDOWS
  pr->u.p.last_upper = ub + st;
#endif /* KMP_OS_WINDOWS */

  /* NOTE: only the active parallel region(s) have active ordered sections */

  if (active) {
    if (pr->flags.ordered) {
      pr->ordered_bumped = 0;
      pr->u.p.ordered_lower = 1;
      pr->u.p.ordered_upper = 0;
    }
  }

  switch (schedule) {
#if KMP_STATIC_STEAL_ENABLED
  case kmp_sch_static_steal: {
    T ntc, init;

    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    ntc = (tc % chunk ? 1 : 0) + tc / chunk;
    if (nproc > 1 && ntc >= nproc) {
      KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
      T id = tid;
      T small_chunk, extras;
      kmp_uint32 old = UNUSED;
      int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED);
      if (traits_t<T>::type_size > 4) {
        // AC: TODO: check if 16-byte CAS available and use it to
        // improve performance (probably wait for explicit request
        // before spending time on this).
        // For now use dynamically allocated per-private-buffer lock,
        // free memory in __kmp_dispatch_next when status==0.
        pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
        __kmp_init_lock(pr->u.p.steal_lock);
      }
      small_chunk = ntc / nproc;
      extras = ntc % nproc;

      init = id * small_chunk + (id < extras ? id : extras);
      pr->u.p.count = init;
      if (claimed) { // did we succeed in claiming our own buffer?
        pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
        // Other threads will inspect steal_flag when searching for a victim.
        // READY means other threads may steal from this thread from now on.
        KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
      } else {
        // another thread has stolen our whole range
        KMP_DEBUG_ASSERT(pr->steal_flag == THIEF);
        pr->u.p.ub = init; // mark that there are no iterations to work on
      }
      pr->u.p.parm2 = ntc; // save number of chunks
      // parm3 is the number of times to attempt stealing, which is
      // nproc (just a heuristic; could be optimized later on).
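      // Illustrative example (hypothetical numbers): with tc = 100 iterations,
      // chunk = 10 and nproc = 4 there are ntc = 10 chunks, so small_chunk = 2
      // and extras = 2; threads 0..3 start out owning chunk ranges [0,3),
      // [3,6), [6,8) and [8,10) via their (count, ub) pairs. parm4 below seeds
      // the search for a steal victim with the neighbouring thread id.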
409 pr->u.p.parm3 = nproc; 410 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid 411 break; 412 } else { 413 /* too few chunks: switching to kmp_sch_dynamic_chunked */ 414 schedule = kmp_sch_dynamic_chunked; 415 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to " 416 "kmp_sch_dynamic_chunked\n", 417 gtid)); 418 goto dynamic_init; 419 break; 420 } // if 421 } // case 422 #endif 423 case kmp_sch_static_balanced: { 424 T init, limit; 425 426 KD_TRACE( 427 100, 428 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n", 429 gtid)); 430 431 if (nproc > 1) { 432 T id = tid; 433 434 if (tc < nproc) { 435 if (id < tc) { 436 init = id; 437 limit = id; 438 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ 439 } else { 440 pr->u.p.count = 1; /* means no more chunks to execute */ 441 pr->u.p.parm1 = FALSE; 442 break; 443 } 444 } else { 445 T small_chunk = tc / nproc; 446 T extras = tc % nproc; 447 init = id * small_chunk + (id < extras ? id : extras); 448 limit = init + small_chunk - (id < extras ? 0 : 1); 449 pr->u.p.parm1 = (id == nproc - 1); 450 } 451 } else { 452 if (tc > 0) { 453 init = 0; 454 limit = tc - 1; 455 pr->u.p.parm1 = TRUE; 456 } else { 457 // zero trip count 458 pr->u.p.count = 1; /* means no more chunks to execute */ 459 pr->u.p.parm1 = FALSE; 460 break; 461 } 462 } 463 #if USE_ITT_BUILD 464 // Calculate chunk for metadata report 465 if (itt_need_metadata_reporting) 466 if (cur_chunk) 467 *cur_chunk = limit - init + 1; 468 #endif 469 if (st == 1) { 470 pr->u.p.lb = lb + init; 471 pr->u.p.ub = lb + limit; 472 } else { 473 // calculated upper bound, "ub" is user-defined upper bound 474 T ub_tmp = lb + limit * st; 475 pr->u.p.lb = lb + init * st; 476 // adjust upper bound to "ub" if needed, so that MS lastprivate will match 477 // it exactly 478 if (st > 0) { 479 pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp); 480 } else { 481 pr->u.p.ub = (ub_tmp + st < ub ? 
ub : ub_tmp); 482 } 483 } 484 if (pr->flags.ordered) { 485 pr->u.p.ordered_lower = init; 486 pr->u.p.ordered_upper = limit; 487 } 488 break; 489 } // case 490 case kmp_sch_static_balanced_chunked: { 491 // similar to balanced, but chunk adjusted to multiple of simd width 492 T nth = nproc; 493 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)" 494 " -> falling-through to static_greedy\n", 495 gtid)); 496 schedule = kmp_sch_static_greedy; 497 if (nth > 1) 498 pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1); 499 else 500 pr->u.p.parm1 = tc; 501 break; 502 } // case 503 case kmp_sch_guided_simd: 504 case kmp_sch_guided_iterative_chunked: { 505 KD_TRACE( 506 100, 507 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked" 508 " case\n", 509 gtid)); 510 511 if (nproc > 1) { 512 if ((2L * chunk + 1) * nproc >= tc) { 513 /* chunk size too large, switch to dynamic */ 514 schedule = kmp_sch_dynamic_chunked; 515 goto dynamic_init; 516 } else { 517 // when remaining iters become less than parm2 - switch to dynamic 518 pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1); 519 *(double *)&pr->u.p.parm3 = 520 guided_flt_param / (double)nproc; // may occupy parm3 and parm4 521 } 522 } else { 523 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 524 "kmp_sch_static_greedy\n", 525 gtid)); 526 schedule = kmp_sch_static_greedy; 527 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 528 KD_TRACE( 529 100, 530 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n", 531 gtid)); 532 pr->u.p.parm1 = tc; 533 } // if 534 } // case 535 break; 536 case kmp_sch_guided_analytical_chunked: { 537 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d " 538 "kmp_sch_guided_analytical_chunked case\n", 539 gtid)); 540 541 if (nproc > 1) { 542 if ((2L * chunk + 1) * nproc >= tc) { 543 /* chunk size too large, switch to dynamic */ 544 schedule = kmp_sch_dynamic_chunked; 545 goto dynamic_init; 546 } else { 547 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 548 DBL x; 549 550 #if KMP_USE_X87CONTROL 551 /* Linux* OS already has 64-bit computation by default for long double, 552 and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On 553 Windows* OS on IA-32 architecture, we need to set precision to 64-bit 554 instead of the default 53-bit. Even though long double doesn't work 555 on Windows* OS on Intel(R) 64, the resulting lack of precision is not 556 expected to impact the correctness of the algorithm, but this has not 557 been mathematically proven. 
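           The extra bits matter here because the solver below works with
           powers of (2*nproc - 1)/(2*nproc), a value that approaches 1 as
           nproc grows; the saved control word is restored once the crossover
           point has been computed.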
*/ 558 // save original FPCW and set precision to 64-bit, as 559 // Windows* OS on IA-32 architecture defaults to 53-bit 560 unsigned int oldFpcw = _control87(0, 0); 561 _control87(_PC_64, _MCW_PC); // 0,0x30000 562 #endif 563 /* value used for comparison in solver for cross-over point */ 564 long double target = ((long double)chunk * 2 + 1) * nproc / tc; 565 566 /* crossover point--chunk indexes equal to or greater than 567 this point switch to dynamic-style scheduling */ 568 UT cross; 569 570 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 571 x = 1.0 - 0.5 / (double)nproc; 572 573 #ifdef KMP_DEBUG 574 { // test natural alignment 575 struct _test_a { 576 char a; 577 union { 578 char b; 579 DBL d; 580 }; 581 } t; 582 ptrdiff_t natural_alignment = 583 (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; 584 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long 585 // long)natural_alignment ); 586 KMP_DEBUG_ASSERT( 587 (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0); 588 } 589 #endif // KMP_DEBUG 590 591 /* save the term in thread private dispatch structure */ 592 *(DBL *)&pr->u.p.parm3 = x; 593 594 /* solve for the crossover point to the nearest integer i for which C_i 595 <= chunk */ 596 { 597 UT left, right, mid; 598 long double p; 599 600 /* estimate initial upper and lower bound */ 601 602 /* doesn't matter what value right is as long as it is positive, but 603 it affects performance of the solver */ 604 right = 229; 605 p = __kmp_pow<UT>(x, right); 606 if (p > target) { 607 do { 608 p *= p; 609 right <<= 1; 610 } while (p > target && right < (1 << 27)); 611 /* lower bound is previous (failed) estimate of upper bound */ 612 left = right >> 1; 613 } else { 614 left = 0; 615 } 616 617 /* bisection root-finding method */ 618 while (left + 1 < right) { 619 mid = (left + right) / 2; 620 if (__kmp_pow<UT>(x, mid) > target) { 621 left = mid; 622 } else { 623 right = mid; 624 } 625 } // while 626 cross = right; 627 } 628 /* assert sanity of computed crossover point */ 629 KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target && 630 __kmp_pow<UT>(x, cross) <= target); 631 632 /* save the crossover point in thread private dispatch structure */ 633 pr->u.p.parm2 = cross; 634 635 // C75803 636 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8)) 637 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3) 638 #else 639 #define GUIDED_ANALYTICAL_WORKAROUND (x) 640 #endif 641 /* dynamic-style scheduling offset */ 642 pr->u.p.count = tc - 643 __kmp_dispatch_guided_remaining( 644 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - 645 cross * chunk; 646 #if KMP_USE_X87CONTROL 647 // restore FPCW 648 _control87(oldFpcw, _MCW_PC); 649 #endif 650 } // if 651 } else { 652 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 653 "kmp_sch_static_greedy\n", 654 gtid)); 655 schedule = kmp_sch_static_greedy; 656 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 657 pr->u.p.parm1 = tc; 658 } // if 659 } // case 660 break; 661 case kmp_sch_static_greedy: 662 KD_TRACE( 663 100, 664 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n", 665 gtid)); 666 pr->u.p.parm1 = (nproc > 1) ? 
(tc + nproc - 1) / nproc : tc; 667 break; 668 case kmp_sch_static_chunked: 669 case kmp_sch_dynamic_chunked: 670 dynamic_init: 671 if (tc == 0) 672 break; 673 if (pr->u.p.parm1 <= 0) 674 pr->u.p.parm1 = KMP_DEFAULT_CHUNK; 675 else if (pr->u.p.parm1 > tc) 676 pr->u.p.parm1 = tc; 677 // Store the total number of chunks to prevent integer overflow during 678 // bounds calculations in the get next chunk routine. 679 pr->u.p.parm2 = (tc / pr->u.p.parm1) + (tc % pr->u.p.parm1 ? 1 : 0); 680 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d " 681 "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", 682 gtid)); 683 break; 684 case kmp_sch_trapezoidal: { 685 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ 686 687 T parm1, parm2, parm3, parm4; 688 KD_TRACE(100, 689 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n", 690 gtid)); 691 692 parm1 = chunk; 693 694 /* F : size of the first cycle */ 695 parm2 = (tc / (2 * nproc)); 696 697 if (parm2 < 1) { 698 parm2 = 1; 699 } 700 701 /* L : size of the last cycle. Make sure the last cycle is not larger 702 than the first cycle. */ 703 if (parm1 < 1) { 704 parm1 = 1; 705 } else if (parm1 > parm2) { 706 parm1 = parm2; 707 } 708 709 /* N : number of cycles */ 710 parm3 = (parm2 + parm1); 711 parm3 = (2 * tc + parm3 - 1) / parm3; 712 713 if (parm3 < 2) { 714 parm3 = 2; 715 } 716 717 /* sigma : decreasing incr of the trapezoid */ 718 parm4 = (parm3 - 1); 719 parm4 = (parm2 - parm1) / parm4; 720 721 // pointless check, because parm4 >= 0 always 722 // if ( parm4 < 0 ) { 723 // parm4 = 0; 724 //} 725 726 pr->u.p.parm1 = parm1; 727 pr->u.p.parm2 = parm2; 728 pr->u.p.parm3 = parm3; 729 pr->u.p.parm4 = parm4; 730 } // case 731 break; 732 733 default: { 734 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 735 KMP_HNT(GetNewerLibrary), // Hint 736 __kmp_msg_null // Variadic argument list terminator 737 ); 738 } break; 739 } // switch 740 pr->schedule = schedule; 741 } 742 743 #if KMP_USE_HIER_SCHED 744 template <typename T> 745 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub, 746 typename traits_t<T>::signed_t st); 747 template <> 748 inline void 749 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb, 750 kmp_int32 ub, kmp_int32 st) { 751 __kmp_dispatch_init_hierarchy<kmp_int32>( 752 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 753 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st); 754 } 755 template <> 756 inline void 757 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb, 758 kmp_uint32 ub, kmp_int32 st) { 759 __kmp_dispatch_init_hierarchy<kmp_uint32>( 760 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 761 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st); 762 } 763 template <> 764 inline void 765 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb, 766 kmp_int64 ub, kmp_int64 st) { 767 __kmp_dispatch_init_hierarchy<kmp_int64>( 768 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 769 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st); 770 } 771 template <> 772 inline void 773 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb, 774 kmp_uint64 ub, kmp_int64 st) { 775 __kmp_dispatch_init_hierarchy<kmp_uint64>( 776 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 777 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st); 778 } 779 780 // free all the hierarchy scheduling memory associated with the team 781 void 
__kmp_dispatch_free_hierarchies(kmp_team_t *team) { 782 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2; 783 for (int i = 0; i < num_disp_buff; ++i) { 784 // type does not matter here so use kmp_int32 785 auto sh = 786 reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>( 787 &team->t.t_disp_buffer[i]); 788 if (sh->hier) { 789 sh->hier->deallocate(); 790 __kmp_free(sh->hier); 791 } 792 } 793 } 794 #endif 795 796 // UT - unsigned flavor of T, ST - signed flavor of T, 797 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8 798 template <typename T> 799 static void 800 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb, 801 T ub, typename traits_t<T>::signed_t st, 802 typename traits_t<T>::signed_t chunk, int push_ws) { 803 typedef typename traits_t<T>::unsigned_t UT; 804 805 int active; 806 kmp_info_t *th; 807 kmp_team_t *team; 808 kmp_uint32 my_buffer_index; 809 dispatch_private_info_template<T> *pr; 810 dispatch_shared_info_template<T> volatile *sh; 811 812 KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) == 813 sizeof(dispatch_private_info)); 814 KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) == 815 sizeof(dispatch_shared_info)); 816 __kmp_assert_valid_gtid(gtid); 817 818 if (!TCR_4(__kmp_init_parallel)) 819 __kmp_parallel_initialize(); 820 821 __kmp_resume_if_soft_paused(); 822 823 #if INCLUDE_SSC_MARKS 824 SSC_MARK_DISPATCH_INIT(); 825 #endif 826 #ifdef KMP_DEBUG 827 typedef typename traits_t<T>::signed_t ST; 828 { 829 char *buff; 830 // create format specifiers before the debug output 831 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d " 832 "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n", 833 traits_t<ST>::spec, traits_t<T>::spec, 834 traits_t<T>::spec, traits_t<ST>::spec); 835 KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st)); 836 __kmp_str_free(&buff); 837 } 838 #endif 839 /* setup data */ 840 th = __kmp_threads[gtid]; 841 team = th->th.th_team; 842 active = !team->t.t_serialized; 843 th->th.th_ident = loc; 844 845 // Any half-decent optimizer will remove this test when the blocks are empty 846 // since the macros expand to nothing 847 // when statistics are disabled. 848 if (schedule == __kmp_static) { 849 KMP_COUNT_BLOCK(OMP_LOOP_STATIC); 850 } else { 851 KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC); 852 } 853 854 #if KMP_USE_HIER_SCHED 855 // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable 856 // Hierarchical scheduling does not work with ordered, so if ordered is 857 // detected, then revert back to threaded scheduling. 858 bool ordered; 859 enum sched_type my_sched = schedule; 860 my_buffer_index = th->th.th_dispatch->th_disp_index; 861 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 862 &th->th.th_dispatch 863 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 864 my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched); 865 if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper)) 866 my_sched = 867 (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower)); 868 ordered = (kmp_ord_lower & my_sched); 869 if (pr->flags.use_hier) { 870 if (ordered) { 871 KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. 
" 872 "Disabling hierarchical scheduling.\n", 873 gtid)); 874 pr->flags.use_hier = FALSE; 875 } 876 } 877 if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) { 878 // Don't use hierarchical for ordered parallel loops and don't 879 // use the runtime hierarchy if one was specified in the program 880 if (!ordered && !pr->flags.use_hier) 881 __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st); 882 } 883 #endif // KMP_USE_HIER_SCHED 884 885 #if USE_ITT_BUILD 886 kmp_uint64 cur_chunk = chunk; 887 int itt_need_metadata_reporting = 888 __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 889 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && 890 team->t.t_active_level == 1; 891 #endif 892 if (!active) { 893 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 894 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 895 } else { 896 KMP_DEBUG_ASSERT(th->th.th_dispatch == 897 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 898 899 my_buffer_index = th->th.th_dispatch->th_disp_index++; 900 901 /* What happens when number of threads changes, need to resize buffer? */ 902 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 903 &th->th.th_dispatch 904 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 905 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 906 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 907 KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid, 908 my_buffer_index)); 909 if (sh->buffer_index != my_buffer_index) { // too many loops in progress? 910 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d" 911 " sh->buffer_index:%d\n", 912 gtid, my_buffer_index, sh->buffer_index)); 913 __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index, 914 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL)); 915 // Note: KMP_WAIT() cannot be used there: buffer index and 916 // my_buffer_index are *always* 32-bit integers. 
917 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d " 918 "sh->buffer_index:%d\n", 919 gtid, my_buffer_index, sh->buffer_index)); 920 } 921 } 922 923 __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st, 924 #if USE_ITT_BUILD 925 &cur_chunk, 926 #endif 927 chunk, (T)th->th.th_team_nproc, 928 (T)th->th.th_info.ds.ds_tid); 929 if (active) { 930 if (pr->flags.ordered == 0) { 931 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error; 932 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error; 933 } else { 934 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>; 935 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>; 936 } 937 th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr; 938 th->th.th_dispatch->th_dispatch_sh_current = 939 CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh); 940 #if USE_ITT_BUILD 941 if (pr->flags.ordered) { 942 __kmp_itt_ordered_init(gtid); 943 } 944 // Report loop metadata 945 if (itt_need_metadata_reporting) { 946 // Only report metadata by primary thread of active team at level 1 947 kmp_uint64 schedtype = 0; 948 switch (schedule) { 949 case kmp_sch_static_chunked: 950 case kmp_sch_static_balanced: // Chunk is calculated in the switch above 951 break; 952 case kmp_sch_static_greedy: 953 cur_chunk = pr->u.p.parm1; 954 break; 955 case kmp_sch_dynamic_chunked: 956 schedtype = 1; 957 break; 958 case kmp_sch_guided_iterative_chunked: 959 case kmp_sch_guided_analytical_chunked: 960 case kmp_sch_guided_simd: 961 schedtype = 2; 962 break; 963 default: 964 // Should we put this case under "static"? 965 // case kmp_sch_static_steal: 966 schedtype = 3; 967 break; 968 } 969 __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk); 970 } 971 #if KMP_USE_HIER_SCHED 972 if (pr->flags.use_hier) { 973 pr->u.p.count = 0; 974 pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0; 975 } 976 #endif // KMP_USER_HIER_SCHED 977 #endif /* USE_ITT_BUILD */ 978 } 979 980 #ifdef KMP_DEBUG 981 { 982 char *buff; 983 // create format specifiers before the debug output 984 buff = __kmp_str_format( 985 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s " 986 "lb:%%%s ub:%%%s" 987 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" 988 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n", 989 traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec, 990 traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec, 991 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec, 992 traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec); 993 KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb, 994 pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count, 995 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1, 996 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4)); 997 __kmp_str_free(&buff); 998 } 999 #endif 1000 #if OMPT_SUPPORT && OMPT_OPTIONAL 1001 if (ompt_enabled.ompt_callback_work) { 1002 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 1003 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 1004 ompt_callbacks.ompt_callback(ompt_callback_work)( 1005 ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data), 1006 &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid)); 1007 } 1008 #endif 1009 KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic); 1010 } 1011 1012 /* For ordered loops, either __kmp_dispatch_finish() should be called after 1013 * every iteration, or __kmp_dispatch_finish_chunk() should be called after 1014 * 
every chunk of iterations. If the ordered section(s) were not executed 1015 * for this iteration (or every iteration in this chunk), we need to set the 1016 * ordered iteration counters so that the next thread can proceed. */ 1017 template <typename UT> 1018 static void __kmp_dispatch_finish(int gtid, ident_t *loc) { 1019 typedef typename traits_t<UT>::signed_t ST; 1020 __kmp_assert_valid_gtid(gtid); 1021 kmp_info_t *th = __kmp_threads[gtid]; 1022 1023 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid)); 1024 if (!th->th.th_team->t.t_serialized) { 1025 1026 dispatch_private_info_template<UT> *pr = 1027 reinterpret_cast<dispatch_private_info_template<UT> *>( 1028 th->th.th_dispatch->th_dispatch_pr_current); 1029 dispatch_shared_info_template<UT> volatile *sh = 1030 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 1031 th->th.th_dispatch->th_dispatch_sh_current); 1032 KMP_DEBUG_ASSERT(pr); 1033 KMP_DEBUG_ASSERT(sh); 1034 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1035 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1036 1037 if (pr->ordered_bumped) { 1038 KD_TRACE( 1039 1000, 1040 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1041 gtid)); 1042 pr->ordered_bumped = 0; 1043 } else { 1044 UT lower = pr->u.p.ordered_lower; 1045 1046 #ifdef KMP_DEBUG 1047 { 1048 char *buff; 1049 // create format specifiers before the debug output 1050 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: " 1051 "ordered_iteration:%%%s lower:%%%s\n", 1052 traits_t<UT>::spec, traits_t<UT>::spec); 1053 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 1054 __kmp_str_free(&buff); 1055 } 1056 #endif 1057 1058 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower, 1059 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 1060 KMP_MB(); /* is this necessary? 
*/ 1061 #ifdef KMP_DEBUG 1062 { 1063 char *buff; 1064 // create format specifiers before the debug output 1065 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: " 1066 "ordered_iteration:%%%s lower:%%%s\n", 1067 traits_t<UT>::spec, traits_t<UT>::spec); 1068 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 1069 __kmp_str_free(&buff); 1070 } 1071 #endif 1072 1073 test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration); 1074 } // if 1075 } // if 1076 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid)); 1077 } 1078 1079 #ifdef KMP_GOMP_COMPAT 1080 1081 template <typename UT> 1082 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) { 1083 typedef typename traits_t<UT>::signed_t ST; 1084 __kmp_assert_valid_gtid(gtid); 1085 kmp_info_t *th = __kmp_threads[gtid]; 1086 1087 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid)); 1088 if (!th->th.th_team->t.t_serialized) { 1089 dispatch_private_info_template<UT> *pr = 1090 reinterpret_cast<dispatch_private_info_template<UT> *>( 1091 th->th.th_dispatch->th_dispatch_pr_current); 1092 dispatch_shared_info_template<UT> volatile *sh = 1093 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 1094 th->th.th_dispatch->th_dispatch_sh_current); 1095 KMP_DEBUG_ASSERT(pr); 1096 KMP_DEBUG_ASSERT(sh); 1097 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1098 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1099 1100 UT lower = pr->u.p.ordered_lower; 1101 UT upper = pr->u.p.ordered_upper; 1102 UT inc = upper - lower + 1; 1103 1104 if (pr->ordered_bumped == inc) { 1105 KD_TRACE( 1106 1000, 1107 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1108 gtid)); 1109 pr->ordered_bumped = 0; 1110 } else { 1111 inc -= pr->ordered_bumped; 1112 1113 #ifdef KMP_DEBUG 1114 { 1115 char *buff; 1116 // create format specifiers before the debug output 1117 buff = __kmp_str_format( 1118 "__kmp_dispatch_finish_chunk: T#%%d before wait: " 1119 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n", 1120 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec); 1121 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper)); 1122 __kmp_str_free(&buff); 1123 } 1124 #endif 1125 1126 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower, 1127 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 1128 1129 KMP_MB(); /* is this necessary? */ 1130 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting " 1131 "ordered_bumped to zero\n", 1132 gtid)); 1133 pr->ordered_bumped = 0; 1134 //!!!!! TODO check if the inc should be unsigned, or signed??? 
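      // Illustrative bookkeeping example (hypothetical numbers): a chunk
      // covering ordered iterations 8..11 gives lower = 8, upper = 11 and
      // inc = 4; if this thread already passed through one ordered region
      // itself (ordered_bumped was 1), the wait above was for
      // ordered_iteration >= 8 and the add below bumps it by the remaining 3.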
1135 #ifdef KMP_DEBUG 1136 { 1137 char *buff; 1138 // create format specifiers before the debug output 1139 buff = __kmp_str_format( 1140 "__kmp_dispatch_finish_chunk: T#%%d after wait: " 1141 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n", 1142 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec, 1143 traits_t<UT>::spec); 1144 KD_TRACE(1000, 1145 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper)); 1146 __kmp_str_free(&buff); 1147 } 1148 #endif 1149 1150 test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc); 1151 } 1152 // } 1153 } 1154 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid)); 1155 } 1156 1157 #endif /* KMP_GOMP_COMPAT */ 1158 1159 template <typename T> 1160 int __kmp_dispatch_next_algorithm(int gtid, 1161 dispatch_private_info_template<T> *pr, 1162 dispatch_shared_info_template<T> volatile *sh, 1163 kmp_int32 *p_last, T *p_lb, T *p_ub, 1164 typename traits_t<T>::signed_t *p_st, T nproc, 1165 T tid) { 1166 typedef typename traits_t<T>::unsigned_t UT; 1167 typedef typename traits_t<T>::signed_t ST; 1168 typedef typename traits_t<T>::floating_t DBL; 1169 int status = 0; 1170 bool last = false; 1171 T start; 1172 ST incr; 1173 UT limit, trip, init; 1174 kmp_info_t *th = __kmp_threads[gtid]; 1175 kmp_team_t *team = th->th.th_team; 1176 1177 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1178 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1179 KMP_DEBUG_ASSERT(pr); 1180 KMP_DEBUG_ASSERT(sh); 1181 KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc); 1182 #ifdef KMP_DEBUG 1183 { 1184 char *buff; 1185 // create format specifiers before the debug output 1186 buff = 1187 __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p " 1188 "sh:%%p nproc:%%%s tid:%%%s\n", 1189 traits_t<T>::spec, traits_t<T>::spec); 1190 KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid)); 1191 __kmp_str_free(&buff); 1192 } 1193 #endif 1194 1195 // zero trip count 1196 if (pr->u.p.tc == 0) { 1197 KD_TRACE(10, 1198 ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is " 1199 "zero status:%d\n", 1200 gtid, status)); 1201 return 0; 1202 } 1203 1204 switch (pr->schedule) { 1205 #if KMP_STATIC_STEAL_ENABLED 1206 case kmp_sch_static_steal: { 1207 T chunk = pr->u.p.parm1; 1208 UT nchunks = pr->u.p.parm2; 1209 KD_TRACE(100, 1210 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n", 1211 gtid)); 1212 1213 trip = pr->u.p.tc - 1; 1214 1215 if (traits_t<T>::type_size > 4) { 1216 // use lock for 8-byte induction variable. 
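      // (With an 8-byte induction type the (count, ub) pair is two 8-byte
      // fields, so it cannot be updated by a single 64-bit CAS the way the
      // 4-byte branch below does; the per-buffer steal_lock allocated in
      // __kmp_dispatch_init_algorithm serializes updates instead.)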
1217 // TODO (optional): check presence and use 16-byte CAS 1218 kmp_lock_t *lck = pr->u.p.steal_lock; 1219 KMP_DEBUG_ASSERT(lck != NULL); 1220 if (pr->u.p.count < (UT)pr->u.p.ub) { 1221 KMP_DEBUG_ASSERT(pr->steal_flag == READY); 1222 __kmp_acquire_lock(lck, gtid); 1223 // try to get own chunk of iterations 1224 init = (pr->u.p.count)++; 1225 status = (init < (UT)pr->u.p.ub); 1226 __kmp_release_lock(lck, gtid); 1227 } else { 1228 status = 0; // no own chunks 1229 } 1230 if (!status) { // try to steal 1231 kmp_lock_t *lckv; // victim buffer's lock 1232 T while_limit = pr->u.p.parm3; 1233 T while_index = 0; 1234 int idx = (th->th.th_dispatch->th_disp_index - 1) % 1235 __kmp_dispatch_num_buffers; // current loop index 1236 // note: victim thread can potentially execute another loop 1237 KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive 1238 while ((!status) && (while_limit != ++while_index)) { 1239 dispatch_private_info_template<T> *v; 1240 T remaining; 1241 T victimId = pr->u.p.parm4; 1242 T oldVictimId = victimId ? victimId - 1 : nproc - 1; 1243 v = reinterpret_cast<dispatch_private_info_template<T> *>( 1244 &team->t.t_dispatch[victimId].th_disp_buffer[idx]); 1245 KMP_DEBUG_ASSERT(v); 1246 while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) && 1247 oldVictimId != victimId) { 1248 victimId = (victimId + 1) % nproc; 1249 v = reinterpret_cast<dispatch_private_info_template<T> *>( 1250 &team->t.t_dispatch[victimId].th_disp_buffer[idx]); 1251 KMP_DEBUG_ASSERT(v); 1252 } 1253 if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) { 1254 continue; // try once more (nproc attempts in total) 1255 } 1256 if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) { 1257 kmp_uint32 old = UNUSED; 1258 // try to steal whole range from inactive victim 1259 status = v->steal_flag.compare_exchange_strong(old, THIEF); 1260 if (status) { 1261 // initialize self buffer with victim's whole range of chunks 1262 T id = victimId; 1263 T small_chunk, extras; 1264 small_chunk = nchunks / nproc; // chunks per thread 1265 extras = nchunks % nproc; 1266 init = id * small_chunk + (id < extras ? id : extras); 1267 __kmp_acquire_lock(lck, gtid); 1268 pr->u.p.count = init + 1; // exclude one we execute immediately 1269 pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0); 1270 __kmp_release_lock(lck, gtid); 1271 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid 1272 // no need to reinitialize other thread invariants: lb, st, etc. 
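            // Claiming an UNUSED victim transfers that thread's entire initial
            // share: the small_chunk/extras split recomputed here matches the
            // one used in __kmp_dispatch_init_algorithm, and the buffer is
            // re-published as READY below only if some chunks remain.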
1273 #ifdef KMP_DEBUG 1274 { 1275 char *buff; 1276 // create format specifiers before the debug output 1277 buff = __kmp_str_format( 1278 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, " 1279 "count:%%%s ub:%%%s\n", 1280 traits_t<UT>::spec, traits_t<T>::spec); 1281 KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub)); 1282 __kmp_str_free(&buff); 1283 } 1284 #endif 1285 // activate non-empty buffer and let others steal from us 1286 if (pr->u.p.count < (UT)pr->u.p.ub) 1287 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY); 1288 break; 1289 } 1290 } 1291 if (KMP_ATOMIC_LD_RLX(&v->steal_flag) != READY || 1292 v->u.p.count >= (UT)v->u.p.ub) { 1293 pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid 1294 continue; // no chunks to steal, try next victim 1295 } 1296 lckv = v->u.p.steal_lock; 1297 KMP_ASSERT(lckv != NULL); 1298 __kmp_acquire_lock(lckv, gtid); 1299 limit = v->u.p.ub; // keep initial ub 1300 if (v->u.p.count >= limit) { 1301 __kmp_release_lock(lckv, gtid); 1302 pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid 1303 continue; // no chunks to steal, try next victim 1304 } 1305 1306 // stealing succeded, reduce victim's ub by 1/4 of undone chunks 1307 // TODO: is this heuristics good enough?? 1308 remaining = limit - v->u.p.count; 1309 if (remaining > 7) { 1310 // steal 1/4 of remaining 1311 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2); 1312 init = (v->u.p.ub -= (remaining >> 2)); 1313 } else { 1314 // steal 1 chunk of 1..7 remaining 1315 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1); 1316 init = (v->u.p.ub -= 1); 1317 } 1318 __kmp_release_lock(lckv, gtid); 1319 #ifdef KMP_DEBUG 1320 { 1321 char *buff; 1322 // create format specifiers before the debug output 1323 buff = __kmp_str_format( 1324 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, " 1325 "count:%%%s ub:%%%s\n", 1326 traits_t<UT>::spec, traits_t<UT>::spec); 1327 KD_TRACE(10, (buff, gtid, victimId, init, limit)); 1328 __kmp_str_free(&buff); 1329 } 1330 #endif 1331 KMP_DEBUG_ASSERT(init + 1 <= limit); 1332 pr->u.p.parm4 = victimId; // remember victim to steal from 1333 status = 1; 1334 // now update own count and ub with stolen range excluding init chunk 1335 __kmp_acquire_lock(lck, gtid); 1336 pr->u.p.count = init + 1; 1337 pr->u.p.ub = limit; 1338 __kmp_release_lock(lck, gtid); 1339 // activate non-empty buffer and let others steal from us 1340 if (init + 1 < limit) 1341 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY); 1342 } // while (search for victim) 1343 } // if (try to find victim and steal) 1344 } else { 1345 // 4-byte induction variable, use 8-byte CAS for pair (count, ub) 1346 // as all operations on pair (count, ub) must be done atomically 1347 typedef union { 1348 struct { 1349 UT count; 1350 T ub; 1351 } p; 1352 kmp_int64 b; 1353 } union_i4; 1354 union_i4 vold, vnew; 1355 if (pr->u.p.count < (UT)pr->u.p.ub) { 1356 KMP_DEBUG_ASSERT(pr->steal_flag == READY); 1357 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 1358 vnew.b = vold.b; 1359 vnew.p.count++; // get chunk from head of self range 1360 while (!KMP_COMPARE_AND_STORE_REL64( 1361 (volatile kmp_int64 *)&pr->u.p.count, 1362 *VOLATILE_CAST(kmp_int64 *) & vold.b, 1363 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 1364 KMP_CPU_PAUSE(); 1365 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 1366 vnew.b = vold.b; 1367 vnew.p.count++; 1368 } 1369 init = vold.p.count; 1370 status = (init < (UT)vold.p.ub); 1371 } else { 1372 status = 0; // no own chunks 1373 } 1374 if (!status) { // try to steal 1375 T while_limit 
= pr->u.p.parm3; 1376 T while_index = 0; 1377 int idx = (th->th.th_dispatch->th_disp_index - 1) % 1378 __kmp_dispatch_num_buffers; // current loop index 1379 // note: victim thread can potentially execute another loop 1380 KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive 1381 while ((!status) && (while_limit != ++while_index)) { 1382 dispatch_private_info_template<T> *v; 1383 T remaining; 1384 T victimId = pr->u.p.parm4; 1385 T oldVictimId = victimId ? victimId - 1 : nproc - 1; 1386 v = reinterpret_cast<dispatch_private_info_template<T> *>( 1387 &team->t.t_dispatch[victimId].th_disp_buffer[idx]); 1388 KMP_DEBUG_ASSERT(v); 1389 while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) && 1390 oldVictimId != victimId) { 1391 victimId = (victimId + 1) % nproc; 1392 v = reinterpret_cast<dispatch_private_info_template<T> *>( 1393 &team->t.t_dispatch[victimId].th_disp_buffer[idx]); 1394 KMP_DEBUG_ASSERT(v); 1395 } 1396 if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) { 1397 continue; // try once more (nproc attempts in total) 1398 } 1399 if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) { 1400 kmp_uint32 old = UNUSED; 1401 // try to steal whole range from inactive victim 1402 status = v->steal_flag.compare_exchange_strong(old, THIEF); 1403 if (status) { 1404 // initialize self buffer with victim's whole range of chunks 1405 T id = victimId; 1406 T small_chunk, extras; 1407 small_chunk = nchunks / nproc; // chunks per thread 1408 extras = nchunks % nproc; 1409 init = id * small_chunk + (id < extras ? id : extras); 1410 vnew.p.count = init + 1; 1411 vnew.p.ub = init + small_chunk + (id < extras ? 1 : 0); 1412 // write pair (count, ub) at once atomically 1413 #if KMP_ARCH_X86 1414 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b); 1415 #else 1416 *(volatile kmp_int64 *)(&pr->u.p.count) = vnew.b; 1417 #endif 1418 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid 1419 // no need to initialize other thread invariants: lb, st, etc. 1420 #ifdef KMP_DEBUG 1421 { 1422 char *buff; 1423 // create format specifiers before the debug output 1424 buff = __kmp_str_format( 1425 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, " 1426 "count:%%%s ub:%%%s\n", 1427 traits_t<UT>::spec, traits_t<T>::spec); 1428 KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub)); 1429 __kmp_str_free(&buff); 1430 } 1431 #endif 1432 // activate non-empty buffer and let others steal from us 1433 if (pr->u.p.count < (UT)pr->u.p.ub) 1434 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY); 1435 break; 1436 } 1437 } 1438 while (1) { // CAS loop with check if victim still has enough chunks 1439 // many threads may be stealing concurrently from same victim 1440 vold.b = *(volatile kmp_int64 *)(&v->u.p.count); 1441 if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY || 1442 vold.p.count >= (UT)vold.p.ub) { 1443 pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim id 1444 break; // no chunks to steal, try next victim 1445 } 1446 vnew.b = vold.b; 1447 remaining = vold.p.ub - vold.p.count; 1448 // try to steal 1/4 of remaining 1449 // TODO: is this heuristics good enough?? 
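            // Illustrative example (hypothetical numbers): if the victim's
            // (count, ub) is (4, 24) then remaining = 20, the victim's ub is
            // lowered by 20 >> 2 = 5 to 19, and on a successful CAS the thief
            // executes chunk 19 immediately and keeps chunks 20..23 in its
            // own buffer.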
1450 if (remaining > 7) { 1451 vnew.p.ub -= remaining >> 2; // steal from tail of victim's range 1452 } else { 1453 vnew.p.ub -= 1; // steal 1 chunk of 1..7 remaining 1454 } 1455 KMP_DEBUG_ASSERT(vnew.p.ub * (UT)chunk <= trip); 1456 if (KMP_COMPARE_AND_STORE_REL64( 1457 (volatile kmp_int64 *)&v->u.p.count, 1458 *VOLATILE_CAST(kmp_int64 *) & vold.b, 1459 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 1460 // stealing succedded 1461 #ifdef KMP_DEBUG 1462 { 1463 char *buff; 1464 // create format specifiers before the debug output 1465 buff = __kmp_str_format( 1466 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, " 1467 "count:%%%s ub:%%%s\n", 1468 traits_t<T>::spec, traits_t<T>::spec); 1469 KD_TRACE(10, (buff, gtid, victimId, vnew.p.ub, vold.p.ub)); 1470 __kmp_str_free(&buff); 1471 } 1472 #endif 1473 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1474 vold.p.ub - vnew.p.ub); 1475 status = 1; 1476 pr->u.p.parm4 = victimId; // keep victim id 1477 // now update own count and ub 1478 init = vnew.p.ub; 1479 vold.p.count = init + 1; 1480 #if KMP_ARCH_X86 1481 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b); 1482 #else 1483 *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b; 1484 #endif 1485 // activate non-empty buffer and let others steal from us 1486 if (vold.p.count < (UT)vold.p.ub) 1487 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY); 1488 break; 1489 } // if (check CAS result) 1490 KMP_CPU_PAUSE(); // CAS failed, repeatedly attempt 1491 } // while (try to steal from particular victim) 1492 } // while (search for victim) 1493 } // if (try to find victim and steal) 1494 } // if (4-byte induction variable) 1495 if (!status) { 1496 *p_lb = 0; 1497 *p_ub = 0; 1498 if (p_st != NULL) 1499 *p_st = 0; 1500 } else { 1501 start = pr->u.p.lb; 1502 init *= chunk; 1503 limit = chunk + init - 1; 1504 incr = pr->u.p.st; 1505 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1); 1506 1507 KMP_DEBUG_ASSERT(init <= trip); 1508 // keep track of done chunks for possible early exit from stealing 1509 // TODO: count executed chunks locally with rare update of shared location 1510 // test_then_inc<ST>((volatile ST *)&sh->u.s.iteration); 1511 if ((last = (limit >= trip)) != 0) 1512 limit = trip; 1513 if (p_st != NULL) 1514 *p_st = incr; 1515 1516 if (incr == 1) { 1517 *p_lb = start + init; 1518 *p_ub = start + limit; 1519 } else { 1520 *p_lb = start + init * incr; 1521 *p_ub = start + limit * incr; 1522 } 1523 } // if 1524 break; 1525 } // case 1526 #endif // KMP_STATIC_STEAL_ENABLED 1527 case kmp_sch_static_balanced: { 1528 KD_TRACE( 1529 10, 1530 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n", 1531 gtid)); 1532 /* check if thread has any iteration to do */ 1533 if ((status = !pr->u.p.count) != 0) { 1534 pr->u.p.count = 1; 1535 *p_lb = pr->u.p.lb; 1536 *p_ub = pr->u.p.ub; 1537 last = (pr->u.p.parm1 != 0); 1538 if (p_st != NULL) 1539 *p_st = pr->u.p.st; 1540 } else { /* no iterations to do */ 1541 pr->u.p.lb = pr->u.p.ub + pr->u.p.st; 1542 } 1543 } // case 1544 break; 1545 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was 1546 merged here */ 1547 case kmp_sch_static_chunked: { 1548 T parm1; 1549 1550 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d " 1551 "kmp_sch_static_[affinity|chunked] case\n", 1552 gtid)); 1553 parm1 = pr->u.p.parm1; 1554 1555 trip = pr->u.p.tc - 1; 1556 init = parm1 * (pr->u.p.count + tid); 1557 1558 if ((status = (init <= trip)) != 0) { 1559 start = pr->u.p.lb; 1560 incr = pr->u.p.st; 1561 limit = parm1 + init - 1; 1562 1563 if 
((last = (limit >= trip)) != 0) 1564 limit = trip; 1565 1566 if (p_st != NULL) 1567 *p_st = incr; 1568 1569 pr->u.p.count += nproc; 1570 1571 if (incr == 1) { 1572 *p_lb = start + init; 1573 *p_ub = start + limit; 1574 } else { 1575 *p_lb = start + init * incr; 1576 *p_ub = start + limit * incr; 1577 } 1578 1579 if (pr->flags.ordered) { 1580 pr->u.p.ordered_lower = init; 1581 pr->u.p.ordered_upper = limit; 1582 } // if 1583 } // if 1584 } // case 1585 break; 1586 1587 case kmp_sch_dynamic_chunked: { 1588 UT chunk_number; 1589 UT chunk_size = pr->u.p.parm1; 1590 UT nchunks = pr->u.p.parm2; 1591 1592 KD_TRACE( 1593 100, 1594 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n", 1595 gtid)); 1596 1597 chunk_number = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 1598 status = (chunk_number < nchunks); 1599 if (!status) { 1600 *p_lb = 0; 1601 *p_ub = 0; 1602 if (p_st != NULL) 1603 *p_st = 0; 1604 } else { 1605 init = chunk_size * chunk_number; 1606 trip = pr->u.p.tc - 1; 1607 start = pr->u.p.lb; 1608 incr = pr->u.p.st; 1609 1610 if ((last = (trip - init < (UT)chunk_size))) 1611 limit = trip; 1612 else 1613 limit = chunk_size + init - 1; 1614 1615 if (p_st != NULL) 1616 *p_st = incr; 1617 1618 if (incr == 1) { 1619 *p_lb = start + init; 1620 *p_ub = start + limit; 1621 } else { 1622 *p_lb = start + init * incr; 1623 *p_ub = start + limit * incr; 1624 } 1625 1626 if (pr->flags.ordered) { 1627 pr->u.p.ordered_lower = init; 1628 pr->u.p.ordered_upper = limit; 1629 } // if 1630 } // if 1631 } // case 1632 break; 1633 1634 case kmp_sch_guided_iterative_chunked: { 1635 T chunkspec = pr->u.p.parm1; 1636 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked " 1637 "iterative case\n", 1638 gtid)); 1639 trip = pr->u.p.tc; 1640 // Start atomic part of calculations 1641 while (1) { 1642 ST remaining; // signed, because can be < 0 1643 init = sh->u.s.iteration; // shared value 1644 remaining = trip - init; 1645 if (remaining <= 0) { // AC: need to compare with 0 first 1646 // nothing to do, don't try atomic op 1647 status = 0; 1648 break; 1649 } 1650 if ((T)remaining < 1651 pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default 1652 // use dynamic-style schedule 1653 // atomically increment iterations, get old value 1654 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1655 (ST)chunkspec); 1656 remaining = trip - init; 1657 if (remaining <= 0) { 1658 status = 0; // all iterations got by other threads 1659 } else { 1660 // got some iterations to work on 1661 status = 1; 1662 if ((T)remaining > chunkspec) { 1663 limit = init + chunkspec - 1; 1664 } else { 1665 last = true; // the last chunk 1666 limit = init + remaining - 1; 1667 } // if 1668 } // if 1669 break; 1670 } // if 1671 limit = init + (UT)((double)remaining * 1672 *(double *)&pr->u.p.parm3); // divide by K*nproc 1673 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1674 (ST)init, (ST)limit)) { 1675 // CAS was successful, chunk obtained 1676 status = 1; 1677 --limit; 1678 break; 1679 } // if 1680 } // while 1681 if (status != 0) { 1682 start = pr->u.p.lb; 1683 incr = pr->u.p.st; 1684 if (p_st != NULL) 1685 *p_st = incr; 1686 *p_lb = start + init * incr; 1687 *p_ub = start + limit * incr; 1688 if (pr->flags.ordered) { 1689 pr->u.p.ordered_lower = init; 1690 pr->u.p.ordered_upper = limit; 1691 } // if 1692 } else { 1693 *p_lb = 0; 1694 *p_ub = 0; 1695 if (p_st != NULL) 1696 *p_st = 0; 1697 } // if 1698 } // case 1699 break; 1700 1701 case 
kmp_sch_guided_simd: { 1702 // same as iterative but curr-chunk adjusted to be multiple of given 1703 // chunk 1704 T chunk = pr->u.p.parm1; 1705 KD_TRACE(100, 1706 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n", 1707 gtid)); 1708 trip = pr->u.p.tc; 1709 // Start atomic part of calculations 1710 while (1) { 1711 ST remaining; // signed, because can be < 0 1712 init = sh->u.s.iteration; // shared value 1713 remaining = trip - init; 1714 if (remaining <= 0) { // AC: need to compare with 0 first 1715 status = 0; // nothing to do, don't try atomic op 1716 break; 1717 } 1718 KMP_DEBUG_ASSERT(init % chunk == 0); 1719 // compare with K*nproc*(chunk+1), K=2 by default 1720 if ((T)remaining < pr->u.p.parm2) { 1721 // use dynamic-style schedule 1722 // atomically increment iterations, get old value 1723 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1724 (ST)chunk); 1725 remaining = trip - init; 1726 if (remaining <= 0) { 1727 status = 0; // all iterations got by other threads 1728 } else { 1729 // got some iterations to work on 1730 status = 1; 1731 if ((T)remaining > chunk) { 1732 limit = init + chunk - 1; 1733 } else { 1734 last = true; // the last chunk 1735 limit = init + remaining - 1; 1736 } // if 1737 } // if 1738 break; 1739 } // if 1740 // divide by K*nproc 1741 UT span; 1742 __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3), 1743 &span); 1744 UT rem = span % chunk; 1745 if (rem) // adjust so that span%chunk == 0 1746 span += chunk - rem; 1747 limit = init + span; 1748 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1749 (ST)init, (ST)limit)) { 1750 // CAS was successful, chunk obtained 1751 status = 1; 1752 --limit; 1753 break; 1754 } // if 1755 } // while 1756 if (status != 0) { 1757 start = pr->u.p.lb; 1758 incr = pr->u.p.st; 1759 if (p_st != NULL) 1760 *p_st = incr; 1761 *p_lb = start + init * incr; 1762 *p_ub = start + limit * incr; 1763 if (pr->flags.ordered) { 1764 pr->u.p.ordered_lower = init; 1765 pr->u.p.ordered_upper = limit; 1766 } // if 1767 } else { 1768 *p_lb = 0; 1769 *p_ub = 0; 1770 if (p_st != NULL) 1771 *p_st = 0; 1772 } // if 1773 } // case 1774 break; 1775 1776 case kmp_sch_guided_analytical_chunked: { 1777 T chunkspec = pr->u.p.parm1; 1778 UT chunkIdx; 1779 #if KMP_USE_X87CONTROL 1780 /* for storing original FPCW value for Windows* OS on 1781 IA-32 architecture 8-byte version */ 1782 unsigned int oldFpcw; 1783 unsigned int fpcwSet = 0; 1784 #endif 1785 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d " 1786 "kmp_sch_guided_analytical_chunked case\n", 1787 gtid)); 1788 1789 trip = pr->u.p.tc; 1790 1791 KMP_DEBUG_ASSERT(nproc > 1); 1792 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip); 1793 1794 while (1) { /* this while loop is a safeguard against unexpected zero 1795 chunk sizes */ 1796 chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 1797 if (chunkIdx >= (UT)pr->u.p.parm2) { 1798 --trip; 1799 /* use dynamic-style scheduling */ 1800 init = chunkIdx * chunkspec + pr->u.p.count; 1801 /* need to verify init > 0 in case of overflow in the above 1802 * calculation */ 1803 if ((status = (init > 0 && init <= trip)) != 0) { 1804 limit = init + chunkspec - 1; 1805 1806 if ((last = (limit >= trip)) != 0) 1807 limit = trip; 1808 } 1809 break; 1810 } else { 1811 /* use exponential-style scheduling */ 1812 /* The following check is to workaround the lack of long double precision on 1813 Windows* OS. 
1814 This check works around the possible effect that init != 0 for chunkIdx == 0. 1815 */ 1816 #if KMP_USE_X87CONTROL 1817 /* If we haven't already done so, save original 1818 FPCW and set precision to 64-bit, as Windows* OS 1819 on IA-32 architecture defaults to 53-bit */ 1820 if (!fpcwSet) { 1821 oldFpcw = _control87(0, 0); 1822 _control87(_PC_64, _MCW_PC); 1823 fpcwSet = 0x30000; 1824 } 1825 #endif 1826 if (chunkIdx) { 1827 init = __kmp_dispatch_guided_remaining<T>( 1828 trip, *(DBL *)&pr->u.p.parm3, chunkIdx); 1829 KMP_DEBUG_ASSERT(init); 1830 init = trip - init; 1831 } else 1832 init = 0; 1833 limit = trip - __kmp_dispatch_guided_remaining<T>( 1834 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1); 1835 KMP_ASSERT(init <= limit); 1836 if (init < limit) { 1837 KMP_DEBUG_ASSERT(limit <= trip); 1838 --limit; 1839 status = 1; 1840 break; 1841 } // if 1842 } // if 1843 } // while (1) 1844 #if KMP_USE_X87CONTROL 1845 /* restore FPCW if necessary 1846 AC: check fpcwSet flag first because oldFpcw can be uninitialized here 1847 */ 1848 if (fpcwSet && (oldFpcw & fpcwSet)) 1849 _control87(oldFpcw, _MCW_PC); 1850 #endif 1851 if (status != 0) { 1852 start = pr->u.p.lb; 1853 incr = pr->u.p.st; 1854 if (p_st != NULL) 1855 *p_st = incr; 1856 *p_lb = start + init * incr; 1857 *p_ub = start + limit * incr; 1858 if (pr->flags.ordered) { 1859 pr->u.p.ordered_lower = init; 1860 pr->u.p.ordered_upper = limit; 1861 } 1862 } else { 1863 *p_lb = 0; 1864 *p_ub = 0; 1865 if (p_st != NULL) 1866 *p_st = 0; 1867 } 1868 } // case 1869 break; 1870 1871 case kmp_sch_trapezoidal: { 1872 UT index; 1873 T parm2 = pr->u.p.parm2; 1874 T parm3 = pr->u.p.parm3; 1875 T parm4 = pr->u.p.parm4; 1876 KD_TRACE(100, 1877 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n", 1878 gtid)); 1879 1880 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration); 1881 1882 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2; 1883 trip = pr->u.p.tc - 1; 1884 1885 if ((status = ((T)index < parm3 && init <= trip)) == 0) { 1886 *p_lb = 0; 1887 *p_ub = 0; 1888 if (p_st != NULL) 1889 *p_st = 0; 1890 } else { 1891 start = pr->u.p.lb; 1892 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1; 1893 incr = pr->u.p.st; 1894 1895 if ((last = (limit >= trip)) != 0) 1896 limit = trip; 1897 1898 if (p_st != NULL) 1899 *p_st = incr; 1900 1901 if (incr == 1) { 1902 *p_lb = start + init; 1903 *p_ub = start + limit; 1904 } else { 1905 *p_lb = start + init * incr; 1906 *p_ub = start + limit * incr; 1907 } 1908 1909 if (pr->flags.ordered) { 1910 pr->u.p.ordered_lower = init; 1911 pr->u.p.ordered_upper = limit; 1912 } // if 1913 } // if 1914 } // case 1915 break; 1916 default: { 1917 status = 0; // to avoid complaints on uninitialized variable use 1918 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 1919 KMP_HNT(GetNewerLibrary), // Hint 1920 __kmp_msg_null // Variadic argument list terminator 1921 ); 1922 } break; 1923 } // switch 1924 if (p_last) 1925 *p_last = last; 1926 #ifdef KMP_DEBUG 1927 if (pr->flags.ordered) { 1928 char *buff; 1929 // create format specifiers before the debug output 1930 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d " 1931 "ordered_lower:%%%s ordered_upper:%%%s\n", 1932 traits_t<UT>::spec, traits_t<UT>::spec); 1933 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper)); 1934 __kmp_str_free(&buff); 1935 } 1936 { 1937 char *buff; 1938 // create format specifiers before the debug output 1939 buff = __kmp_str_format( 1940 "__kmp_dispatch_next_algorithm: T#%%d 
exit status:%%d p_last:%%d " 1941 "p_lb:%%%s p_ub:%%%s p_st:%%%s\n", 1942 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 1943 KMP_DEBUG_ASSERT(p_last); 1944 KMP_DEBUG_ASSERT(p_st); 1945 KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st)); 1946 __kmp_str_free(&buff); 1947 } 1948 #endif 1949 return status; 1950 } 1951 1952 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more 1953 work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini() 1954 is not called. */ 1955 #if OMPT_SUPPORT && OMPT_OPTIONAL 1956 #define OMPT_LOOP_END \ 1957 if (status == 0) { \ 1958 if (ompt_enabled.ompt_callback_work) { \ 1959 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \ 1960 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \ 1961 ompt_callbacks.ompt_callback(ompt_callback_work)( \ 1962 ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \ 1963 &(task_info->task_data), 0, codeptr); \ 1964 } \ 1965 } 1966 // TODO: implement count 1967 #else 1968 #define OMPT_LOOP_END // no-op 1969 #endif 1970 1971 #if KMP_STATS_ENABLED 1972 #define KMP_STATS_LOOP_END \ 1973 { \ 1974 kmp_int64 u, l, t, i; \ 1975 l = (kmp_int64)(*p_lb); \ 1976 u = (kmp_int64)(*p_ub); \ 1977 i = (kmp_int64)(pr->u.p.st); \ 1978 if (status == 0) { \ 1979 t = 0; \ 1980 KMP_POP_PARTITIONED_TIMER(); \ 1981 } else if (i == 1) { \ 1982 if (u >= l) \ 1983 t = u - l + 1; \ 1984 else \ 1985 t = 0; \ 1986 } else if (i < 0) { \ 1987 if (l >= u) \ 1988 t = (l - u) / (-i) + 1; \ 1989 else \ 1990 t = 0; \ 1991 } else { \ 1992 if (u >= l) \ 1993 t = (u - l) / i + 1; \ 1994 else \ 1995 t = 0; \ 1996 } \ 1997 KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \ 1998 } 1999 #else 2000 #define KMP_STATS_LOOP_END /* Nothing */ 2001 #endif 2002 2003 template <typename T> 2004 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, 2005 T *p_lb, T *p_ub, 2006 typename traits_t<T>::signed_t *p_st 2007 #if OMPT_SUPPORT && OMPT_OPTIONAL 2008 , 2009 void *codeptr 2010 #endif 2011 ) { 2012 2013 typedef typename traits_t<T>::unsigned_t UT; 2014 typedef typename traits_t<T>::signed_t ST; 2015 // This is potentially slightly misleading, schedule(runtime) will appear here 2016 // even if the actual runtime schedule is static. (Which points out a 2017 // disadvantage of schedule(runtime): even when static scheduling is used it 2018 // costs more than a compile time choice to use static scheduling would.) 
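// Illustrative caller pattern (a sketch only, not verbatim compiler output):
// the typed wrappers defined later in this file (__kmpc_dispatch_next_4/_4u/
// _8/_8u) call this template once per chunk, roughly as
//   while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
//     for (i = lb; i <= ub; i += st) // assuming a positive stride
//       ; // loop body
//   }
// A zero return value means no more chunks remain; the shared dispatch buffer
// is then released for reuse once all threads are done (see the status == 0
// path below).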
2019 KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling); 2020 2021 int status; 2022 dispatch_private_info_template<T> *pr; 2023 __kmp_assert_valid_gtid(gtid); 2024 kmp_info_t *th = __kmp_threads[gtid]; 2025 kmp_team_t *team = th->th.th_team; 2026 2027 KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL 2028 KD_TRACE( 2029 1000, 2030 ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n", 2031 gtid, p_lb, p_ub, p_st, p_last)); 2032 2033 if (team->t.t_serialized) { 2034 /* NOTE: serialize this dispatch because we are not at the active level */ 2035 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 2036 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 2037 KMP_DEBUG_ASSERT(pr); 2038 2039 if ((status = (pr->u.p.tc != 0)) == 0) { 2040 *p_lb = 0; 2041 *p_ub = 0; 2042 // if ( p_last != NULL ) 2043 // *p_last = 0; 2044 if (p_st != NULL) 2045 *p_st = 0; 2046 if (__kmp_env_consistency_check) { 2047 if (pr->pushed_ws != ct_none) { 2048 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 2049 } 2050 } 2051 } else if (pr->flags.nomerge) { 2052 kmp_int32 last; 2053 T start; 2054 UT limit, trip, init; 2055 ST incr; 2056 T chunk = pr->u.p.parm1; 2057 2058 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", 2059 gtid)); 2060 2061 init = chunk * pr->u.p.count++; 2062 trip = pr->u.p.tc - 1; 2063 2064 if ((status = (init <= trip)) == 0) { 2065 *p_lb = 0; 2066 *p_ub = 0; 2067 // if ( p_last != NULL ) 2068 // *p_last = 0; 2069 if (p_st != NULL) 2070 *p_st = 0; 2071 if (__kmp_env_consistency_check) { 2072 if (pr->pushed_ws != ct_none) { 2073 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 2074 } 2075 } 2076 } else { 2077 start = pr->u.p.lb; 2078 limit = chunk + init - 1; 2079 incr = pr->u.p.st; 2080 2081 if ((last = (limit >= trip)) != 0) { 2082 limit = trip; 2083 #if KMP_OS_WINDOWS 2084 pr->u.p.last_upper = pr->u.p.ub; 2085 #endif /* KMP_OS_WINDOWS */ 2086 } 2087 if (p_last != NULL) 2088 *p_last = last; 2089 if (p_st != NULL) 2090 *p_st = incr; 2091 if (incr == 1) { 2092 *p_lb = start + init; 2093 *p_ub = start + limit; 2094 } else { 2095 *p_lb = start + init * incr; 2096 *p_ub = start + limit * incr; 2097 } 2098 2099 if (pr->flags.ordered) { 2100 pr->u.p.ordered_lower = init; 2101 pr->u.p.ordered_upper = limit; 2102 #ifdef KMP_DEBUG 2103 { 2104 char *buff; 2105 // create format specifiers before the debug output 2106 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 2107 "ordered_lower:%%%s ordered_upper:%%%s\n", 2108 traits_t<UT>::spec, traits_t<UT>::spec); 2109 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 2110 pr->u.p.ordered_upper)); 2111 __kmp_str_free(&buff); 2112 } 2113 #endif 2114 } // if 2115 } // if 2116 } else { 2117 pr->u.p.tc = 0; 2118 *p_lb = pr->u.p.lb; 2119 *p_ub = pr->u.p.ub; 2120 #if KMP_OS_WINDOWS 2121 pr->u.p.last_upper = *p_ub; 2122 #endif /* KMP_OS_WINDOWS */ 2123 if (p_last != NULL) 2124 *p_last = TRUE; 2125 if (p_st != NULL) 2126 *p_st = pr->u.p.st; 2127 } // if 2128 #ifdef KMP_DEBUG 2129 { 2130 char *buff; 2131 // create format specifiers before the debug output 2132 buff = __kmp_str_format( 2133 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " 2134 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", 2135 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 2136 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, 2137 (p_last ? 
*p_last : 0), status)); 2138 __kmp_str_free(&buff); 2139 } 2140 #endif 2141 #if INCLUDE_SSC_MARKS 2142 SSC_MARK_DISPATCH_NEXT(); 2143 #endif 2144 OMPT_LOOP_END; 2145 KMP_STATS_LOOP_END; 2146 return status; 2147 } else { 2148 kmp_int32 last = 0; 2149 dispatch_shared_info_template<T> volatile *sh; 2150 2151 KMP_DEBUG_ASSERT(th->th.th_dispatch == 2152 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 2153 2154 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 2155 th->th.th_dispatch->th_dispatch_pr_current); 2156 KMP_DEBUG_ASSERT(pr); 2157 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 2158 th->th.th_dispatch->th_dispatch_sh_current); 2159 KMP_DEBUG_ASSERT(sh); 2160 2161 #if KMP_USE_HIER_SCHED 2162 if (pr->flags.use_hier) 2163 status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st); 2164 else 2165 #endif // KMP_USE_HIER_SCHED 2166 status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub, 2167 p_st, th->th.th_team_nproc, 2168 th->th.th_info.ds.ds_tid); 2169 // status == 0: no more iterations to execute 2170 if (status == 0) { 2171 ST num_done; 2172 num_done = test_then_inc<ST>(&sh->u.s.num_done); 2173 #ifdef KMP_DEBUG 2174 { 2175 char *buff; 2176 // create format specifiers before the debug output 2177 buff = __kmp_str_format( 2178 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", 2179 traits_t<ST>::spec); 2180 KD_TRACE(10, (buff, gtid, sh->u.s.num_done)); 2181 __kmp_str_free(&buff); 2182 } 2183 #endif 2184 2185 #if KMP_USE_HIER_SCHED 2186 pr->flags.use_hier = FALSE; 2187 #endif 2188 if (num_done == th->th.th_team_nproc - 1) { 2189 #if KMP_STATIC_STEAL_ENABLED 2190 if (pr->schedule == kmp_sch_static_steal) { 2191 int i; 2192 int idx = (th->th.th_dispatch->th_disp_index - 1) % 2193 __kmp_dispatch_num_buffers; // current loop index 2194 // loop complete, safe to destroy locks used for stealing 2195 for (i = 0; i < th->th.th_team_nproc; ++i) { 2196 dispatch_private_info_template<T> *buf = 2197 reinterpret_cast<dispatch_private_info_template<T> *>( 2198 &team->t.t_dispatch[i].th_disp_buffer[idx]); 2199 KMP_ASSERT(buf->steal_flag == THIEF); // buffer must be inactive 2200 KMP_ATOMIC_ST_RLX(&buf->steal_flag, UNUSED); 2201 if (traits_t<T>::type_size > 4) { 2202 // destroy locks used for stealing 2203 kmp_lock_t *lck = buf->u.p.steal_lock; 2204 KMP_ASSERT(lck != NULL); 2205 __kmp_destroy_lock(lck); 2206 __kmp_free(lck); 2207 buf->u.p.steal_lock = NULL; 2208 } 2209 } 2210 } 2211 #endif 2212 /* NOTE: release shared buffer to be reused */ 2213 2214 KMP_MB(); /* Flush all pending memory write invalidates. */ 2215 2216 sh->u.s.num_done = 0; 2217 sh->u.s.iteration = 0; 2218 2219 /* TODO replace with general release procedure? */ 2220 if (pr->flags.ordered) { 2221 sh->u.s.ordered_iteration = 0; 2222 } 2223 2224 sh->buffer_index += __kmp_dispatch_num_buffers; 2225 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", 2226 gtid, sh->buffer_index)); 2227 2228 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 2229 2230 } // if 2231 if (__kmp_env_consistency_check) { 2232 if (pr->pushed_ws != ct_none) { 2233 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 2234 } 2235 } 2236 2237 th->th.th_dispatch->th_deo_fcn = NULL; 2238 th->th.th_dispatch->th_dxo_fcn = NULL; 2239 th->th.th_dispatch->th_dispatch_sh_current = NULL; 2240 th->th.th_dispatch->th_dispatch_pr_current = NULL; 2241 } // if (status == 0) 2242 #if KMP_OS_WINDOWS 2243 else if (last) { 2244 pr->u.p.last_upper = pr->u.p.ub; 2245 } 2246 #endif /* KMP_OS_WINDOWS */ 2247 if (p_last != NULL && status != 0) 2248 *p_last = last; 2249 } // if 2250 2251 #ifdef KMP_DEBUG 2252 { 2253 char *buff; 2254 // create format specifiers before the debug output 2255 buff = __kmp_str_format( 2256 "__kmp_dispatch_next: T#%%d normal case: " 2257 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n", 2258 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 2259 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, 2260 (p_last ? *p_last : 0), status)); 2261 __kmp_str_free(&buff); 2262 } 2263 #endif 2264 #if INCLUDE_SSC_MARKS 2265 SSC_MARK_DISPATCH_NEXT(); 2266 #endif 2267 OMPT_LOOP_END; 2268 KMP_STATS_LOOP_END; 2269 return status; 2270 } 2271 2272 template <typename T> 2273 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid, 2274 kmp_int32 *plastiter, T *plower, T *pupper, 2275 typename traits_t<T>::signed_t incr) { 2276 typedef typename traits_t<T>::unsigned_t UT; 2277 kmp_uint32 team_id; 2278 kmp_uint32 nteams; 2279 UT trip_count; 2280 kmp_team_t *team; 2281 kmp_info_t *th; 2282 2283 KMP_DEBUG_ASSERT(plastiter && plower && pupper); 2284 KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid)); 2285 #ifdef KMP_DEBUG 2286 typedef typename traits_t<T>::signed_t ST; 2287 { 2288 char *buff; 2289 // create format specifiers before the debug output 2290 buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d " 2291 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n", 2292 traits_t<T>::spec, traits_t<T>::spec, 2293 traits_t<ST>::spec, traits_t<T>::spec); 2294 KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr)); 2295 __kmp_str_free(&buff); 2296 } 2297 #endif 2298 2299 if (__kmp_env_consistency_check) { 2300 if (incr == 0) { 2301 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, 2302 loc); 2303 } 2304 if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) { 2305 // The loop is illegal. 
2306 // Some zero-trip loops maintained by compiler, e.g.: 2307 // for(i=10;i<0;++i) // lower >= upper - run-time check 2308 // for(i=0;i>10;--i) // lower <= upper - run-time check 2309 // for(i=0;i>10;++i) // incr > 0 - compile-time check 2310 // for(i=10;i<0;--i) // incr < 0 - compile-time check 2311 // Compiler does not check the following illegal loops: 2312 // for(i=0;i<10;i+=incr) // where incr<0 2313 // for(i=10;i>0;i-=incr) // where incr<0 2314 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc); 2315 } 2316 } 2317 __kmp_assert_valid_gtid(gtid); 2318 th = __kmp_threads[gtid]; 2319 team = th->th.th_team; 2320 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct 2321 nteams = th->th.th_teams_size.nteams; 2322 team_id = team->t.t_master_tid; 2323 KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc); 2324 2325 // compute global trip count 2326 if (incr == 1) { 2327 trip_count = *pupper - *plower + 1; 2328 } else if (incr == -1) { 2329 trip_count = *plower - *pupper + 1; 2330 } else if (incr > 0) { 2331 // upper-lower can exceed the limit of signed type 2332 trip_count = (UT)(*pupper - *plower) / incr + 1; 2333 } else { 2334 trip_count = (UT)(*plower - *pupper) / (-incr) + 1; 2335 } 2336 2337 if (trip_count <= nteams) { 2338 KMP_DEBUG_ASSERT( 2339 __kmp_static == kmp_sch_static_greedy || 2340 __kmp_static == 2341 kmp_sch_static_balanced); // Unknown static scheduling type. 2342 // only some teams get single iteration, others get nothing 2343 if (team_id < trip_count) { 2344 *pupper = *plower = *plower + team_id * incr; 2345 } else { 2346 *plower = *pupper + incr; // zero-trip loop 2347 } 2348 if (plastiter != NULL) 2349 *plastiter = (team_id == trip_count - 1); 2350 } else { 2351 if (__kmp_static == kmp_sch_static_balanced) { 2352 UT chunk = trip_count / nteams; 2353 UT extras = trip_count % nteams; 2354 *plower += 2355 incr * (team_id * chunk + (team_id < extras ? team_id : extras)); 2356 *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr); 2357 if (plastiter != NULL) 2358 *plastiter = (team_id == nteams - 1); 2359 } else { 2360 T chunk_inc_count = 2361 (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr; 2362 T upper = *pupper; 2363 KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy); 2364 // Unknown static scheduling type. 2365 *plower += team_id * chunk_inc_count; 2366 *pupper = *plower + chunk_inc_count - incr; 2367 // Check/correct bounds if needed 2368 if (incr > 0) { 2369 if (*pupper < *plower) 2370 *pupper = traits_t<T>::max_value; 2371 if (plastiter != NULL) 2372 *plastiter = *plower <= upper && *pupper > upper - incr; 2373 if (*pupper > upper) 2374 *pupper = upper; // tracker C73258 2375 } else { 2376 if (*pupper > *plower) 2377 *pupper = traits_t<T>::min_value; 2378 if (plastiter != NULL) 2379 *plastiter = *plower >= upper && *pupper < upper - incr; 2380 if (*pupper < upper) 2381 *pupper = upper; // tracker C73258 2382 } 2383 } 2384 } 2385 } 2386 2387 //----------------------------------------------------------------------------- 2388 // Dispatch routines 2389 // Transfer call to template< type T > 2390 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule, 2391 // T lb, T ub, ST st, ST chunk ) 2392 extern "C" { 2393 2394 /*! 
2395 @ingroup WORK_SHARING 2396 @{ 2397 @param loc Source location 2398 @param gtid Global thread id 2399 @param schedule Schedule type 2400 @param lb Lower bound 2401 @param ub Upper bound 2402 @param st Step (or increment if you prefer) 2403 @param chunk The chunk size to block with 2404 2405 This function prepares the runtime to start a dynamically scheduled for loop, 2406 saving the loop arguments. 2407 These functions are all identical apart from the types of the arguments. 2408 */ 2409 2410 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2411 enum sched_type schedule, kmp_int32 lb, 2412 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) { 2413 KMP_DEBUG_ASSERT(__kmp_init_serial); 2414 #if OMPT_SUPPORT && OMPT_OPTIONAL 2415 OMPT_STORE_RETURN_ADDRESS(gtid); 2416 #endif 2417 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2418 } 2419 /*! 2420 See @ref __kmpc_dispatch_init_4 2421 */ 2422 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2423 enum sched_type schedule, kmp_uint32 lb, 2424 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) { 2425 KMP_DEBUG_ASSERT(__kmp_init_serial); 2426 #if OMPT_SUPPORT && OMPT_OPTIONAL 2427 OMPT_STORE_RETURN_ADDRESS(gtid); 2428 #endif 2429 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2430 } 2431 2432 /*! 2433 See @ref __kmpc_dispatch_init_4 2434 */ 2435 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2436 enum sched_type schedule, kmp_int64 lb, 2437 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) { 2438 KMP_DEBUG_ASSERT(__kmp_init_serial); 2439 #if OMPT_SUPPORT && OMPT_OPTIONAL 2440 OMPT_STORE_RETURN_ADDRESS(gtid); 2441 #endif 2442 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2443 } 2444 2445 /*! 2446 See @ref __kmpc_dispatch_init_4 2447 */ 2448 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2449 enum sched_type schedule, kmp_uint64 lb, 2450 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) { 2451 KMP_DEBUG_ASSERT(__kmp_init_serial); 2452 #if OMPT_SUPPORT && OMPT_OPTIONAL 2453 OMPT_STORE_RETURN_ADDRESS(gtid); 2454 #endif 2455 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2456 } 2457 2458 /*! 2459 See @ref __kmpc_dispatch_init_4 2460 2461 These functions differ from the __kmpc_dispatch_init set in that they are 2462 called for the composite distribute parallel for construct. Thus, before 2463 dispatching the regular iterations, the per-team iteration space must be computed. 2464 2465 These functions are all identical apart from the types of the arguments.
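As an illustration only (not a normative lowering), a compiler might drive the
composite construct roughly as follows; dist_last and chunk_last are
hypothetical local flags used just for this sketch:

  __kmpc_dist_dispatch_init_4(loc, gtid, schedule, &dist_last, lb, ub, st, chunk);
  while (__kmpc_dispatch_next_4(loc, gtid, &chunk_last, &lb, &ub, &st)) {
    for (i = lb; i <= ub; i += st) // positive stride assumed
      ; // loop body
  }

The per-team bounds are computed by __kmp_dist_get_bounds before the regular
chunk dispatch is initialized.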
2466 */ 2467 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2468 enum sched_type schedule, kmp_int32 *p_last, 2469 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, 2470 kmp_int32 chunk) { 2471 KMP_DEBUG_ASSERT(__kmp_init_serial); 2472 #if OMPT_SUPPORT && OMPT_OPTIONAL 2473 OMPT_STORE_RETURN_ADDRESS(gtid); 2474 #endif 2475 __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st); 2476 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2477 } 2478 2479 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2480 enum sched_type schedule, kmp_int32 *p_last, 2481 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, 2482 kmp_int32 chunk) { 2483 KMP_DEBUG_ASSERT(__kmp_init_serial); 2484 #if OMPT_SUPPORT && OMPT_OPTIONAL 2485 OMPT_STORE_RETURN_ADDRESS(gtid); 2486 #endif 2487 __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st); 2488 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2489 } 2490 2491 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2492 enum sched_type schedule, kmp_int32 *p_last, 2493 kmp_int64 lb, kmp_int64 ub, kmp_int64 st, 2494 kmp_int64 chunk) { 2495 KMP_DEBUG_ASSERT(__kmp_init_serial); 2496 #if OMPT_SUPPORT && OMPT_OPTIONAL 2497 OMPT_STORE_RETURN_ADDRESS(gtid); 2498 #endif 2499 __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st); 2500 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2501 } 2502 2503 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2504 enum sched_type schedule, kmp_int32 *p_last, 2505 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, 2506 kmp_int64 chunk) { 2507 KMP_DEBUG_ASSERT(__kmp_init_serial); 2508 #if OMPT_SUPPORT && OMPT_OPTIONAL 2509 OMPT_STORE_RETURN_ADDRESS(gtid); 2510 #endif 2511 __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st); 2512 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2513 } 2514 2515 /*! 2516 @param loc Source code location 2517 @param gtid Global thread id 2518 @param p_last Pointer to a flag set to one if this is the last chunk or zero 2519 otherwise 2520 @param p_lb Pointer to the lower bound for the next chunk of work 2521 @param p_ub Pointer to the upper bound for the next chunk of work 2522 @param p_st Pointer to the stride for the next chunk of work 2523 @return one if there is work to be done, zero otherwise 2524 2525 Get the next dynamically allocated chunk of work for this thread. 2526 If there is no more work, then the lb,ub and stride need not be modified. 2527 */ 2528 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2529 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) { 2530 #if OMPT_SUPPORT && OMPT_OPTIONAL 2531 OMPT_STORE_RETURN_ADDRESS(gtid); 2532 #endif 2533 return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st 2534 #if OMPT_SUPPORT && OMPT_OPTIONAL 2535 , 2536 OMPT_LOAD_RETURN_ADDRESS(gtid) 2537 #endif 2538 ); 2539 } 2540 2541 /*! 2542 See @ref __kmpc_dispatch_next_4 2543 */ 2544 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2545 kmp_uint32 *p_lb, kmp_uint32 *p_ub, 2546 kmp_int32 *p_st) { 2547 #if OMPT_SUPPORT && OMPT_OPTIONAL 2548 OMPT_STORE_RETURN_ADDRESS(gtid); 2549 #endif 2550 return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st 2551 #if OMPT_SUPPORT && OMPT_OPTIONAL 2552 , 2553 OMPT_LOAD_RETURN_ADDRESS(gtid) 2554 #endif 2555 ); 2556 } 2557 2558 /*! 
2559 See @ref __kmpc_dispatch_next_4 2560 */ 2561 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2562 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) { 2563 #if OMPT_SUPPORT && OMPT_OPTIONAL 2564 OMPT_STORE_RETURN_ADDRESS(gtid); 2565 #endif 2566 return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st 2567 #if OMPT_SUPPORT && OMPT_OPTIONAL 2568 , 2569 OMPT_LOAD_RETURN_ADDRESS(gtid) 2570 #endif 2571 ); 2572 } 2573 2574 /*! 2575 See @ref __kmpc_dispatch_next_4 2576 */ 2577 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2578 kmp_uint64 *p_lb, kmp_uint64 *p_ub, 2579 kmp_int64 *p_st) { 2580 #if OMPT_SUPPORT && OMPT_OPTIONAL 2581 OMPT_STORE_RETURN_ADDRESS(gtid); 2582 #endif 2583 return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st 2584 #if OMPT_SUPPORT && OMPT_OPTIONAL 2585 , 2586 OMPT_LOAD_RETURN_ADDRESS(gtid) 2587 #endif 2588 ); 2589 } 2590 2591 /*! 2592 @param loc Source code location 2593 @param gtid Global thread id 2594 2595 Mark the end of a dynamic loop. 2596 */ 2597 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) { 2598 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2599 } 2600 2601 /*! 2602 See @ref __kmpc_dispatch_fini_4 2603 */ 2604 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) { 2605 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2606 } 2607 2608 /*! 2609 See @ref __kmpc_dispatch_fini_4 2610 */ 2611 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) { 2612 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2613 } 2614 2615 /*! 2616 See @ref __kmpc_dispatch_fini_4 2617 */ 2618 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) { 2619 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2620 } 2621 /*! @} */ 2622 2623 //----------------------------------------------------------------------------- 2624 // Non-template routines from kmp_dispatch.cpp used in other sources 2625 2626 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) { 2627 return value == checker; 2628 } 2629 2630 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) { 2631 return value != checker; 2632 } 2633 2634 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) { 2635 return value < checker; 2636 } 2637 2638 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) { 2639 return value >= checker; 2640 } 2641 2642 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) { 2643 return value <= checker; 2644 } 2645 2646 kmp_uint32 2647 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker, 2648 kmp_uint32 (*pred)(kmp_uint32, kmp_uint32), 2649 void *obj // Higher-level synchronization object, or NULL. 2650 ) { 2651 // note: we may not belong to a team at this point 2652 volatile kmp_uint32 *spin = spinner; 2653 kmp_uint32 check = checker; 2654 kmp_uint32 spins; 2655 kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred; 2656 kmp_uint32 r; 2657 2658 KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin)); 2659 KMP_INIT_YIELD(spins); 2660 // main wait spin loop 2661 while (!f(r = TCR_4(*spin), check)) { 2662 KMP_FSYNC_SPIN_PREPARE(obj); 2663 /* GEH - remove this since it was accidentally introduced when kmp_wait was 2664 split. 
It causes problems with infinite recursion because of exit lock */ 2665 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 2666 __kmp_abort_thread(); */ 2667 KMP_YIELD_OVERSUB_ELSE_SPIN(spins); 2668 } 2669 KMP_FSYNC_SPIN_ACQUIRED(obj); 2670 return r; 2671 } 2672 2673 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker, 2674 kmp_uint32 (*pred)(void *, kmp_uint32), 2675 void *obj // Higher-level synchronization object, or NULL. 2676 ) { 2677 // note: we may not belong to a team at this point 2678 void *spin = spinner; 2679 kmp_uint32 check = checker; 2680 kmp_uint32 spins; 2681 kmp_uint32 (*f)(void *, kmp_uint32) = pred; 2682 2683 KMP_FSYNC_SPIN_INIT(obj, spin); 2684 KMP_INIT_YIELD(spins); 2685 // main wait spin loop 2686 while (!f(spin, check)) { 2687 KMP_FSYNC_SPIN_PREPARE(obj); 2688 /* if we have waited a bit, or are oversubscribed, yield */ 2689 /* pause is in the following code */ 2690 KMP_YIELD_OVERSUB_ELSE_SPIN(spins); 2691 } 2692 KMP_FSYNC_SPIN_ACQUIRED(obj); 2693 } 2694 2695 } // extern "C" 2696 2697 #ifdef KMP_GOMP_COMPAT 2698 2699 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2700 enum sched_type schedule, kmp_int32 lb, 2701 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk, 2702 int push_ws) { 2703 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, 2704 push_ws); 2705 } 2706 2707 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2708 enum sched_type schedule, kmp_uint32 lb, 2709 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk, 2710 int push_ws) { 2711 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, 2712 push_ws); 2713 } 2714 2715 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2716 enum sched_type schedule, kmp_int64 lb, 2717 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk, 2718 int push_ws) { 2719 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, 2720 push_ws); 2721 } 2722 2723 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2724 enum sched_type schedule, kmp_uint64 lb, 2725 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk, 2726 int push_ws) { 2727 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, 2728 push_ws); 2729 } 2730 2731 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) { 2732 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc); 2733 } 2734 2735 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) { 2736 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc); 2737 } 2738 2739 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) { 2740 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc); 2741 } 2742 2743 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) { 2744 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc); 2745 } 2746 2747 #endif /* KMP_GOMP_COMPAT */ 2748 2749 /* ------------------------------------------------------------------------ */ 2750
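// Usage sketch (illustrative only; simplified, consistency checks and OMPT
// hooks omitted). It shows how compiler-generated code typically drives the
// dispatch entry points defined above for a loop such as
//   #pragma omp for schedule(dynamic, 4)
//   for (int i = 0; i < n; ++i) body(i);
// Here body(), n, lb, ub, st and last are placeholders local to the outlined
// function, and loc is the usual ident_t source-location descriptor.
//
//   kmp_int32 gtid = __kmpc_global_thread_num(loc);
//   kmp_int32 lb = 0, ub = n - 1, st = 1, last = 0;
//   __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, lb, ub, st, 4);
//   while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
//     for (kmp_int32 i = lb; i <= ub; i += st)
//       body(i);
//   }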