1 /* 2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch. 3 */ 4 5 //===----------------------------------------------------------------------===// 6 // 7 // The LLVM Compiler Infrastructure 8 // 9 // This file is dual licensed under the MIT and the University of Illinois Open 10 // Source Licenses. See LICENSE.txt for details. 11 // 12 //===----------------------------------------------------------------------===// 13 14 /* Dynamic scheduling initialization and dispatch. 15 * 16 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however 17 * it may change values between parallel regions. __kmp_max_nth 18 * is the largest value __kmp_nth may take, 1 is the smallest. 19 */ 20 21 // Need to raise Win version from XP to Vista here for support of 22 // InterlockedExchange64 23 #if defined(_WIN32_WINNT) && defined(_M_IX86) 24 #undef _WIN32_WINNT 25 #define _WIN32_WINNT 0x0502 26 #endif 27 28 #include "kmp.h" 29 #include "kmp_error.h" 30 #include "kmp_i18n.h" 31 #include "kmp_itt.h" 32 #include "kmp_stats.h" 33 #include "kmp_str.h" 34 #if KMP_OS_WINDOWS && KMP_ARCH_X86 35 #include <float.h> 36 #endif 37 #include "kmp_lock.h" 38 #include "kmp_dispatch.h" 39 #if KMP_USE_HIER_SCHED 40 #include "kmp_dispatch_hier.h" 41 #endif 42 43 #if OMPT_SUPPORT 44 #include "ompt-specific.h" 45 #endif 46 47 /* ------------------------------------------------------------------------ */ 48 /* ------------------------------------------------------------------------ */ 49 50 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 51 kmp_info_t *th; 52 53 KMP_DEBUG_ASSERT(gtid_ref); 54 55 if (__kmp_env_consistency_check) { 56 th = __kmp_threads[*gtid_ref]; 57 if (th->th.th_root->r.r_active && 58 (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) { 59 #if KMP_USE_DYNAMIC_LOCK 60 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0); 61 #else 62 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL); 63 #endif 64 } 65 } 66 } 67 68 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 69 kmp_info_t *th; 70 71 if (__kmp_env_consistency_check) { 72 th = __kmp_threads[*gtid_ref]; 73 if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) { 74 __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref); 75 } 76 } 77 } 78 79 // Initialize a dispatch_private_info_template<T> buffer for a particular 80 // type of schedule,chunk. The loop description is found in lb (lower bound), 81 // ub (upper bound), and st (stride). nproc is the number of threads relevant 82 // to the scheduling (often the number of threads in a team, but not always if 83 // hierarchical scheduling is used). tid is the id of the thread calling 84 // the function within the group of nproc threads. It will have a value 85 // between 0 and nproc - 1. This is often just the thread id within a team, but 86 // is not necessarily the case when using hierarchical scheduling. 
87 // loc is the source file location of the corresponding loop 88 // gtid is the global thread id 89 template <typename T> 90 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, 91 dispatch_private_info_template<T> *pr, 92 enum sched_type schedule, T lb, T ub, 93 typename traits_t<T>::signed_t st, 94 #if USE_ITT_BUILD 95 kmp_uint64 *cur_chunk, 96 #endif 97 typename traits_t<T>::signed_t chunk, 98 T nproc, T tid) { 99 typedef typename traits_t<T>::unsigned_t UT; 100 typedef typename traits_t<T>::floating_t DBL; 101 102 int active; 103 T tc; 104 kmp_info_t *th; 105 kmp_team_t *team; 106 107 #ifdef KMP_DEBUG 108 typedef typename traits_t<T>::signed_t ST; 109 { 110 char *buff; 111 // create format specifiers before the debug output 112 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called " 113 "pr:%%p lb:%%%s ub:%%%s st:%%%s " 114 "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n", 115 traits_t<T>::spec, traits_t<T>::spec, 116 traits_t<ST>::spec, traits_t<ST>::spec, 117 traits_t<T>::spec, traits_t<T>::spec); 118 KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid)); 119 __kmp_str_free(&buff); 120 } 121 #endif 122 /* setup data */ 123 th = __kmp_threads[gtid]; 124 team = th->th.th_team; 125 active = !team->t.t_serialized; 126 127 #if USE_ITT_BUILD 128 int itt_need_metadata_reporting = __itt_metadata_add_ptr && 129 __kmp_forkjoin_frames_mode == 3 && 130 KMP_MASTER_GTID(gtid) && 131 #if OMP_40_ENABLED 132 th->th.th_teams_microtask == NULL && 133 #endif 134 team->t.t_active_level == 1; 135 #endif 136 #if (KMP_STATIC_STEAL_ENABLED) 137 if (SCHEDULE_HAS_NONMONOTONIC(schedule)) 138 // AC: we now have only one implementation of stealing, so use it 139 schedule = kmp_sch_static_steal; 140 else 141 #endif 142 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 143 144 /* Pick up the nomerge/ordered bits from the scheduling type */ 145 if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) { 146 pr->flags.nomerge = TRUE; 147 schedule = 148 (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); 149 } else { 150 pr->flags.nomerge = FALSE; 151 } 152 pr->type_size = traits_t<T>::type_size; // remember the size of variables 153 if (kmp_ord_lower & schedule) { 154 pr->flags.ordered = TRUE; 155 schedule = 156 (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); 157 } else { 158 pr->flags.ordered = FALSE; 159 } 160 161 if (schedule == kmp_sch_static) { 162 schedule = __kmp_static; 163 } else { 164 if (schedule == kmp_sch_runtime) { 165 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if 166 // not specified) 167 schedule = team->t.t_sched.r_sched_type; 168 // Detail the schedule if needed (global controls are differentiated 169 // appropriately) 170 if (schedule == kmp_sch_guided_chunked) { 171 schedule = __kmp_guided; 172 } else if (schedule == kmp_sch_static) { 173 schedule = __kmp_static; 174 } 175 // Use the chunk size specified by OMP_SCHEDULE (or default if not 176 // specified) 177 chunk = team->t.t_sched.chunk; 178 #if USE_ITT_BUILD 179 if (cur_chunk) 180 *cur_chunk = chunk; 181 #endif 182 #ifdef KMP_DEBUG 183 { 184 char *buff; 185 // create format specifiers before the debug output 186 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: " 187 "schedule:%%d chunk:%%%s\n", 188 traits_t<ST>::spec); 189 KD_TRACE(10, (buff, gtid, schedule, chunk)); 190 __kmp_str_free(&buff); 191 } 192 #endif 193 } else { 194 if (schedule == kmp_sch_guided_chunked) { 195 schedule = __kmp_guided; 196 } 197 if (chunk <= 
0) { 198 chunk = KMP_DEFAULT_CHUNK; 199 } 200 } 201 202 if (schedule == kmp_sch_auto) { 203 // mapping and differentiation: in the __kmp_do_serial_initialize() 204 schedule = __kmp_auto; 205 #ifdef KMP_DEBUG 206 { 207 char *buff; 208 // create format specifiers before the debug output 209 buff = __kmp_str_format( 210 "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: " 211 "schedule:%%d chunk:%%%s\n", 212 traits_t<ST>::spec); 213 KD_TRACE(10, (buff, gtid, schedule, chunk)); 214 __kmp_str_free(&buff); 215 } 216 #endif 217 } 218 219 /* guided analytical not safe for too many threads */ 220 if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) { 221 schedule = kmp_sch_guided_iterative_chunked; 222 KMP_WARNING(DispatchManyThreads); 223 } 224 #if OMP_45_ENABLED 225 if (schedule == kmp_sch_runtime_simd) { 226 // compiler provides simd_width in the chunk parameter 227 schedule = team->t.t_sched.r_sched_type; 228 // Detail the schedule if needed (global controls are differentiated 229 // appropriately) 230 if (schedule == kmp_sch_static || schedule == kmp_sch_auto || 231 schedule == __kmp_static) { 232 schedule = kmp_sch_static_balanced_chunked; 233 } else { 234 if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) { 235 schedule = kmp_sch_guided_simd; 236 } 237 chunk = team->t.t_sched.chunk * chunk; 238 } 239 #if USE_ITT_BUILD 240 if (cur_chunk) 241 *cur_chunk = chunk; 242 #endif 243 #ifdef KMP_DEBUG 244 { 245 char *buff; 246 // create format specifiers before the debug output 247 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d" 248 " chunk:%%%s\n", 249 traits_t<ST>::spec); 250 KD_TRACE(10, (buff, gtid, schedule, chunk)); 251 __kmp_str_free(&buff); 252 } 253 #endif 254 } 255 #endif // OMP_45_ENABLED 256 pr->u.p.parm1 = chunk; 257 } 258 KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper), 259 "unknown scheduling type"); 260 261 pr->u.p.count = 0; 262 263 if (__kmp_env_consistency_check) { 264 if (st == 0) { 265 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, 266 (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc); 267 } 268 } 269 // compute trip count 270 if (st == 1) { // most common case 271 if (ub >= lb) { 272 tc = ub - lb + 1; 273 } else { // ub < lb 274 tc = 0; // zero-trip 275 } 276 } else if (st < 0) { 277 if (lb >= ub) { 278 // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B), 279 // where the division needs to be unsigned regardless of the result type 280 tc = (UT)(lb - ub) / (-st) + 1; 281 } else { // lb < ub 282 tc = 0; // zero-trip 283 } 284 } else { // st > 0 285 if (ub >= lb) { 286 // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B), 287 // where the division needs to be unsigned regardless of the result type 288 tc = (UT)(ub - lb) / st + 1; 289 } else { // ub < lb 290 tc = 0; // zero-trip 291 } 292 } 293 294 pr->u.p.lb = lb; 295 pr->u.p.ub = ub; 296 pr->u.p.st = st; 297 pr->u.p.tc = tc; 298 299 #if KMP_OS_WINDOWS 300 pr->u.p.last_upper = ub + st; 301 #endif /* KMP_OS_WINDOWS */ 302 303 /* NOTE: only the active parallel region(s) has active ordered sections */ 304 305 if (active) { 306 if (pr->flags.ordered) { 307 pr->ordered_bumped = 0; 308 pr->u.p.ordered_lower = 1; 309 pr->u.p.ordered_upper = 0; 310 } 311 } 312 313 switch (schedule) { 314 #if (KMP_STATIC_STEAL_ENABLED) 315 case kmp_sch_static_steal: { 316 T ntc, init; 317 318 KD_TRACE(100, 319 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n", 320 gtid)); 321 322 ntc = (tc % chunk ? 
1 : 0) + tc / chunk; 323 if (nproc > 1 && ntc >= nproc) { 324 KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL); 325 T id = tid; 326 T small_chunk, extras; 327 328 small_chunk = ntc / nproc; 329 extras = ntc % nproc; 330 331 init = id * small_chunk + (id < extras ? id : extras); 332 pr->u.p.count = init; 333 pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0); 334 335 pr->u.p.parm2 = lb; 336 // pr->pfields.parm3 = 0; // it's not used in static_steal 337 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid 338 pr->u.p.st = st; 339 if (traits_t<T>::type_size > 4) { 340 // AC: TODO: check if 16-byte CAS available and use it to 341 // improve performance (probably wait for explicit request 342 // before spending time on this). 343 // For now use dynamically allocated per-thread lock, 344 // free memory in __kmp_dispatch_next when status==0. 345 KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL); 346 th->th.th_dispatch->th_steal_lock = 347 (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t)); 348 __kmp_init_lock(th->th.th_dispatch->th_steal_lock); 349 } 350 break; 351 } else { 352 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 353 "kmp_sch_static_balanced\n", 354 gtid)); 355 schedule = kmp_sch_static_balanced; 356 /* too few iterations: fall-through to kmp_sch_static_balanced */ 357 } // if 358 /* FALL-THROUGH to static balanced */ 359 } // case 360 #endif 361 case kmp_sch_static_balanced: { 362 T init, limit; 363 364 KD_TRACE( 365 100, 366 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n", 367 gtid)); 368 369 if (nproc > 1) { 370 T id = tid; 371 372 if (tc < nproc) { 373 if (id < tc) { 374 init = id; 375 limit = id; 376 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ 377 } else { 378 pr->u.p.count = 1; /* means no more chunks to execute */ 379 pr->u.p.parm1 = FALSE; 380 break; 381 } 382 } else { 383 T small_chunk = tc / nproc; 384 T extras = tc % nproc; 385 init = id * small_chunk + (id < extras ? id : extras); 386 limit = init + small_chunk - (id < extras ? 0 : 1); 387 pr->u.p.parm1 = (id == nproc - 1); 388 } 389 } else { 390 if (tc > 0) { 391 init = 0; 392 limit = tc - 1; 393 pr->u.p.parm1 = TRUE; 394 } else { 395 // zero trip count 396 pr->u.p.count = 1; /* means no more chunks to execute */ 397 pr->u.p.parm1 = FALSE; 398 break; 399 } 400 } 401 #if USE_ITT_BUILD 402 // Calculate chunk for metadata report 403 if (itt_need_metadata_reporting) 404 if (cur_chunk) 405 *cur_chunk = limit - init + 1; 406 #endif 407 if (st == 1) { 408 pr->u.p.lb = lb + init; 409 pr->u.p.ub = lb + limit; 410 } else { 411 // calculated upper bound, "ub" is user-defined upper bound 412 T ub_tmp = lb + limit * st; 413 pr->u.p.lb = lb + init * st; 414 // adjust upper bound to "ub" if needed, so that MS lastprivate will match 415 // it exactly 416 if (st > 0) { 417 pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp); 418 } else { 419 pr->u.p.ub = (ub_tmp + st < ub ? 
ub : ub_tmp); 420 } 421 } 422 if (pr->flags.ordered) { 423 pr->u.p.ordered_lower = init; 424 pr->u.p.ordered_upper = limit; 425 } 426 break; 427 } // case 428 #if OMP_45_ENABLED 429 case kmp_sch_static_balanced_chunked: { 430 // similar to balanced, but chunk adjusted to multiple of simd width 431 T nth = nproc; 432 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)" 433 " -> falling-through to static_greedy\n", 434 gtid)); 435 schedule = kmp_sch_static_greedy; 436 if (nth > 1) 437 pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1); 438 else 439 pr->u.p.parm1 = tc; 440 break; 441 } // case 442 case kmp_sch_guided_simd: 443 #endif // OMP_45_ENABLED 444 case kmp_sch_guided_iterative_chunked: { 445 KD_TRACE( 446 100, 447 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked" 448 " case\n", 449 gtid)); 450 451 if (nproc > 1) { 452 if ((2L * chunk + 1) * nproc >= tc) { 453 /* chunk size too large, switch to dynamic */ 454 schedule = kmp_sch_dynamic_chunked; 455 } else { 456 // when remaining iters become less than parm2 - switch to dynamic 457 pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1); 458 *(double *)&pr->u.p.parm3 = 459 guided_flt_param / nproc; // may occupy parm3 and parm4 460 } 461 } else { 462 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 463 "kmp_sch_static_greedy\n", 464 gtid)); 465 schedule = kmp_sch_static_greedy; 466 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 467 KD_TRACE( 468 100, 469 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n", 470 gtid)); 471 pr->u.p.parm1 = tc; 472 } // if 473 } // case 474 break; 475 case kmp_sch_guided_analytical_chunked: { 476 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d " 477 "kmp_sch_guided_analytical_chunked case\n", 478 gtid)); 479 480 if (nproc > 1) { 481 if ((2L * chunk + 1) * nproc >= tc) { 482 /* chunk size too large, switch to dynamic */ 483 schedule = kmp_sch_dynamic_chunked; 484 } else { 485 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 486 DBL x; 487 488 #if KMP_OS_WINDOWS && KMP_ARCH_X86 489 /* Linux* OS already has 64-bit computation by default for long double, 490 and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On 491 Windows* OS on IA-32 architecture, we need to set precision to 64-bit 492 instead of the default 53-bit. Even though long double doesn't work 493 on Windows* OS on Intel(R) 64, the resulting lack of precision is not 494 expected to impact the correctness of the algorithm, but this has not 495 been mathematically proven. 
*/ 496 // save original FPCW and set precision to 64-bit, as 497 // Windows* OS on IA-32 architecture defaults to 53-bit 498 unsigned int oldFpcw = _control87(0, 0); 499 _control87(_PC_64, _MCW_PC); // 0,0x30000 500 #endif 501 /* value used for comparison in solver for cross-over point */ 502 long double target = ((long double)chunk * 2 + 1) * nproc / tc; 503 504 /* crossover point--chunk indexes equal to or greater than 505 this point switch to dynamic-style scheduling */ 506 UT cross; 507 508 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 509 x = (long double)1.0 - (long double)0.5 / nproc; 510 511 #ifdef KMP_DEBUG 512 { // test natural alignment 513 struct _test_a { 514 char a; 515 union { 516 char b; 517 DBL d; 518 }; 519 } t; 520 ptrdiff_t natural_alignment = 521 (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; 522 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long 523 // long)natural_alignment ); 524 KMP_DEBUG_ASSERT( 525 (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0); 526 } 527 #endif // KMP_DEBUG 528 529 /* save the term in thread private dispatch structure */ 530 *(DBL *)&pr->u.p.parm3 = x; 531 532 /* solve for the crossover point to the nearest integer i for which C_i 533 <= chunk */ 534 { 535 UT left, right, mid; 536 long double p; 537 538 /* estimate initial upper and lower bound */ 539 540 /* doesn't matter what value right is as long as it is positive, but 541 it affects performance of the solver */ 542 right = 229; 543 p = __kmp_pow<UT>(x, right); 544 if (p > target) { 545 do { 546 p *= p; 547 right <<= 1; 548 } while (p > target && right < (1 << 27)); 549 /* lower bound is previous (failed) estimate of upper bound */ 550 left = right >> 1; 551 } else { 552 left = 0; 553 } 554 555 /* bisection root-finding method */ 556 while (left + 1 < right) { 557 mid = (left + right) / 2; 558 if (__kmp_pow<UT>(x, mid) > target) { 559 left = mid; 560 } else { 561 right = mid; 562 } 563 } // while 564 cross = right; 565 } 566 /* assert sanity of computed crossover point */ 567 KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target && 568 __kmp_pow<UT>(x, cross) <= target); 569 570 /* save the crossover point in thread private dispatch structure */ 571 pr->u.p.parm2 = cross; 572 573 // C75803 574 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8)) 575 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3) 576 #else 577 #define GUIDED_ANALYTICAL_WORKAROUND (x) 578 #endif 579 /* dynamic-style scheduling offset */ 580 pr->u.p.count = tc - __kmp_dispatch_guided_remaining( 581 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - 582 cross * chunk; 583 #if KMP_OS_WINDOWS && KMP_ARCH_X86 584 // restore FPCW 585 _control87(oldFpcw, _MCW_PC); 586 #endif 587 } // if 588 } else { 589 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 590 "kmp_sch_static_greedy\n", 591 gtid)); 592 schedule = kmp_sch_static_greedy; 593 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 594 pr->u.p.parm1 = tc; 595 } // if 596 } // case 597 break; 598 case kmp_sch_static_greedy: 599 KD_TRACE( 600 100, 601 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n", 602 gtid)); 603 pr->u.p.parm1 = (nproc > 1) ? 
(tc + nproc - 1) / nproc : tc; 604 break; 605 case kmp_sch_static_chunked: 606 case kmp_sch_dynamic_chunked: 607 if (pr->u.p.parm1 <= 0) { 608 pr->u.p.parm1 = KMP_DEFAULT_CHUNK; 609 } 610 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d " 611 "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", 612 gtid)); 613 break; 614 case kmp_sch_trapezoidal: { 615 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ 616 617 T parm1, parm2, parm3, parm4; 618 KD_TRACE(100, 619 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n", 620 gtid)); 621 622 parm1 = chunk; 623 624 /* F : size of the first cycle */ 625 parm2 = (tc / (2 * nproc)); 626 627 if (parm2 < 1) { 628 parm2 = 1; 629 } 630 631 /* L : size of the last cycle. Make sure the last cycle is not larger 632 than the first cycle. */ 633 if (parm1 < 1) { 634 parm1 = 1; 635 } else if (parm1 > parm2) { 636 parm1 = parm2; 637 } 638 639 /* N : number of cycles */ 640 parm3 = (parm2 + parm1); 641 parm3 = (2 * tc + parm3 - 1) / parm3; 642 643 if (parm3 < 2) { 644 parm3 = 2; 645 } 646 647 /* sigma : decreasing incr of the trapezoid */ 648 parm4 = (parm3 - 1); 649 parm4 = (parm2 - parm1) / parm4; 650 651 // pointless check, because parm4 >= 0 always 652 // if ( parm4 < 0 ) { 653 // parm4 = 0; 654 //} 655 656 pr->u.p.parm1 = parm1; 657 pr->u.p.parm2 = parm2; 658 pr->u.p.parm3 = parm3; 659 pr->u.p.parm4 = parm4; 660 } // case 661 break; 662 663 default: { 664 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 665 KMP_HNT(GetNewerLibrary), // Hint 666 __kmp_msg_null // Variadic argument list terminator 667 ); 668 } break; 669 } // switch 670 pr->schedule = schedule; 671 } 672 673 #if KMP_USE_HIER_SCHED 674 template <typename T> 675 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub, 676 typename traits_t<T>::signed_t st); 677 template <> 678 inline void 679 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb, 680 kmp_int32 ub, kmp_int32 st) { 681 __kmp_dispatch_init_hierarchy<kmp_int32>( 682 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 683 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st); 684 } 685 template <> 686 inline void 687 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb, 688 kmp_uint32 ub, kmp_int32 st) { 689 __kmp_dispatch_init_hierarchy<kmp_uint32>( 690 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 691 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st); 692 } 693 template <> 694 inline void 695 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb, 696 kmp_int64 ub, kmp_int64 st) { 697 __kmp_dispatch_init_hierarchy<kmp_int64>( 698 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 699 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st); 700 } 701 template <> 702 inline void 703 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb, 704 kmp_uint64 ub, kmp_int64 st) { 705 __kmp_dispatch_init_hierarchy<kmp_uint64>( 706 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 707 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st); 708 } 709 710 // free all the hierarchy scheduling memory associated with the team 711 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) { 712 int num_disp_buff = team->t.t_max_nproc > 1 ? 
__kmp_dispatch_num_buffers : 2; 713 for (int i = 0; i < num_disp_buff; ++i) { 714 // type does not matter here so use kmp_int32 715 auto sh = 716 reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>( 717 &team->t.t_disp_buffer[i]); 718 if (sh->hier) { 719 sh->hier->deallocate(); 720 __kmp_free(sh->hier); 721 } 722 } 723 } 724 #endif 725 726 // UT - unsigned flavor of T, ST - signed flavor of T, 727 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8 728 template <typename T> 729 static void 730 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb, 731 T ub, typename traits_t<T>::signed_t st, 732 typename traits_t<T>::signed_t chunk, int push_ws) { 733 typedef typename traits_t<T>::unsigned_t UT; 734 735 int active; 736 kmp_info_t *th; 737 kmp_team_t *team; 738 kmp_uint32 my_buffer_index; 739 dispatch_private_info_template<T> *pr; 740 dispatch_shared_info_template<T> volatile *sh; 741 742 KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) == 743 sizeof(dispatch_private_info)); 744 KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) == 745 sizeof(dispatch_shared_info)); 746 747 if (!TCR_4(__kmp_init_parallel)) 748 __kmp_parallel_initialize(); 749 750 #if INCLUDE_SSC_MARKS 751 SSC_MARK_DISPATCH_INIT(); 752 #endif 753 #ifdef KMP_DEBUG 754 typedef typename traits_t<T>::signed_t ST; 755 { 756 char *buff; 757 // create format specifiers before the debug output 758 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d " 759 "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n", 760 traits_t<ST>::spec, traits_t<T>::spec, 761 traits_t<T>::spec, traits_t<ST>::spec); 762 KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st)); 763 __kmp_str_free(&buff); 764 } 765 #endif 766 /* setup data */ 767 th = __kmp_threads[gtid]; 768 team = th->th.th_team; 769 active = !team->t.t_serialized; 770 th->th.th_ident = loc; 771 772 // Any half-decent optimizer will remove this test when the blocks are empty 773 // since the macros expand to nothing 774 // when statistics are disabled. 775 if (schedule == __kmp_static) { 776 KMP_COUNT_BLOCK(OMP_LOOP_STATIC); 777 } else { 778 KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC); 779 } 780 781 #if KMP_USE_HIER_SCHED 782 // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable 783 // Hierarchical scheduling does not work with ordered, so if ordered is 784 // detected, then revert back to threaded scheduling. 785 bool ordered; 786 enum sched_type my_sched = schedule; 787 my_buffer_index = th->th.th_dispatch->th_disp_index; 788 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 789 &th->th.th_dispatch 790 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 791 my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched); 792 if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper)) 793 my_sched = 794 (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower)); 795 ordered = (kmp_ord_lower & my_sched); 796 if (pr->flags.use_hier) { 797 if (ordered) { 798 KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. 
" 799 "Disabling hierarchical scheduling.\n", 800 gtid)); 801 pr->flags.use_hier = FALSE; 802 } 803 } 804 if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) { 805 // Don't use hierarchical for ordered parallel loops and don't 806 // use the runtime hierarchy if one was specified in the program 807 if (!ordered && !pr->flags.use_hier) 808 __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st); 809 } 810 #endif // KMP_USE_HIER_SCHED 811 812 #if USE_ITT_BUILD 813 kmp_uint64 cur_chunk = chunk; 814 int itt_need_metadata_reporting = __itt_metadata_add_ptr && 815 __kmp_forkjoin_frames_mode == 3 && 816 KMP_MASTER_GTID(gtid) && 817 #if OMP_40_ENABLED 818 th->th.th_teams_microtask == NULL && 819 #endif 820 team->t.t_active_level == 1; 821 #endif 822 if (!active) { 823 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 824 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 825 } else { 826 KMP_DEBUG_ASSERT(th->th.th_dispatch == 827 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 828 829 my_buffer_index = th->th.th_dispatch->th_disp_index++; 830 831 /* What happens when number of threads changes, need to resize buffer? */ 832 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 833 &th->th.th_dispatch 834 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 835 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 836 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 837 KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid, 838 my_buffer_index)); 839 } 840 841 __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st, 842 #if USE_ITT_BUILD 843 &cur_chunk, 844 #endif 845 chunk, (T)th->th.th_team_nproc, 846 (T)th->th.th_info.ds.ds_tid); 847 if (active) { 848 if (pr->flags.ordered == 0) { 849 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error; 850 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error; 851 } else { 852 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>; 853 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>; 854 } 855 } 856 857 if (active) { 858 /* The name of this buffer should be my_buffer_index when it's free to use 859 * it */ 860 861 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d " 862 "sh->buffer_index:%d\n", 863 gtid, my_buffer_index, sh->buffer_index)); 864 __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index, 865 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL)); 866 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and 867 // my_buffer_index are *always* 32-bit integers. 868 KMP_MB(); /* is this necessary? 
  */
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));

    th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
    th->th.th_dispatch->th_dispatch_sh_current =
        CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
#if USE_ITT_BUILD
    if (pr->flags.ordered) {
      __kmp_itt_ordered_init(gtid);
    }
    // Report loop metadata
    if (itt_need_metadata_reporting) {
      // Only report metadata by master of active team at level 1
      kmp_uint64 schedtype = 0;
      switch (schedule) {
      case kmp_sch_static_chunked:
      case kmp_sch_static_balanced: // Chunk is calculated in the switch above
        break;
      case kmp_sch_static_greedy:
        cur_chunk = pr->u.p.parm1;
        break;
      case kmp_sch_dynamic_chunked:
        schedtype = 1;
        break;
      case kmp_sch_guided_iterative_chunked:
      case kmp_sch_guided_analytical_chunked:
#if OMP_45_ENABLED
      case kmp_sch_guided_simd:
#endif
        schedtype = 2;
        break;
      default:
        // Should we put this case under "static"?
        // case kmp_sch_static_steal:
        schedtype = 3;
        break;
      }
      __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
    }
#if KMP_USE_HIER_SCHED
    if (pr->flags.use_hier) {
      pr->u.p.count = 0;
      pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
    }
#endif // KMP_USE_HIER_SCHED
#endif /* USE_ITT_BUILD */
  }

#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
        "lb:%%%s ub:%%%s"
        " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
        " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
        traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
        traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
        traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
        traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
                  pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
                  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
                  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
    __kmp_str_free(&buff);
  }
#endif
#if (KMP_STATIC_STEAL_ENABLED)
  // After a loop with some other schedule kind has executed, there is no
  // guarantee that all threads' parm3 fields contain the same value. Even if
  // they did, a scheme that merely toggles between values such as 0 and 1
  // (rather than using a program-lifetime increment) would still be ambiguous,
  // so a dedicated variable is required; 'static_steal_counter' serves that
  // purpose.
  if (schedule == kmp_sch_static_steal) {
    // Other threads inspect this variable when searching for a victim; once it
    // has been incremented they may steal from this thread.
948 volatile T *p = &pr->u.p.static_steal_counter; 949 *p = *p + 1; 950 } 951 #endif // ( KMP_STATIC_STEAL_ENABLED ) 952 953 #if OMPT_SUPPORT && OMPT_OPTIONAL 954 if (ompt_enabled.ompt_callback_work) { 955 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 956 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 957 ompt_callbacks.ompt_callback(ompt_callback_work)( 958 ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data), 959 &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid)); 960 } 961 #endif 962 KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic); 963 } 964 965 /* For ordered loops, either __kmp_dispatch_finish() should be called after 966 * every iteration, or __kmp_dispatch_finish_chunk() should be called after 967 * every chunk of iterations. If the ordered section(s) were not executed 968 * for this iteration (or every iteration in this chunk), we need to set the 969 * ordered iteration counters so that the next thread can proceed. */ 970 template <typename UT> 971 static void __kmp_dispatch_finish(int gtid, ident_t *loc) { 972 typedef typename traits_t<UT>::signed_t ST; 973 kmp_info_t *th = __kmp_threads[gtid]; 974 975 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid)); 976 if (!th->th.th_team->t.t_serialized) { 977 978 dispatch_private_info_template<UT> *pr = 979 reinterpret_cast<dispatch_private_info_template<UT> *>( 980 th->th.th_dispatch->th_dispatch_pr_current); 981 dispatch_shared_info_template<UT> volatile *sh = 982 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 983 th->th.th_dispatch->th_dispatch_sh_current); 984 KMP_DEBUG_ASSERT(pr); 985 KMP_DEBUG_ASSERT(sh); 986 KMP_DEBUG_ASSERT(th->th.th_dispatch == 987 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 988 989 if (pr->ordered_bumped) { 990 KD_TRACE( 991 1000, 992 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 993 gtid)); 994 pr->ordered_bumped = 0; 995 } else { 996 UT lower = pr->u.p.ordered_lower; 997 998 #ifdef KMP_DEBUG 999 { 1000 char *buff; 1001 // create format specifiers before the debug output 1002 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: " 1003 "ordered_iteration:%%%s lower:%%%s\n", 1004 traits_t<UT>::spec, traits_t<UT>::spec); 1005 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 1006 __kmp_str_free(&buff); 1007 } 1008 #endif 1009 1010 __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower, 1011 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 1012 KMP_MB(); /* is this necessary? 
*/ 1013 #ifdef KMP_DEBUG 1014 { 1015 char *buff; 1016 // create format specifiers before the debug output 1017 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: " 1018 "ordered_iteration:%%%s lower:%%%s\n", 1019 traits_t<UT>::spec, traits_t<UT>::spec); 1020 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 1021 __kmp_str_free(&buff); 1022 } 1023 #endif 1024 1025 test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration); 1026 } // if 1027 } // if 1028 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid)); 1029 } 1030 1031 #ifdef KMP_GOMP_COMPAT 1032 1033 template <typename UT> 1034 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) { 1035 typedef typename traits_t<UT>::signed_t ST; 1036 kmp_info_t *th = __kmp_threads[gtid]; 1037 1038 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid)); 1039 if (!th->th.th_team->t.t_serialized) { 1040 // int cid; 1041 dispatch_private_info_template<UT> *pr = 1042 reinterpret_cast<dispatch_private_info_template<UT> *>( 1043 th->th.th_dispatch->th_dispatch_pr_current); 1044 dispatch_shared_info_template<UT> volatile *sh = 1045 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 1046 th->th.th_dispatch->th_dispatch_sh_current); 1047 KMP_DEBUG_ASSERT(pr); 1048 KMP_DEBUG_ASSERT(sh); 1049 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1050 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1051 1052 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) { 1053 UT lower = pr->u.p.ordered_lower; 1054 UT upper = pr->u.p.ordered_upper; 1055 UT inc = upper - lower + 1; 1056 1057 if (pr->ordered_bumped == inc) { 1058 KD_TRACE( 1059 1000, 1060 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1061 gtid)); 1062 pr->ordered_bumped = 0; 1063 } else { 1064 inc -= pr->ordered_bumped; 1065 1066 #ifdef KMP_DEBUG 1067 { 1068 char *buff; 1069 // create format specifiers before the debug output 1070 buff = __kmp_str_format( 1071 "__kmp_dispatch_finish_chunk: T#%%d before wait: " 1072 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n", 1073 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec); 1074 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper)); 1075 __kmp_str_free(&buff); 1076 } 1077 #endif 1078 1079 __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower, 1080 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 1081 1082 KMP_MB(); /* is this necessary? */ 1083 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting " 1084 "ordered_bumped to zero\n", 1085 gtid)); 1086 pr->ordered_bumped = 0; 1087 //!!!!! TODO check if the inc should be unsigned, or signed??? 
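      // Note: at this point the thread has already waited (above) for the
      // shared ordered_iteration counter to reach this chunk's lower bound;
      // the test_then_add below then publishes the remaining `inc` iterations
      // of the chunk (its size minus those already signalled through
      // ordered_bumped) so that threads waiting on later ordered iterations
      // can proceed.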
1088 #ifdef KMP_DEBUG 1089 { 1090 char *buff; 1091 // create format specifiers before the debug output 1092 buff = __kmp_str_format( 1093 "__kmp_dispatch_finish_chunk: T#%%d after wait: " 1094 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n", 1095 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec, 1096 traits_t<UT>::spec); 1097 KD_TRACE(1000, 1098 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper)); 1099 __kmp_str_free(&buff); 1100 } 1101 #endif 1102 1103 test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc); 1104 } 1105 // } 1106 } 1107 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid)); 1108 } 1109 1110 #endif /* KMP_GOMP_COMPAT */ 1111 1112 template <typename T> 1113 int __kmp_dispatch_next_algorithm(int gtid, 1114 dispatch_private_info_template<T> *pr, 1115 dispatch_shared_info_template<T> volatile *sh, 1116 kmp_int32 *p_last, T *p_lb, T *p_ub, 1117 typename traits_t<T>::signed_t *p_st, T nproc, 1118 T tid) { 1119 typedef typename traits_t<T>::unsigned_t UT; 1120 typedef typename traits_t<T>::signed_t ST; 1121 typedef typename traits_t<T>::floating_t DBL; 1122 int status = 0; 1123 kmp_int32 last = 0; 1124 T start; 1125 ST incr; 1126 UT limit, trip, init; 1127 kmp_info_t *th = __kmp_threads[gtid]; 1128 kmp_team_t *team = th->th.th_team; 1129 1130 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1131 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1132 KMP_DEBUG_ASSERT(pr); 1133 KMP_DEBUG_ASSERT(sh); 1134 KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc); 1135 #ifdef KMP_DEBUG 1136 { 1137 char *buff; 1138 // create format specifiers before the debug output 1139 buff = 1140 __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p " 1141 "sh:%%p nproc:%%%s tid:%%%s\n", 1142 traits_t<T>::spec, traits_t<T>::spec); 1143 KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid)); 1144 __kmp_str_free(&buff); 1145 } 1146 #endif 1147 1148 // zero trip count 1149 if (pr->u.p.tc == 0) { 1150 KD_TRACE(10, 1151 ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is " 1152 "zero status:%d\n", 1153 gtid, status)); 1154 return 0; 1155 } 1156 1157 switch (pr->schedule) { 1158 #if (KMP_STATIC_STEAL_ENABLED) 1159 case kmp_sch_static_steal: { 1160 T chunk = pr->u.p.parm1; 1161 1162 KD_TRACE(100, 1163 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n", 1164 gtid)); 1165 1166 trip = pr->u.p.tc - 1; 1167 1168 if (traits_t<T>::type_size > 4) { 1169 // use lock for 8-byte and CAS for 4-byte induction 1170 // variable. TODO (optional): check and use 16-byte CAS 1171 kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock; 1172 KMP_DEBUG_ASSERT(lck != NULL); 1173 if (pr->u.p.count < (UT)pr->u.p.ub) { 1174 __kmp_acquire_lock(lck, gtid); 1175 // try to get own chunk of iterations 1176 init = (pr->u.p.count)++; 1177 status = (init < (UT)pr->u.p.ub); 1178 __kmp_release_lock(lck, gtid); 1179 } else { 1180 status = 0; // no own chunks 1181 } 1182 if (!status) { // try to steal 1183 kmp_info_t **other_threads = team->t.t_threads; 1184 int while_limit = nproc; // nproc attempts to find a victim 1185 int while_index = 0; 1186 // TODO: algorithm of searching for a victim 1187 // should be cleaned up and measured 1188 while ((!status) && (while_limit != ++while_index)) { 1189 T remaining; 1190 T victimIdx = pr->u.p.parm4; 1191 T oldVictimIdx = victimIdx ? 
              victimIdx - 1 : nproc - 1;
          dispatch_private_info_template<T> *victim =
              reinterpret_cast<dispatch_private_info_template<T> *>(
                  other_threads[victimIdx]
                      ->th.th_dispatch->th_dispatch_pr_current);
          while ((victim == NULL || victim == pr ||
                  (*(volatile T *)&victim->u.p.static_steal_counter !=
                   *(volatile T *)&pr->u.p.static_steal_counter)) &&
                 oldVictimIdx != victimIdx) {
            victimIdx = (victimIdx + 1) % nproc;
            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
                other_threads[victimIdx]
                    ->th.th_dispatch->th_dispatch_pr_current);
          }
          if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
                          *(volatile T *)&pr->u.p.static_steal_counter)) {
            continue; // try once more (nproc attempts in total)
            // no victim is ready yet to participate in stealing
            // because all victims are still in kmp_init_dispatch
          }
          if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
            pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
            continue; // not enough chunks to steal, goto next victim
          }

          lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
          KMP_ASSERT(lck != NULL);
          __kmp_acquire_lock(lck, gtid);
          limit = victim->u.p.ub; // keep initial ub
          if (victim->u.p.count >= limit ||
              (remaining = limit - victim->u.p.count) < 2) {
            __kmp_release_lock(lck, gtid);
            pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
            continue; // not enough chunks to steal
          }
          // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or
          // by 1
          if (remaining > 3) {
            // steal 1/4 of remaining
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
            init = (victim->u.p.ub -= (remaining >> 2));
          } else {
            // steal 1 chunk of 2 or 3 remaining
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
            init = (victim->u.p.ub -= 1);
          }
          __kmp_release_lock(lck, gtid);

          KMP_DEBUG_ASSERT(init + 1 <= limit);
          pr->u.p.parm4 = victimIdx; // remember victim to steal from
          status = 1;
          while_index = 0;
          // now update own count and ub with the stolen range, excluding the
          // chunk at init, which this thread executes right away
          __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
          pr->u.p.count = init + 1;
          pr->u.p.ub = limit;
          __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
        } // while (search for victim)
      } // if (try to find victim and steal)
    } else {
      // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
      typedef union {
        struct {
          UT count;
          T ub;
        } p;
        kmp_int64 b;
      } union_i4;
      // All operations on 'count' or 'ub' must be combined atomically
      // together.
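      // Rationale, as a minimal sketch (variable names below are illustrative
      // only): with a 4-byte T the pair (count, ub) fits in one aligned 8-byte
      // word, so the owner (which advances count from the front) and a thief
      // (which lowers ub from the back) can both update the pair with a single
      // 64-bit CAS instead of taking a lock:
      //
      //   union_i4 snap, next;
      //   snap.b = *(volatile kmp_int64 *)&pr->u.p.count; // read (count, ub)
      //   next = snap;
      //   next.p.count++; // claim the next chunk
      //   // retry the 8-byte CAS of snap.b -> next.b until it succeeds;
      //   // success implies neither field changed in between, so the claimed
      //   // chunk index snap.p.count is still valid.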
1261 { 1262 union_i4 vold, vnew; 1263 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 1264 vnew = vold; 1265 vnew.p.count++; 1266 while (!KMP_COMPARE_AND_STORE_ACQ64( 1267 (volatile kmp_int64 *)&pr->u.p.count, 1268 *VOLATILE_CAST(kmp_int64 *) & vold.b, 1269 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 1270 KMP_CPU_PAUSE(); 1271 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 1272 vnew = vold; 1273 vnew.p.count++; 1274 } 1275 vnew = vold; 1276 init = vnew.p.count; 1277 status = (init < (UT)vnew.p.ub); 1278 } 1279 1280 if (!status) { 1281 kmp_info_t **other_threads = team->t.t_threads; 1282 int while_limit = nproc; // nproc attempts to find a victim 1283 int while_index = 0; 1284 1285 // TODO: algorithm of searching for a victim 1286 // should be cleaned up and measured 1287 while ((!status) && (while_limit != ++while_index)) { 1288 union_i4 vold, vnew; 1289 kmp_int32 remaining; 1290 T victimIdx = pr->u.p.parm4; 1291 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1; 1292 dispatch_private_info_template<T> *victim = 1293 reinterpret_cast<dispatch_private_info_template<T> *>( 1294 other_threads[victimIdx] 1295 ->th.th_dispatch->th_dispatch_pr_current); 1296 while ((victim == NULL || victim == pr || 1297 (*(volatile T *)&victim->u.p.static_steal_counter != 1298 *(volatile T *)&pr->u.p.static_steal_counter)) && 1299 oldVictimIdx != victimIdx) { 1300 victimIdx = (victimIdx + 1) % nproc; 1301 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1302 other_threads[victimIdx] 1303 ->th.th_dispatch->th_dispatch_pr_current); 1304 } 1305 if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter != 1306 *(volatile T *)&pr->u.p.static_steal_counter)) { 1307 continue; // try once more (nproc attempts in total) 1308 // no victim is ready yet to participate in stealing 1309 // because all victims are still in kmp_init_dispatch 1310 } 1311 pr->u.p.parm4 = victimIdx; // new victim found 1312 while (1) { // CAS loop if victim has enough chunks to steal 1313 vold.b = *(volatile kmp_int64 *)(&victim->u.p.count); 1314 vnew = vold; 1315 1316 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1317 if (vnew.p.count >= (UT)vnew.p.ub || 1318 (remaining = vnew.p.ub - vnew.p.count) < 2) { 1319 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id 1320 break; // not enough chunks to steal, goto next victim 1321 } 1322 if (remaining > 3) { 1323 vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining 1324 } else { 1325 vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining 1326 } 1327 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1328 // TODO: Should this be acquire or release? 
            if (KMP_COMPARE_AND_STORE_ACQ64(
                    (volatile kmp_int64 *)&victim->u.p.count,
                    *VOLATILE_CAST(kmp_int64 *) & vold.b,
                    *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
              // stealing succeeded
              KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
                                        vold.p.ub - vnew.p.ub);
              status = 1;
              while_index = 0;
              // now update own count and ub
              init = vnew.p.ub;
              vold.p.count = init + 1;
#if KMP_ARCH_X86
              KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
#else
              *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
#endif
              break;
            } // if (check CAS result)
            KMP_CPU_PAUSE(); // CAS failed, repeat attempt
          } // while (try to steal from particular victim)
        } // while (search for victim)
      } // if (try to find victim and steal)
    } // if (4-byte induction variable)
    if (!status) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.parm2;
      init *= chunk;
      limit = chunk + init - 1;
      incr = pr->u.p.st;
      KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);

      KMP_DEBUG_ASSERT(init <= trip);
      if ((last = (limit >= trip)) != 0)
        limit = trip;
      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
    break;
  } // case
#endif // ( KMP_STATIC_STEAL_ENABLED )
  case kmp_sch_static_balanced: {
    KD_TRACE(
        10,
        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));
    /* check if thread has any iteration to do */
    if ((status = !pr->u.p.count) != 0) {
      pr->u.p.count = 1;
      *p_lb = pr->u.p.lb;
      *p_ub = pr->u.p.ub;
      last = pr->u.p.parm1;
      if (p_st != NULL)
        *p_st = pr->u.p.st;
    } else { /* no iterations to do */
      pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
    }
  } // case
  break;
  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
                                 merged here */
  case kmp_sch_static_chunked: {
    T parm1;

    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_static_[affinity|chunked] case\n",
                   gtid));
    parm1 = pr->u.p.parm1;

    trip = pr->u.p.tc - 1;
    init = parm1 * (pr->u.p.count + tid);

    if ((status = (init <= trip)) != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      limit = parm1 + init - 1;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      pr->u.p.count += nproc;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
  } // case
  break;

  case kmp_sch_dynamic_chunked: {
    T chunk = pr->u.p.parm1;

    KD_TRACE(
        100,
        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
         gtid));

    init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
    trip = pr->u.p.tc - 1;

    if ((status = (init <= trip)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.lb;
      limit = chunk + init - 1;
      incr = pr->u.p.st;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
  } // case
  break;

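  // Illustration of the guided (iterative) case that follows, using made-up
  // numbers: per the comments in __kmp_dispatch_init_algorithm, parm2 holds
  // K*nproc*(chunk+1) and parm3 holds 1/(K*nproc) as a double, with K = 2 by
  // default. For nproc = 4, chunk = 1 and a trip count of 10000 this gives
  // parm2 = 16 and parm3 = 0.125, so the first successful grab claims about
  //   10000 * 0.125 = 1250 iterations,
  // each later grab claims ~1/8 of whatever then remains, and once fewer than
  // parm2 = 16 iterations are left the code falls back to plain dynamic
  // chunks of size chunkspec (= parm1).
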
  case kmp_sch_guided_iterative_chunked: {
    T chunkspec = pr->u.p.parm1;
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
                   "iterative case\n",
                   gtid));
    trip = pr->u.p.tc;
    // Start atomic part of calculations
    while (1) {
      ST remaining; // signed, because can be < 0
      init = sh->u.s.iteration; // shared value
      remaining = trip - init;
      if (remaining <= 0) { // AC: need to compare with 0 first
        // nothing to do, don't try atomic op
        status = 0;
        break;
      }
      if ((T)remaining <
          pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
        // use dynamic-style schedule
        // atomically increment iterations, get old value
        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                 (ST)chunkspec);
        remaining = trip - init;
        if (remaining <= 0) {
          status = 0; // all iterations got by other threads
        } else {
          // got some iterations to work on
          status = 1;
          if ((T)remaining > chunkspec) {
            limit = init + chunkspec - 1;
          } else {
            last = 1; // the last chunk
            limit = init + remaining - 1;
          } // if
        } // if
        break;
      } // if
      limit = init +
              (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                               (ST)init, (ST)limit)) {
        // CAS was successful, chunk obtained
        status = 1;
        --limit;
        break;
      } // if
    } // while
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } // if
  } // case
  break;

#if OMP_45_ENABLED
  case kmp_sch_guided_simd: {
    // same as iterative but the current chunk is adjusted to be a multiple of
    // the given chunk
    T chunk = pr->u.p.parm1;
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
              gtid));
    trip = pr->u.p.tc;
    // Start atomic part of calculations
    while (1) {
      ST remaining; // signed, because can be < 0
      init = sh->u.s.iteration; // shared value
      remaining = trip - init;
      if (remaining <= 0) { // AC: need to compare with 0 first
        status = 0; // nothing to do, don't try atomic op
        break;
      }
      KMP_DEBUG_ASSERT(init % chunk == 0);
      // compare with K*nproc*(chunk+1), K=2 by default
      if ((T)remaining < pr->u.p.parm2) {
        // use dynamic-style schedule
        // atomically increment iterations, get old value
        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                 (ST)chunk);
        remaining = trip - init;
        if (remaining <= 0) {
          status = 0; // all iterations got by other threads
} else { 1586 // got some iterations to work on 1587 status = 1; 1588 if ((T)remaining > chunk) { 1589 limit = init + chunk - 1; 1590 } else { 1591 last = 1; // the last chunk 1592 limit = init + remaining - 1; 1593 } // if 1594 } // if 1595 break; 1596 } // if 1597 // divide by K*nproc 1598 UT span = remaining * (*(double *)&pr->u.p.parm3); 1599 UT rem = span % chunk; 1600 if (rem) // adjust so that span%chunk == 0 1601 span += chunk - rem; 1602 limit = init + span; 1603 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1604 (ST)init, (ST)limit)) { 1605 // CAS was successful, chunk obtained 1606 status = 1; 1607 --limit; 1608 break; 1609 } // if 1610 } // while 1611 if (status != 0) { 1612 start = pr->u.p.lb; 1613 incr = pr->u.p.st; 1614 if (p_st != NULL) 1615 *p_st = incr; 1616 *p_lb = start + init * incr; 1617 *p_ub = start + limit * incr; 1618 if (pr->flags.ordered) { 1619 pr->u.p.ordered_lower = init; 1620 pr->u.p.ordered_upper = limit; 1621 } // if 1622 } else { 1623 *p_lb = 0; 1624 *p_ub = 0; 1625 if (p_st != NULL) 1626 *p_st = 0; 1627 } // if 1628 } // case 1629 break; 1630 #endif // OMP_45_ENABLED 1631 1632 case kmp_sch_guided_analytical_chunked: { 1633 T chunkspec = pr->u.p.parm1; 1634 UT chunkIdx; 1635 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1636 /* for storing original FPCW value for Windows* OS on 1637 IA-32 architecture 8-byte version */ 1638 unsigned int oldFpcw; 1639 unsigned int fpcwSet = 0; 1640 #endif 1641 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d " 1642 "kmp_sch_guided_analytical_chunked case\n", 1643 gtid)); 1644 1645 trip = pr->u.p.tc; 1646 1647 KMP_DEBUG_ASSERT(nproc > 1); 1648 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip); 1649 1650 while (1) { /* this while loop is a safeguard against unexpected zero 1651 chunk sizes */ 1652 chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 1653 if (chunkIdx >= (UT)pr->u.p.parm2) { 1654 --trip; 1655 /* use dynamic-style scheduling */ 1656 init = chunkIdx * chunkspec + pr->u.p.count; 1657 /* need to verify init > 0 in case of overflow in the above 1658 * calculation */ 1659 if ((status = (init > 0 && init <= trip)) != 0) { 1660 limit = init + chunkspec - 1; 1661 1662 if ((last = (limit >= trip)) != 0) 1663 limit = trip; 1664 } 1665 break; 1666 } else { 1667 /* use exponential-style scheduling */ 1668 /* The following check is to workaround the lack of long double precision on 1669 Windows* OS. 1670 This check works around the possible effect that init != 0 for chunkIdx == 0. 
1671 */ 1672 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1673 /* If we haven't already done so, save original 1674 FPCW and set precision to 64-bit, as Windows* OS 1675 on IA-32 architecture defaults to 53-bit */ 1676 if (!fpcwSet) { 1677 oldFpcw = _control87(0, 0); 1678 _control87(_PC_64, _MCW_PC); 1679 fpcwSet = 0x30000; 1680 } 1681 #endif 1682 if (chunkIdx) { 1683 init = __kmp_dispatch_guided_remaining<T>( 1684 trip, *(DBL *)&pr->u.p.parm3, chunkIdx); 1685 KMP_DEBUG_ASSERT(init); 1686 init = trip - init; 1687 } else 1688 init = 0; 1689 limit = trip - __kmp_dispatch_guided_remaining<T>( 1690 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1); 1691 KMP_ASSERT(init <= limit); 1692 if (init < limit) { 1693 KMP_DEBUG_ASSERT(limit <= trip); 1694 --limit; 1695 status = 1; 1696 break; 1697 } // if 1698 } // if 1699 } // while (1) 1700 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1701 /* restore FPCW if necessary 1702 AC: check fpcwSet flag first because oldFpcw can be uninitialized here 1703 */ 1704 if (fpcwSet && (oldFpcw & fpcwSet)) 1705 _control87(oldFpcw, _MCW_PC); 1706 #endif 1707 if (status != 0) { 1708 start = pr->u.p.lb; 1709 incr = pr->u.p.st; 1710 if (p_st != NULL) 1711 *p_st = incr; 1712 *p_lb = start + init * incr; 1713 *p_ub = start + limit * incr; 1714 if (pr->flags.ordered) { 1715 pr->u.p.ordered_lower = init; 1716 pr->u.p.ordered_upper = limit; 1717 } 1718 } else { 1719 *p_lb = 0; 1720 *p_ub = 0; 1721 if (p_st != NULL) 1722 *p_st = 0; 1723 } 1724 } // case 1725 break; 1726 1727 case kmp_sch_trapezoidal: { 1728 UT index; 1729 T parm2 = pr->u.p.parm2; 1730 T parm3 = pr->u.p.parm3; 1731 T parm4 = pr->u.p.parm4; 1732 KD_TRACE(100, 1733 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n", 1734 gtid)); 1735 1736 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration); 1737 1738 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2; 1739 trip = pr->u.p.tc - 1; 1740 1741 if ((status = ((T)index < parm3 && init <= trip)) == 0) { 1742 *p_lb = 0; 1743 *p_ub = 0; 1744 if (p_st != NULL) 1745 *p_st = 0; 1746 } else { 1747 start = pr->u.p.lb; 1748 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1; 1749 incr = pr->u.p.st; 1750 1751 if ((last = (limit >= trip)) != 0) 1752 limit = trip; 1753 1754 if (p_st != NULL) 1755 *p_st = incr; 1756 1757 if (incr == 1) { 1758 *p_lb = start + init; 1759 *p_ub = start + limit; 1760 } else { 1761 *p_lb = start + init * incr; 1762 *p_ub = start + limit * incr; 1763 } 1764 1765 if (pr->flags.ordered) { 1766 pr->u.p.ordered_lower = init; 1767 pr->u.p.ordered_upper = limit; 1768 } // if 1769 } // if 1770 } // case 1771 break; 1772 default: { 1773 status = 0; // to avoid complaints on uninitialized variable use 1774 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 1775 KMP_HNT(GetNewerLibrary), // Hint 1776 __kmp_msg_null // Variadic argument list terminator 1777 ); 1778 } break; 1779 } // switch 1780 if (p_last) 1781 *p_last = last; 1782 #ifdef KMP_DEBUG 1783 if (pr->flags.ordered) { 1784 char *buff; 1785 // create format specifiers before the debug output 1786 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d " 1787 "ordered_lower:%%%s ordered_upper:%%%s\n", 1788 traits_t<UT>::spec, traits_t<UT>::spec); 1789 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper)); 1790 __kmp_str_free(&buff); 1791 } 1792 { 1793 char *buff; 1794 // create format specifiers before the debug output 1795 buff = __kmp_str_format( 1796 "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d " 1797 "p_lb:%%%s p_ub:%%%s 

  case kmp_sch_trapezoidal: {
    UT index;
    T parm2 = pr->u.p.parm2;
    T parm3 = pr->u.p.parm3;
    T parm4 = pr->u.p.parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
              gtid));

    index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);

    init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
    trip = pr->u.p.tc - 1;

    if ((status = ((T)index < parm3 && init <= trip)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.lb;
      limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
      incr = pr->u.p.st;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
  } // case
  break;
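
  // Worked example for the trapezoidal formulas above (illustrative; parm2 is
  // the first chunk size, parm3 the number of chunks and parm4 the per-chunk
  // decrement, as prepared by __kmp_dispatch_init_algorithm). Assuming
  // parm2 = 10 and parm4 = 2:
  //   index 0: init = 0,  limit = (1 * 20) / 2 - 1 = 9   -> 10 iterations
  //   index 1: init = 10, limit = (2 * 18) / 2 - 1 = 17  ->  8 iterations
  //   index 2: init = 18, limit = (3 * 16) / 2 - 1 = 23  ->  6 iterations
  // i.e. consecutive chunks shrink linearly by parm4 until parm3 chunks have
  // been handed out.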
  default: {
    status = 0; // to avoid complaints on uninitialized variable use
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                KMP_HNT(GetNewerLibrary), // Hint
                __kmp_msg_null // Variadic argument list terminator
                );
  } break;
  } // switch
  if (p_last)
    *p_last = last;
#ifdef KMP_DEBUG
  if (pr->flags.ordered) {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
                            "ordered_lower:%%%s ordered_upper:%%%s\n",
                            traits_t<UT>::spec, traits_t<UT>::spec);
    KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
    __kmp_str_free(&buff);
  }
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
        "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
    __kmp_str_free(&buff);
  }
#endif
  return status;
}

/* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
   work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
   is not called. */
#if OMPT_SUPPORT && OMPT_OPTIONAL
#define OMPT_LOOP_END                                                          \
  if (status == 0) {                                                           \
    if (ompt_enabled.ompt_callback_work) {                                     \
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
      ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
          ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
          &(task_info->task_data), 0, codeptr);                                \
    }                                                                          \
  }
// TODO: implement count
#else
#define OMPT_LOOP_END // no-op
#endif

#if KMP_STATS_ENABLED
#define KMP_STATS_LOOP_END                                                     \
  {                                                                            \
    kmp_int64 u, l, t, i;                                                      \
    l = (kmp_int64)(*p_lb);                                                    \
    u = (kmp_int64)(*p_ub);                                                    \
    i = (kmp_int64)(pr->u.p.st);                                               \
    if (status == 0) {                                                         \
      t = 0;                                                                   \
      KMP_POP_PARTITIONED_TIMER();                                             \
    } else if (i == 1) {                                                       \
      if (u >= l)                                                              \
        t = u - l + 1;                                                         \
      else                                                                     \
        t = 0;                                                                 \
    } else if (i < 0) {                                                        \
      if (l >= u)                                                              \
        t = (l - u) / (-i) + 1;                                                \
      else                                                                     \
        t = 0;                                                                 \
    } else {                                                                   \
      if (u >= l)                                                              \
        t = (u - l) / i + 1;                                                   \
      else                                                                     \
        t = 0;                                                                 \
    }                                                                          \
    KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
  }
#else
#define KMP_STATS_LOOP_END /* Nothing */
#endif
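
// Illustrative arithmetic for the trip count t computed above (not part of the
// macro): for a chunk [l, u] with stride i, t = (u - l) / i + 1 when i > 0 and
// t = (l - u) / (-i) + 1 when i < 0. For example, l = 0, u = 99, i = 4 gives
// t = 99 / 4 + 1 = 25 (iterations 0, 4, ..., 96), and l = 10, u = 1, i = -3
// gives t = 9 / 3 + 1 = 4 (iterations 10, 7, 4, 1).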

template <typename T>
static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
                               T *p_lb, T *p_ub,
                               typename traits_t<T>::signed_t *p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                               ,
                               void *codeptr
#endif
                               ) {

  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  // This is potentially slightly misleading, schedule(runtime) will appear
  // here even if the actual runtime schedule is static. (Which points out a
  // disadvantage of schedule(runtime): even when static scheduling is used it
  // costs more than a compile time choice to use static scheduling would.)
  KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);

  int status;
  dispatch_private_info_template<T> *pr;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
  KD_TRACE(
      1000,
      ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
       gtid, p_lb, p_ub, p_st, p_last));

  if (team->t.t_serialized) {
    /* NOTE: serialize this dispatch because we are not at the active level */
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
    KMP_DEBUG_ASSERT(pr);

    if ((status = (pr->u.p.tc != 0)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      // if ( p_last != NULL )
      //   *p_last = 0;
      if (p_st != NULL)
        *p_st = 0;
      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        }
      }
    } else if (pr->flags.nomerge) {
      kmp_int32 last;
      T start;
      UT limit, trip, init;
      ST incr;
      T chunk = pr->u.p.parm1;

      KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
                     gtid));

      init = chunk * pr->u.p.count++;
      trip = pr->u.p.tc - 1;

      if ((status = (init <= trip)) == 0) {
        *p_lb = 0;
        *p_ub = 0;
        // if ( p_last != NULL )
        //   *p_last = 0;
        if (p_st != NULL)
          *p_st = 0;
        if (__kmp_env_consistency_check) {
          if (pr->pushed_ws != ct_none) {
            pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
          }
        }
      } else {
        start = pr->u.p.lb;
        limit = chunk + init - 1;
        incr = pr->u.p.st;

        if ((last = (limit >= trip)) != 0) {
          limit = trip;
#if KMP_OS_WINDOWS
          pr->u.p.last_upper = pr->u.p.ub;
#endif /* KMP_OS_WINDOWS */
        }
        if (p_last != NULL)
          *p_last = last;
        if (p_st != NULL)
          *p_st = incr;
        if (incr == 1) {
          *p_lb = start + init;
          *p_ub = start + limit;
        } else {
          *p_lb = start + init * incr;
          *p_ub = start + limit * incr;
        }

        if (pr->flags.ordered) {
          pr->u.p.ordered_lower = init;
          pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
          {
            char *buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
                                    "ordered_lower:%%%s ordered_upper:%%%s\n",
                                    traits_t<UT>::spec, traits_t<UT>::spec);
            KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
                            pr->u.p.ordered_upper));
            __kmp_str_free(&buff);
          }
#endif
        } // if
      } // if
    } else {
      pr->u.p.tc = 0;
      *p_lb = pr->u.p.lb;
      *p_ub = pr->u.p.ub;
#if KMP_OS_WINDOWS
      pr->u.p.last_upper = *p_ub;
#endif /* KMP_OS_WINDOWS */
      if (p_last != NULL)
        *p_last = TRUE;
      if (p_st != NULL)
        *p_st = pr->u.p.st;
    } // if
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format(
          "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
          "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
          traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
      KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
      __kmp_str_free(&buff);
    }
#endif
#if INCLUDE_SSC_MARKS
    SSC_MARK_DISPATCH_NEXT();
#endif
    OMPT_LOOP_END;
    KMP_STATS_LOOP_END;
    return status;
  } else {
    kmp_int32 last = 0;
    dispatch_shared_info_template<T> volatile *sh;

    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    KMP_DEBUG_ASSERT(pr);
    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
        th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(sh);

#if KMP_USE_HIER_SCHED
    if (pr->flags.use_hier)
      status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
    else
#endif // KMP_USE_HIER_SCHED
      status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
                                                p_st, th->th.th_team_nproc,
                                                th->th.th_info.ds.ds_tid);
    // status == 0: no more iterations to execute
    if (status == 0) {
      UT num_done;

      num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
            traits_t<UT>::spec);
        KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
        __kmp_str_free(&buff);
      }
#endif

#if KMP_USE_HIER_SCHED
      pr->flags.use_hier = FALSE;
#endif
      if ((ST)num_done == th->th.th_team_nproc - 1) {
#if (KMP_STATIC_STEAL_ENABLED)
        if (pr->schedule == kmp_sch_static_steal &&
            traits_t<T>::type_size > 4) {
          int i;
          kmp_info_t **other_threads = team->t.t_threads;
          // loop complete, safe to destroy locks used for stealing
          for (i = 0; i < th->th.th_team_nproc; ++i) {
            kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
            KMP_ASSERT(lck != NULL);
            __kmp_destroy_lock(lck);
            __kmp_free(lck);
            other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
          }
        }
#endif
        /* NOTE: release this buffer to be reused */

        KMP_MB(); /* Flush all pending memory write invalidates. */

        sh->u.s.num_done = 0;
        sh->u.s.iteration = 0;

        /* TODO replace with general release procedure? */
        if (pr->flags.ordered) {
          sh->u.s.ordered_iteration = 0;
        }

        KMP_MB(); /* Flush all pending memory write invalidates. */

        sh->buffer_index += __kmp_dispatch_num_buffers;
        KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
                       gtid, sh->buffer_index));

        KMP_MB(); /* Flush all pending memory write invalidates. */
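
        // Note (illustrative): dispatch buffers are used round-robin. Each
        // worksharing loop instance picks one of __kmp_dispatch_num_buffers
        // shared buffers, and the last thread to drain a loop bumps that
        // buffer's buffer_index by __kmp_dispatch_num_buffers so the buffer
        // can be recycled. Assuming, say, 7 buffers, the buffer used for loop
        // instance k is reused for instances k + 7, k + 14, ... once every
        // thread has finished instance k.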

      } // if
      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        }
      }

      th->th.th_dispatch->th_deo_fcn = NULL;
      th->th.th_dispatch->th_dxo_fcn = NULL;
      th->th.th_dispatch->th_dispatch_sh_current = NULL;
      th->th.th_dispatch->th_dispatch_pr_current = NULL;
    } // if (status == 0)
#if KMP_OS_WINDOWS
    else if (last) {
      pr->u.p.last_upper = pr->u.p.ub;
    }
#endif /* KMP_OS_WINDOWS */
    if (p_last != NULL && status != 0)
      *p_last = last;
  } // if

#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_next: T#%%d normal case: "
        "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
                  (p_last ? *p_last : 0), status));
    __kmp_str_free(&buff);
  }
#endif
#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_NEXT();
#endif
  OMPT_LOOP_END;
  KMP_STATS_LOOP_END;
  return status;
}

template <typename T>
static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
                                  kmp_int32 *plastiter, T *plower, T *pupper,
                                  typename traits_t<T>::signed_t incr) {
  typedef typename traits_t<T>::unsigned_t UT;
  kmp_uint32 team_id;
  kmp_uint32 nteams;
  UT trip_count;
  kmp_team_t *team;
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
                            "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<T>::spec);
    KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
    __kmp_str_free(&buff);
  }
#endif

  if (__kmp_env_consistency_check) {
    if (incr == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
                            loc);
    }
    if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
      // The loop is illegal.
      // Some zero-trip loops maintained by compiler, e.g.:
      //   for(i=10;i<0;++i) // lower >= upper - run-time check
      //   for(i=0;i>10;--i) // lower <= upper - run-time check
      //   for(i=0;i>10;++i) // incr > 0 - compile-time check
      //   for(i=10;i<0;--i) // incr < 0 - compile-time check
      // Compiler does not check the following illegal loops:
      //   for(i=0;i<10;i+=incr) // where incr<0
      //   for(i=10;i>0;i-=incr) // where incr<0
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
    }
  }
  th = __kmp_threads[gtid];
  team = th->th.th_team;
#if OMP_40_ENABLED
  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
  nteams = th->th.th_teams_size.nteams;
#endif
  team_id = team->t.t_master_tid;
  KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);

  // compute global trip count
  if (incr == 1) {
    trip_count = *pupper - *plower + 1;
  } else if (incr == -1) {
    trip_count = *plower - *pupper + 1;
  } else if (incr > 0) {
    // upper-lower can exceed the limit of signed type
    trip_count = (UT)(*pupper - *plower) / incr + 1;
  } else {
    trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
  }

  if (trip_count <= nteams) {
    KMP_DEBUG_ASSERT(
        __kmp_static == kmp_sch_static_greedy ||
        __kmp_static ==
            kmp_sch_static_balanced); // Unknown static scheduling type.
    // only some teams get single iteration, others get nothing
    if (team_id < trip_count) {
      *pupper = *plower = *plower + team_id * incr;
    } else {
      *plower = *pupper + incr; // zero-trip loop
    }
    if (plastiter != NULL)
      *plastiter = (team_id == trip_count - 1);
  } else {
    if (__kmp_static == kmp_sch_static_balanced) {
      UT chunk = trip_count / nteams;
      UT extras = trip_count % nteams;
      *plower +=
          incr * (team_id * chunk + (team_id < extras ? team_id : extras));
      *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
      if (plastiter != NULL)
        *plastiter = (team_id == nteams - 1);
    } else {
      T chunk_inc_count =
          (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
      T upper = *pupper;
      KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
      // Unknown static scheduling type.
      *plower += team_id * chunk_inc_count;
      *pupper = *plower + chunk_inc_count - incr;
      // Check/correct bounds if needed
      if (incr > 0) {
        if (*pupper < *plower)
          *pupper = traits_t<T>::max_value;
        if (plastiter != NULL)
          *plastiter = *plower <= upper && *pupper > upper - incr;
        if (*pupper > upper)
          *pupper = upper; // tracker C73258
      } else {
        if (*pupper > *plower)
          *pupper = traits_t<T>::min_value;
        if (plastiter != NULL)
          *plastiter = *plower >= upper && *pupper < upper - incr;
        if (*pupper < upper)
          *pupper = upper; // tracker C73258
      }
    }
  }
}
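
// Worked example for the balanced split above (illustrative, trip_count >
// nteams case): with lb = 0, ub = 9, incr = 1 (trip_count = 10) and
// nteams = 4, chunk = 2 and extras = 2, so teams 0 and 1 receive 3 iterations
// and teams 2 and 3 receive 2:
//   team 0 -> [0, 2],  team 1 -> [3, 5],  team 2 -> [6, 7],  team 3 -> [8, 9]
// and only team nteams-1 = 3 reports the last iteration.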

//-----------------------------------------------------------------------------
// Dispatch routines
// Transfer call to template< type T >
// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
//                      T lb, T ub, ST st, ST chunk )
extern "C" {

/*!
@ingroup WORK_SHARING
@{
@param loc Source location
@param gtid Global thread id
@param schedule Schedule type
@param lb Lower bound
@param ub Upper bound
@param st Step (or increment if you prefer)
@param chunk The chunk size to block with

This function prepares the runtime to start a dynamically scheduled for loop,
saving the loop arguments.
These functions are all identical apart from the types of the arguments.
*/
void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int32 lb,
                            kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint32 lb,
                             kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int64 lb,
                            kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint64 lb,
                             kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4

These functions differ from the __kmpc_dispatch_init set in that they are
called for the composite distribute parallel for construct. Thus, before
dispatching the regular iterations, the per-team iteration space has to be
computed.

These functions are all identical apart from the types of the arguments.
*/
void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
                                 kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
                                  kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
                                 kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
                                  kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
@param loc Source code location
@param gtid Global thread id
@param p_last Pointer to a flag set to one if this is the last chunk or zero
otherwise
@param p_lb Pointer to the lower bound for the next chunk of work
@param p_ub Pointer to the upper bound for the next chunk of work
@param p_st Pointer to the stride for the next chunk of work
@return one if there is work to be done, zero otherwise

Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then the lb, ub and stride need not be modified.
*/
int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                        ,
                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
                                        );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint32 *p_lb, kmp_uint32 *p_ub,
                            kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                         ,
                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
                                         );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                        ,
                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
                                        );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint64 *p_lb, kmp_uint64 *p_ub,
                            kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                         ,
                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
                                         );
}
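
/* A minimal usage sketch of this interface, roughly what a compiler emits for
   "#pragma omp for schedule(dynamic, 4)" over i = 0..99 (illustrative only;
   exact codegen differs by compiler, and loc/gtid come from the surrounding
   outlined function):

     kmp_int32 lb = 0, ub = 99, st = 1, last = 0;
     __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, lb, ub, st, 4);
     while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
       for (kmp_int32 i = lb; i <= ub; i += st) {
         // loop body
       }
     }

   For ordered loops the compiler also emits a matching __kmpc_dispatch_fini_4
   call (see the fini routines below). */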

/*!
@param loc Source code location
@param gtid Global thread id

Mark the end of a dynamic loop.
*/
void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}
/*! @} */

//-----------------------------------------------------------------------------
// Non-template routines from kmp_dispatch.cpp used in other sources

kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value == checker;
}

kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value != checker;
}

kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
  return value < checker;
}

kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
  return value >= checker;
}

kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
  return value <= checker;
}

kmp_uint32
__kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
                   kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
                   void *obj // Higher-level synchronization object, or NULL.
                   ) {
  // note: we may not belong to a team at this point
  volatile kmp_uint32 *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
  kmp_uint32 r;

  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(r = TCR_4(*spin), check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* GEH - remove this since it was accidentally introduced when kmp_wait was
       split.
       It causes problems with infinite recursion because of exit lock */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
        __kmp_abort_thread(); */

    /* if we have waited a bit, or are oversubscribed, yield */
    /* pause is in the following code */
    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
    KMP_YIELD_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}
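
/* Illustrative use of the spin-wait helper above (a sketch, not taken from the
   runtime): wait until a shared flag reaches a given value, yielding if the
   wait takes a while or the machine is oversubscribed:

     volatile kmp_uint32 flag = 0;
     // ... another thread eventually stores 1 to flag ...
     (void)__kmp_wait_yield_4(&flag, 1, __kmp_eq_4, NULL); // returns once
                                                           // flag == 1
*/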

void __kmp_wait_yield_4_ptr(
    void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32),
    void *obj // Higher-level synchronization object, or NULL.
    ) {
  // note: we may not belong to a team at this point
  void *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(void *, kmp_uint32) = pred;

  KMP_FSYNC_SPIN_INIT(obj, spin);
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* if we have waited a bit, or are oversubscribed, yield */
    /* pause is in the following code */
    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
    KMP_YIELD_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
}

} // extern "C"

#ifdef KMP_GOMP_COMPAT

void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int32 lb,
                               kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint32 lb,
                                kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int64 lb,
                               kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint64 lb,
                                kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

#endif /* KMP_GOMP_COMPAT */

/* ------------------------------------------------------------------------ */