1 /* 2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch. 3 */ 4 5 //===----------------------------------------------------------------------===// 6 // 7 // The LLVM Compiler Infrastructure 8 // 9 // This file is dual licensed under the MIT and the University of Illinois Open 10 // Source Licenses. See LICENSE.txt for details. 11 // 12 //===----------------------------------------------------------------------===// 13 14 /* Dynamic scheduling initialization and dispatch. 15 * 16 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however 17 * it may change values between parallel regions. __kmp_max_nth 18 * is the largest value __kmp_nth may take, 1 is the smallest. 19 */ 20 21 // Need to raise Win version from XP to Vista here for support of 22 // InterlockedExchange64 23 #if defined(_WIN32_WINNT) && defined(_M_IX86) 24 #undef _WIN32_WINNT 25 #define _WIN32_WINNT 0x0502 26 #endif 27 28 #include "kmp.h" 29 #include "kmp_error.h" 30 #include "kmp_i18n.h" 31 #include "kmp_itt.h" 32 #include "kmp_stats.h" 33 #include "kmp_str.h" 34 #if KMP_OS_WINDOWS && KMP_ARCH_X86 35 #include <float.h> 36 #endif 37 #include "kmp_lock.h" 38 #include "kmp_dispatch.h" 39 #if KMP_USE_HIER_SCHED 40 #include "kmp_dispatch_hier.h" 41 #endif 42 43 #if OMPT_SUPPORT 44 #include "ompt-specific.h" 45 #endif 46 47 /* ------------------------------------------------------------------------ */ 48 /* ------------------------------------------------------------------------ */ 49 50 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 51 kmp_info_t *th; 52 53 KMP_DEBUG_ASSERT(gtid_ref); 54 55 if (__kmp_env_consistency_check) { 56 th = __kmp_threads[*gtid_ref]; 57 if (th->th.th_root->r.r_active && 58 (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) { 59 #if KMP_USE_DYNAMIC_LOCK 60 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0); 61 #else 62 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL); 63 #endif 64 } 65 } 66 } 67 68 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 69 kmp_info_t *th; 70 71 if (__kmp_env_consistency_check) { 72 th = __kmp_threads[*gtid_ref]; 73 if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) { 74 __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref); 75 } 76 } 77 } 78 79 // Initialize a dispatch_private_info_template<T> buffer for a particular 80 // type of schedule,chunk. The loop description is found in lb (lower bound), 81 // ub (upper bound), and st (stride). nproc is the number of threads relevant 82 // to the scheduling (often the number of threads in a team, but not always if 83 // hierarchical scheduling is used). tid is the id of the thread calling 84 // the function within the group of nproc threads. It will have a value 85 // between 0 and nproc - 1. This is often just the thread id within a team, but 86 // is not necessarily the case when using hierarchical scheduling. 87 // loc is the source file location of the corresponding loop 88 // gtid is the global thread id 89 template <typename T> 90 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, 91 dispatch_private_info_template<T> *pr, 92 enum sched_type schedule, T lb, T ub, 93 typename traits_t<T>::signed_t st, 94 #if USE_ITT_BUILD 95 kmp_uint64 *cur_chunk, 96 #endif 97 typename traits_t<T>::signed_t chunk, 98 T nproc, T tid) { 99 typedef typename traits_t<T>::unsigned_t UT; 100 typedef typename traits_t<T>::signed_t ST; 101 typedef typename traits_t<T>::floating_t DBL; 102 103 int active; 104 T tc; 105 kmp_info_t *th; 106 kmp_team_t *team; 107 108 #ifdef KMP_DEBUG 109 { 110 char *buff; 111 // create format specifiers before the debug output 112 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called " 113 "pr:%%p lb:%%%s ub:%%%s st:%%%s " 114 "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n", 115 traits_t<T>::spec, traits_t<T>::spec, 116 traits_t<ST>::spec, traits_t<ST>::spec, 117 traits_t<T>::spec, traits_t<T>::spec); 118 KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid)); 119 __kmp_str_free(&buff); 120 } 121 #endif 122 /* setup data */ 123 th = __kmp_threads[gtid]; 124 team = th->th.th_team; 125 active = !team->t.t_serialized; 126 127 #if USE_ITT_BUILD 128 int itt_need_metadata_reporting = __itt_metadata_add_ptr && 129 __kmp_forkjoin_frames_mode == 3 && 130 KMP_MASTER_GTID(gtid) && 131 #if OMP_40_ENABLED 132 th->th.th_teams_microtask == NULL && 133 #endif 134 team->t.t_active_level == 1; 135 #endif 136 #if (KMP_STATIC_STEAL_ENABLED) 137 if (SCHEDULE_HAS_NONMONOTONIC(schedule)) 138 // AC: we now have only one implementation of stealing, so use it 139 schedule = kmp_sch_static_steal; 140 else 141 #endif 142 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 143 144 /* Pick up the nomerge/ordered bits from the scheduling type */ 145 if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) { 146 pr->flags.nomerge = TRUE; 147 schedule = 148 (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); 149 } else { 150 pr->flags.nomerge = FALSE; 151 } 152 pr->type_size = traits_t<T>::type_size; // remember the size of variables 153 if (kmp_ord_lower & schedule) { 154 pr->flags.ordered = TRUE; 155 schedule = 156 (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); 157 } else { 158 pr->flags.ordered = FALSE; 159 } 160 161 if (schedule == kmp_sch_static) { 162 schedule = __kmp_static; 163 } else { 164 if (schedule == kmp_sch_runtime) { 165 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if 166 // not specified) 167 schedule = team->t.t_sched.r_sched_type; 168 // Detail the schedule if needed (global controls are differentiated 169 // appropriately) 170 if (schedule == kmp_sch_guided_chunked) { 171 schedule = __kmp_guided; 172 } else if (schedule == kmp_sch_static) { 173 schedule = __kmp_static; 174 } 175 // Use the chunk size specified by OMP_SCHEDULE (or default if not 176 // specified) 177 chunk = team->t.t_sched.chunk; 178 #if USE_ITT_BUILD 179 if (cur_chunk) 180 *cur_chunk = chunk; 181 #endif 182 #ifdef KMP_DEBUG 183 { 184 char *buff; 185 // create format specifiers before the debug output 186 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: " 187 "schedule:%%d chunk:%%%s\n", 188 traits_t<ST>::spec); 189 KD_TRACE(10, (buff, gtid, schedule, chunk)); 190 __kmp_str_free(&buff); 191 } 192 #endif 193 } else { 194 if (schedule == kmp_sch_guided_chunked) { 195 schedule = __kmp_guided; 196 } 197 if (chunk <= 0) { 198 chunk = KMP_DEFAULT_CHUNK; 199 } 200 } 201 202 if (schedule == kmp_sch_auto) { 203 // mapping and differentiation: in the __kmp_do_serial_initialize() 204 schedule = __kmp_auto; 205 #ifdef KMP_DEBUG 206 { 207 char *buff; 208 // create format specifiers before the debug output 209 buff = __kmp_str_format( 210 "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: " 211 "schedule:%%d chunk:%%%s\n", 212 traits_t<ST>::spec); 213 KD_TRACE(10, (buff, gtid, schedule, chunk)); 214 __kmp_str_free(&buff); 215 } 216 #endif 217 } 218 219 /* guided analytical not safe for too many threads */ 220 if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) { 221 schedule = kmp_sch_guided_iterative_chunked; 222 KMP_WARNING(DispatchManyThreads); 223 } 224 #if OMP_45_ENABLED 225 if (schedule == kmp_sch_runtime_simd) { 226 // compiler provides simd_width in the chunk parameter 227 schedule = team->t.t_sched.r_sched_type; 228 // Detail the schedule if needed (global controls are differentiated 229 // appropriately) 230 if (schedule == kmp_sch_static || schedule == kmp_sch_auto || 231 schedule == __kmp_static) { 232 schedule = kmp_sch_static_balanced_chunked; 233 } else { 234 if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) { 235 schedule = kmp_sch_guided_simd; 236 } 237 chunk = team->t.t_sched.chunk * chunk; 238 } 239 #if USE_ITT_BUILD 240 if (cur_chunk) 241 *cur_chunk = chunk; 242 #endif 243 #ifdef KMP_DEBUG 244 { 245 char *buff; 246 // create format specifiers before the debug output 247 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d" 248 " chunk:%%%s\n", 249 traits_t<ST>::spec); 250 KD_TRACE(10, (buff, gtid, schedule, chunk)); 251 __kmp_str_free(&buff); 252 } 253 #endif 254 } 255 #endif // OMP_45_ENABLED 256 pr->u.p.parm1 = chunk; 257 } 258 KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper), 259 "unknown scheduling type"); 260 261 pr->u.p.count = 0; 262 263 if (__kmp_env_consistency_check) { 264 if (st == 0) { 265 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, 266 (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc); 267 } 268 } 269 // compute trip count 270 if (st == 1) { // most common case 271 if (ub >= lb) { 272 tc = ub - lb + 1; 273 } else { // ub < lb 274 tc = 0; // zero-trip 275 } 276 } else if (st < 0) { 277 if (lb >= ub) { 278 // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B), 279 // where the division needs to be unsigned regardless of the result type 280 tc = (UT)(lb - ub) / (-st) + 1; 281 } else { // lb < ub 282 tc = 0; // zero-trip 283 } 284 } else { // st > 0 285 if (ub >= lb) { 286 // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B), 287 // where the division needs to be unsigned regardless of the result type 288 tc = (UT)(ub - lb) / st + 1; 289 } else { // ub < lb 290 tc = 0; // zero-trip 291 } 292 } 293 294 pr->u.p.lb = lb; 295 pr->u.p.ub = ub; 296 pr->u.p.st = st; 297 pr->u.p.tc = tc; 298 299 #if KMP_OS_WINDOWS 300 pr->u.p.last_upper = ub + st; 301 #endif /* KMP_OS_WINDOWS */ 302 303 /* NOTE: only the active parallel region(s) has active ordered sections */ 304 305 if (active) { 306 if (pr->flags.ordered) { 307 pr->ordered_bumped = 0; 308 pr->u.p.ordered_lower = 1; 309 pr->u.p.ordered_upper = 0; 310 } 311 } 312 313 switch (schedule) { 314 #if (KMP_STATIC_STEAL_ENABLED) 315 case kmp_sch_static_steal: { 316 T ntc, init; 317 318 KD_TRACE(100, 319 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n", 320 gtid)); 321 322 ntc = (tc % chunk ? 1 : 0) + tc / chunk; 323 if (nproc > 1 && ntc >= nproc) { 324 KMP_COUNT_BLOCK(OMP_FOR_static_steal); 325 T id = tid; 326 T small_chunk, extras; 327 328 small_chunk = ntc / nproc; 329 extras = ntc % nproc; 330 331 init = id * small_chunk + (id < extras ? id : extras); 332 pr->u.p.count = init; 333 pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0); 334 335 pr->u.p.parm2 = lb; 336 // pr->pfields.parm3 = 0; // it's not used in static_steal 337 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid 338 pr->u.p.st = st; 339 if (traits_t<T>::type_size > 4) { 340 // AC: TODO: check if 16-byte CAS available and use it to 341 // improve performance (probably wait for explicit request 342 // before spending time on this). 343 // For now use dynamically allocated per-thread lock, 344 // free memory in __kmp_dispatch_next when status==0. 345 KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL); 346 th->th.th_dispatch->th_steal_lock = 347 (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t)); 348 __kmp_init_lock(th->th.th_dispatch->th_steal_lock); 349 } 350 break; 351 } else { 352 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 353 "kmp_sch_static_balanced\n", 354 gtid)); 355 schedule = kmp_sch_static_balanced; 356 /* too few iterations: fall-through to kmp_sch_static_balanced */ 357 } // if 358 /* FALL-THROUGH to static balanced */ 359 } // case 360 #endif 361 case kmp_sch_static_balanced: { 362 T init, limit; 363 364 KD_TRACE( 365 100, 366 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n", 367 gtid)); 368 369 if (nproc > 1) { 370 T id = tid; 371 372 if (tc < nproc) { 373 if (id < tc) { 374 init = id; 375 limit = id; 376 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ 377 } else { 378 pr->u.p.count = 1; /* means no more chunks to execute */ 379 pr->u.p.parm1 = FALSE; 380 break; 381 } 382 } else { 383 T small_chunk = tc / nproc; 384 T extras = tc % nproc; 385 init = id * small_chunk + (id < extras ? id : extras); 386 limit = init + small_chunk - (id < extras ? 0 : 1); 387 pr->u.p.parm1 = (id == nproc - 1); 388 } 389 } else { 390 if (tc > 0) { 391 init = 0; 392 limit = tc - 1; 393 pr->u.p.parm1 = TRUE; 394 } else { 395 // zero trip count 396 pr->u.p.count = 1; /* means no more chunks to execute */ 397 pr->u.p.parm1 = FALSE; 398 break; 399 } 400 } 401 #if USE_ITT_BUILD 402 // Calculate chunk for metadata report 403 if (itt_need_metadata_reporting) 404 if (cur_chunk) 405 *cur_chunk = limit - init + 1; 406 #endif 407 if (st == 1) { 408 pr->u.p.lb = lb + init; 409 pr->u.p.ub = lb + limit; 410 } else { 411 // calculated upper bound, "ub" is user-defined upper bound 412 T ub_tmp = lb + limit * st; 413 pr->u.p.lb = lb + init * st; 414 // adjust upper bound to "ub" if needed, so that MS lastprivate will match 415 // it exactly 416 if (st > 0) { 417 pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp); 418 } else { 419 pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp); 420 } 421 } 422 if (pr->flags.ordered) { 423 pr->u.p.ordered_lower = init; 424 pr->u.p.ordered_upper = limit; 425 } 426 break; 427 } // case 428 #if OMP_45_ENABLED 429 case kmp_sch_static_balanced_chunked: { 430 // similar to balanced, but chunk adjusted to multiple of simd width 431 T nth = nproc; 432 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)" 433 " -> falling-through to static_greedy\n", 434 gtid)); 435 schedule = kmp_sch_static_greedy; 436 if (nth > 1) 437 pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1); 438 else 439 pr->u.p.parm1 = tc; 440 break; 441 } // case 442 case kmp_sch_guided_simd: 443 #endif // OMP_45_ENABLED 444 case kmp_sch_guided_iterative_chunked: { 445 KD_TRACE( 446 100, 447 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked" 448 " case\n", 449 gtid)); 450 451 if (nproc > 1) { 452 if ((2L * chunk + 1) * nproc >= tc) { 453 /* chunk size too large, switch to dynamic */ 454 schedule = kmp_sch_dynamic_chunked; 455 } else { 456 // when remaining iters become less than parm2 - switch to dynamic 457 pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1); 458 *(double *)&pr->u.p.parm3 = 459 guided_flt_param / nproc; // may occupy parm3 and parm4 460 } 461 } else { 462 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 463 "kmp_sch_static_greedy\n", 464 gtid)); 465 schedule = kmp_sch_static_greedy; 466 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 467 KD_TRACE( 468 100, 469 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n", 470 gtid)); 471 pr->u.p.parm1 = tc; 472 } // if 473 } // case 474 break; 475 case kmp_sch_guided_analytical_chunked: { 476 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d " 477 "kmp_sch_guided_analytical_chunked case\n", 478 gtid)); 479 480 if (nproc > 1) { 481 if ((2L * chunk + 1) * nproc >= tc) { 482 /* chunk size too large, switch to dynamic */ 483 schedule = kmp_sch_dynamic_chunked; 484 } else { 485 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 486 DBL x; 487 488 #if KMP_OS_WINDOWS && KMP_ARCH_X86 489 /* Linux* OS already has 64-bit computation by default for long double, 490 and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On 491 Windows* OS on IA-32 architecture, we need to set precision to 64-bit 492 instead of the default 53-bit. Even though long double doesn't work 493 on Windows* OS on Intel(R) 64, the resulting lack of precision is not 494 expected to impact the correctness of the algorithm, but this has not 495 been mathematically proven. */ 496 // save original FPCW and set precision to 64-bit, as 497 // Windows* OS on IA-32 architecture defaults to 53-bit 498 unsigned int oldFpcw = _control87(0, 0); 499 _control87(_PC_64, _MCW_PC); // 0,0x30000 500 #endif 501 /* value used for comparison in solver for cross-over point */ 502 long double target = ((long double)chunk * 2 + 1) * nproc / tc; 503 504 /* crossover point--chunk indexes equal to or greater than 505 this point switch to dynamic-style scheduling */ 506 UT cross; 507 508 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 509 x = (long double)1.0 - (long double)0.5 / nproc; 510 511 #ifdef KMP_DEBUG 512 { // test natural alignment 513 struct _test_a { 514 char a; 515 union { 516 char b; 517 DBL d; 518 }; 519 } t; 520 ptrdiff_t natural_alignment = 521 (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; 522 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long 523 // long)natural_alignment ); 524 KMP_DEBUG_ASSERT( 525 (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0); 526 } 527 #endif // KMP_DEBUG 528 529 /* save the term in thread private dispatch structure */ 530 *(DBL *)&pr->u.p.parm3 = x; 531 532 /* solve for the crossover point to the nearest integer i for which C_i 533 <= chunk */ 534 { 535 UT left, right, mid; 536 long double p; 537 538 /* estimate initial upper and lower bound */ 539 540 /* doesn't matter what value right is as long as it is positive, but 541 it affects performance of the solver */ 542 right = 229; 543 p = __kmp_pow<UT>(x, right); 544 if (p > target) { 545 do { 546 p *= p; 547 right <<= 1; 548 } while (p > target && right < (1 << 27)); 549 /* lower bound is previous (failed) estimate of upper bound */ 550 left = right >> 1; 551 } else { 552 left = 0; 553 } 554 555 /* bisection root-finding method */ 556 while (left + 1 < right) { 557 mid = (left + right) / 2; 558 if (__kmp_pow<UT>(x, mid) > target) { 559 left = mid; 560 } else { 561 right = mid; 562 } 563 } // while 564 cross = right; 565 } 566 /* assert sanity of computed crossover point */ 567 KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target && 568 __kmp_pow<UT>(x, cross) <= target); 569 570 /* save the crossover point in thread private dispatch structure */ 571 pr->u.p.parm2 = cross; 572 573 // C75803 574 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8)) 575 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3) 576 #else 577 #define GUIDED_ANALYTICAL_WORKAROUND (x) 578 #endif 579 /* dynamic-style scheduling offset */ 580 pr->u.p.count = tc - __kmp_dispatch_guided_remaining( 581 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - 582 cross * chunk; 583 #if KMP_OS_WINDOWS && KMP_ARCH_X86 584 // restore FPCW 585 _control87(oldFpcw, _MCW_PC); 586 #endif 587 } // if 588 } else { 589 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 590 "kmp_sch_static_greedy\n", 591 gtid)); 592 schedule = kmp_sch_static_greedy; 593 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 594 pr->u.p.parm1 = tc; 595 } // if 596 } // case 597 break; 598 case kmp_sch_static_greedy: 599 KD_TRACE( 600 100, 601 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n", 602 gtid)); 603 pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc; 604 break; 605 case kmp_sch_static_chunked: 606 case kmp_sch_dynamic_chunked: 607 if (pr->u.p.parm1 <= 0) { 608 pr->u.p.parm1 = KMP_DEFAULT_CHUNK; 609 } 610 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d " 611 "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", 612 gtid)); 613 break; 614 case kmp_sch_trapezoidal: { 615 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ 616 617 T parm1, parm2, parm3, parm4; 618 KD_TRACE(100, 619 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n", 620 gtid)); 621 622 parm1 = chunk; 623 624 /* F : size of the first cycle */ 625 parm2 = (tc / (2 * nproc)); 626 627 if (parm2 < 1) { 628 parm2 = 1; 629 } 630 631 /* L : size of the last cycle. Make sure the last cycle is not larger 632 than the first cycle. */ 633 if (parm1 < 1) { 634 parm1 = 1; 635 } else if (parm1 > parm2) { 636 parm1 = parm2; 637 } 638 639 /* N : number of cycles */ 640 parm3 = (parm2 + parm1); 641 parm3 = (2 * tc + parm3 - 1) / parm3; 642 643 if (parm3 < 2) { 644 parm3 = 2; 645 } 646 647 /* sigma : decreasing incr of the trapezoid */ 648 parm4 = (parm3 - 1); 649 parm4 = (parm2 - parm1) / parm4; 650 651 // pointless check, because parm4 >= 0 always 652 // if ( parm4 < 0 ) { 653 // parm4 = 0; 654 //} 655 656 pr->u.p.parm1 = parm1; 657 pr->u.p.parm2 = parm2; 658 pr->u.p.parm3 = parm3; 659 pr->u.p.parm4 = parm4; 660 } // case 661 break; 662 663 default: { 664 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 665 KMP_HNT(GetNewerLibrary), // Hint 666 __kmp_msg_null // Variadic argument list terminator 667 ); 668 } break; 669 } // switch 670 pr->schedule = schedule; 671 } 672 673 #if KMP_USE_HIER_SCHED 674 template <typename T> 675 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub, 676 typename traits_t<T>::signed_t st); 677 template <> 678 inline void 679 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb, 680 kmp_int32 ub, kmp_int32 st) { 681 __kmp_dispatch_init_hierarchy<kmp_int32>( 682 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 683 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st); 684 } 685 template <> 686 inline void 687 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb, 688 kmp_uint32 ub, kmp_int32 st) { 689 __kmp_dispatch_init_hierarchy<kmp_uint32>( 690 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 691 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st); 692 } 693 template <> 694 inline void 695 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb, 696 kmp_int64 ub, kmp_int64 st) { 697 __kmp_dispatch_init_hierarchy<kmp_int64>( 698 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 699 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st); 700 } 701 template <> 702 inline void 703 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb, 704 kmp_uint64 ub, kmp_int64 st) { 705 __kmp_dispatch_init_hierarchy<kmp_uint64>( 706 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 707 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st); 708 } 709 710 // free all the hierarchy scheduling memory associated with the team 711 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) { 712 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2; 713 for (int i = 0; i < num_disp_buff; ++i) { 714 // type does not matter here so use kmp_int32 715 auto sh = 716 reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>( 717 &team->t.t_disp_buffer[i]); 718 if (sh->hier) { 719 sh->hier->deallocate(); 720 __kmp_free(sh->hier); 721 } 722 } 723 } 724 #endif 725 726 // UT - unsigned flavor of T, ST - signed flavor of T, 727 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8 728 template <typename T> 729 static void 730 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb, 731 T ub, typename traits_t<T>::signed_t st, 732 typename traits_t<T>::signed_t chunk, int push_ws) { 733 typedef typename traits_t<T>::unsigned_t UT; 734 typedef typename traits_t<T>::signed_t ST; 735 typedef typename traits_t<T>::floating_t DBL; 736 737 int active; 738 kmp_info_t *th; 739 kmp_team_t *team; 740 kmp_uint32 my_buffer_index; 741 dispatch_private_info_template<T> *pr; 742 dispatch_shared_info_template<T> volatile *sh; 743 744 KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) == 745 sizeof(dispatch_private_info)); 746 KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) == 747 sizeof(dispatch_shared_info)); 748 749 if (!TCR_4(__kmp_init_parallel)) 750 __kmp_parallel_initialize(); 751 752 #if INCLUDE_SSC_MARKS 753 SSC_MARK_DISPATCH_INIT(); 754 #endif 755 #ifdef KMP_DEBUG 756 { 757 char *buff; 758 // create format specifiers before the debug output 759 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d " 760 "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n", 761 traits_t<ST>::spec, traits_t<T>::spec, 762 traits_t<T>::spec, traits_t<ST>::spec); 763 KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st)); 764 __kmp_str_free(&buff); 765 } 766 #endif 767 /* setup data */ 768 th = __kmp_threads[gtid]; 769 team = th->th.th_team; 770 active = !team->t.t_serialized; 771 th->th.th_ident = loc; 772 773 #if KMP_USE_HIER_SCHED 774 // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable 775 // Hierarchical scheduling does not work with ordered, so if ordered is 776 // detected, then revert back to threaded scheduling. 777 bool ordered; 778 enum sched_type my_sched = schedule; 779 my_buffer_index = th->th.th_dispatch->th_disp_index; 780 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 781 &th->th.th_dispatch 782 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 783 my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched); 784 if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper)) 785 my_sched = 786 (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower)); 787 ordered = (kmp_ord_lower & my_sched); 788 if (pr->flags.use_hier) { 789 if (ordered) { 790 KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. " 791 "Disabling hierarchical scheduling.\n", 792 gtid)); 793 pr->flags.use_hier = FALSE; 794 } 795 } 796 if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) { 797 // Don't use hierarchical for ordered parallel loops and don't 798 // use the runtime hierarchy if one was specified in the program 799 if (!ordered && !pr->flags.use_hier) 800 __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st); 801 } 802 #endif // KMP_USE_HIER_SCHED 803 804 #if USE_ITT_BUILD 805 kmp_uint64 cur_chunk = chunk; 806 int itt_need_metadata_reporting = __itt_metadata_add_ptr && 807 __kmp_forkjoin_frames_mode == 3 && 808 KMP_MASTER_GTID(gtid) && 809 #if OMP_40_ENABLED 810 th->th.th_teams_microtask == NULL && 811 #endif 812 team->t.t_active_level == 1; 813 #endif 814 if (!active) { 815 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 816 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 817 } else { 818 KMP_DEBUG_ASSERT(th->th.th_dispatch == 819 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 820 821 my_buffer_index = th->th.th_dispatch->th_disp_index++; 822 823 /* What happens when number of threads changes, need to resize buffer? */ 824 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 825 &th->th.th_dispatch 826 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 827 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 828 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 829 KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid, 830 my_buffer_index)); 831 } 832 833 __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st, 834 #if USE_ITT_BUILD 835 &cur_chunk, 836 #endif 837 chunk, (T)th->th.th_team_nproc, 838 (T)th->th.th_info.ds.ds_tid); 839 if (active) { 840 if (pr->flags.ordered == 0) { 841 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error; 842 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error; 843 } else { 844 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>; 845 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>; 846 } 847 } 848 849 // Any half-decent optimizer will remove this test when the blocks are empty 850 // since the macros expand to nothing 851 // when statistics are disabled. 852 if (schedule == __kmp_static) { 853 KMP_COUNT_BLOCK(OMP_FOR_static); 854 KMP_COUNT_VALUE(FOR_static_iterations, pr->u.p.tc); 855 } else { 856 KMP_COUNT_BLOCK(OMP_FOR_dynamic); 857 KMP_COUNT_VALUE(FOR_dynamic_iterations, pr->u.p.tc); 858 } 859 860 if (active) { 861 /* The name of this buffer should be my_buffer_index when it's free to use 862 * it */ 863 864 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d " 865 "sh->buffer_index:%d\n", 866 gtid, my_buffer_index, sh->buffer_index)); 867 __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index, 868 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL)); 869 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and 870 // my_buffer_index are *always* 32-bit integers. 871 KMP_MB(); /* is this necessary? */ 872 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d " 873 "sh->buffer_index:%d\n", 874 gtid, my_buffer_index, sh->buffer_index)); 875 876 th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr; 877 th->th.th_dispatch->th_dispatch_sh_current = 878 CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh); 879 #if USE_ITT_BUILD 880 if (pr->flags.ordered) { 881 __kmp_itt_ordered_init(gtid); 882 } 883 // Report loop metadata 884 if (itt_need_metadata_reporting) { 885 // Only report metadata by master of active team at level 1 886 kmp_uint64 schedtype = 0; 887 switch (schedule) { 888 case kmp_sch_static_chunked: 889 case kmp_sch_static_balanced: // Chunk is calculated in the switch above 890 break; 891 case kmp_sch_static_greedy: 892 cur_chunk = pr->u.p.parm1; 893 break; 894 case kmp_sch_dynamic_chunked: 895 schedtype = 1; 896 break; 897 case kmp_sch_guided_iterative_chunked: 898 case kmp_sch_guided_analytical_chunked: 899 #if OMP_45_ENABLED 900 case kmp_sch_guided_simd: 901 #endif 902 schedtype = 2; 903 break; 904 default: 905 // Should we put this case under "static"? 906 // case kmp_sch_static_steal: 907 schedtype = 3; 908 break; 909 } 910 __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk); 911 } 912 #if KMP_USE_HIER_SCHED 913 if (pr->flags.use_hier) { 914 pr->u.p.count = 0; 915 pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0; 916 } 917 #endif // KMP_USER_HIER_SCHED 918 #endif /* USE_ITT_BUILD */ 919 } 920 921 #ifdef KMP_DEBUG 922 { 923 char *buff; 924 // create format specifiers before the debug output 925 buff = __kmp_str_format( 926 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s " 927 "lb:%%%s ub:%%%s" 928 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" 929 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n", 930 traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec, 931 traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec, 932 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec, 933 traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec); 934 KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb, 935 pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count, 936 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1, 937 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4)); 938 __kmp_str_free(&buff); 939 } 940 #endif 941 #if (KMP_STATIC_STEAL_ENABLED) 942 // It cannot be guaranteed that after execution of a loop with some other 943 // schedule kind all the parm3 variables will contain the same value. Even if 944 // all parm3 will be the same, it still exists a bad case like using 0 and 1 945 // rather than program life-time increment. So the dedicated variable is 946 // required. The 'static_steal_counter' is used. 947 if (schedule == kmp_sch_static_steal) { 948 // Other threads will inspect this variable when searching for a victim. 949 // This is a flag showing that other threads may steal from this thread 950 // since then. 951 volatile T *p = &pr->u.p.static_steal_counter; 952 *p = *p + 1; 953 } 954 #endif // ( KMP_STATIC_STEAL_ENABLED ) 955 956 #if OMPT_SUPPORT && OMPT_OPTIONAL 957 if (ompt_enabled.ompt_callback_work) { 958 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 959 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 960 ompt_callbacks.ompt_callback(ompt_callback_work)( 961 ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data), 962 &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid)); 963 } 964 #endif 965 } 966 967 /* For ordered loops, either __kmp_dispatch_finish() should be called after 968 * every iteration, or __kmp_dispatch_finish_chunk() should be called after 969 * every chunk of iterations. If the ordered section(s) were not executed 970 * for this iteration (or every iteration in this chunk), we need to set the 971 * ordered iteration counters so that the next thread can proceed. */ 972 template <typename UT> 973 static void __kmp_dispatch_finish(int gtid, ident_t *loc) { 974 typedef typename traits_t<UT>::signed_t ST; 975 kmp_info_t *th = __kmp_threads[gtid]; 976 977 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid)); 978 if (!th->th.th_team->t.t_serialized) { 979 980 dispatch_private_info_template<UT> *pr = 981 reinterpret_cast<dispatch_private_info_template<UT> *>( 982 th->th.th_dispatch->th_dispatch_pr_current); 983 dispatch_shared_info_template<UT> volatile *sh = 984 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 985 th->th.th_dispatch->th_dispatch_sh_current); 986 KMP_DEBUG_ASSERT(pr); 987 KMP_DEBUG_ASSERT(sh); 988 KMP_DEBUG_ASSERT(th->th.th_dispatch == 989 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 990 991 if (pr->ordered_bumped) { 992 KD_TRACE( 993 1000, 994 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 995 gtid)); 996 pr->ordered_bumped = 0; 997 } else { 998 UT lower = pr->u.p.ordered_lower; 999 1000 #ifdef KMP_DEBUG 1001 { 1002 char *buff; 1003 // create format specifiers before the debug output 1004 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: " 1005 "ordered_iteration:%%%s lower:%%%s\n", 1006 traits_t<UT>::spec, traits_t<UT>::spec); 1007 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 1008 __kmp_str_free(&buff); 1009 } 1010 #endif 1011 1012 __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower, 1013 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 1014 KMP_MB(); /* is this necessary? */ 1015 #ifdef KMP_DEBUG 1016 { 1017 char *buff; 1018 // create format specifiers before the debug output 1019 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: " 1020 "ordered_iteration:%%%s lower:%%%s\n", 1021 traits_t<UT>::spec, traits_t<UT>::spec); 1022 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 1023 __kmp_str_free(&buff); 1024 } 1025 #endif 1026 1027 test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration); 1028 } // if 1029 } // if 1030 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid)); 1031 } 1032 1033 #ifdef KMP_GOMP_COMPAT 1034 1035 template <typename UT> 1036 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) { 1037 typedef typename traits_t<UT>::signed_t ST; 1038 kmp_info_t *th = __kmp_threads[gtid]; 1039 1040 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid)); 1041 if (!th->th.th_team->t.t_serialized) { 1042 // int cid; 1043 dispatch_private_info_template<UT> *pr = 1044 reinterpret_cast<dispatch_private_info_template<UT> *>( 1045 th->th.th_dispatch->th_dispatch_pr_current); 1046 dispatch_shared_info_template<UT> volatile *sh = 1047 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 1048 th->th.th_dispatch->th_dispatch_sh_current); 1049 KMP_DEBUG_ASSERT(pr); 1050 KMP_DEBUG_ASSERT(sh); 1051 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1052 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1053 1054 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) { 1055 UT lower = pr->u.p.ordered_lower; 1056 UT upper = pr->u.p.ordered_upper; 1057 UT inc = upper - lower + 1; 1058 1059 if (pr->ordered_bumped == inc) { 1060 KD_TRACE( 1061 1000, 1062 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1063 gtid)); 1064 pr->ordered_bumped = 0; 1065 } else { 1066 inc -= pr->ordered_bumped; 1067 1068 #ifdef KMP_DEBUG 1069 { 1070 char *buff; 1071 // create format specifiers before the debug output 1072 buff = __kmp_str_format( 1073 "__kmp_dispatch_finish_chunk: T#%%d before wait: " 1074 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n", 1075 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec); 1076 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper)); 1077 __kmp_str_free(&buff); 1078 } 1079 #endif 1080 1081 __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower, 1082 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 1083 1084 KMP_MB(); /* is this necessary? */ 1085 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting " 1086 "ordered_bumped to zero\n", 1087 gtid)); 1088 pr->ordered_bumped = 0; 1089 //!!!!! TODO check if the inc should be unsigned, or signed??? 1090 #ifdef KMP_DEBUG 1091 { 1092 char *buff; 1093 // create format specifiers before the debug output 1094 buff = __kmp_str_format( 1095 "__kmp_dispatch_finish_chunk: T#%%d after wait: " 1096 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n", 1097 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec, 1098 traits_t<UT>::spec); 1099 KD_TRACE(1000, 1100 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper)); 1101 __kmp_str_free(&buff); 1102 } 1103 #endif 1104 1105 test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc); 1106 } 1107 // } 1108 } 1109 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid)); 1110 } 1111 1112 #endif /* KMP_GOMP_COMPAT */ 1113 1114 template <typename T> 1115 int __kmp_dispatch_next_algorithm(int gtid, 1116 dispatch_private_info_template<T> *pr, 1117 dispatch_shared_info_template<T> volatile *sh, 1118 kmp_int32 *p_last, T *p_lb, T *p_ub, 1119 typename traits_t<T>::signed_t *p_st, T nproc, 1120 T tid) { 1121 typedef typename traits_t<T>::unsigned_t UT; 1122 typedef typename traits_t<T>::signed_t ST; 1123 typedef typename traits_t<T>::floating_t DBL; 1124 int status = 0; 1125 kmp_int32 last = 0; 1126 T start; 1127 ST incr; 1128 UT limit, trip, init; 1129 kmp_info_t *th = __kmp_threads[gtid]; 1130 kmp_team_t *team = th->th.th_team; 1131 1132 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1133 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1134 KMP_DEBUG_ASSERT(pr); 1135 KMP_DEBUG_ASSERT(sh); 1136 KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc); 1137 #ifdef KMP_DEBUG 1138 { 1139 char *buff; 1140 // create format specifiers before the debug output 1141 buff = 1142 __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p " 1143 "sh:%%p nproc:%%%s tid:%%%s\n", 1144 traits_t<T>::spec, traits_t<T>::spec); 1145 KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid)); 1146 __kmp_str_free(&buff); 1147 } 1148 #endif 1149 1150 // zero trip count 1151 if (pr->u.p.tc == 0) { 1152 KD_TRACE(10, 1153 ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is " 1154 "zero status:%d\n", 1155 gtid, status)); 1156 return 0; 1157 } 1158 1159 switch (pr->schedule) { 1160 #if (KMP_STATIC_STEAL_ENABLED) 1161 case kmp_sch_static_steal: { 1162 T chunk = pr->u.p.parm1; 1163 1164 KD_TRACE(100, 1165 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n", 1166 gtid)); 1167 1168 trip = pr->u.p.tc - 1; 1169 1170 if (traits_t<T>::type_size > 4) { 1171 // use lock for 8-byte and CAS for 4-byte induction 1172 // variable. TODO (optional): check and use 16-byte CAS 1173 kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock; 1174 KMP_DEBUG_ASSERT(lck != NULL); 1175 if (pr->u.p.count < (UT)pr->u.p.ub) { 1176 __kmp_acquire_lock(lck, gtid); 1177 // try to get own chunk of iterations 1178 init = (pr->u.p.count)++; 1179 status = (init < (UT)pr->u.p.ub); 1180 __kmp_release_lock(lck, gtid); 1181 } else { 1182 status = 0; // no own chunks 1183 } 1184 if (!status) { // try to steal 1185 kmp_info_t **other_threads = team->t.t_threads; 1186 int while_limit = nproc; // nproc attempts to find a victim 1187 int while_index = 0; 1188 // TODO: algorithm of searching for a victim 1189 // should be cleaned up and measured 1190 while ((!status) && (while_limit != ++while_index)) { 1191 T remaining; 1192 T victimIdx = pr->u.p.parm4; 1193 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1; 1194 dispatch_private_info_template<T> *victim = 1195 reinterpret_cast<dispatch_private_info_template<T> *>( 1196 other_threads[victimIdx] 1197 ->th.th_dispatch->th_dispatch_pr_current); 1198 while ((victim == NULL || victim == pr || 1199 (*(volatile T *)&victim->u.p.static_steal_counter != 1200 *(volatile T *)&pr->u.p.static_steal_counter)) && 1201 oldVictimIdx != victimIdx) { 1202 victimIdx = (victimIdx + 1) % nproc; 1203 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1204 other_threads[victimIdx] 1205 ->th.th_dispatch->th_dispatch_pr_current); 1206 } 1207 if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter != 1208 *(volatile T *)&pr->u.p.static_steal_counter)) { 1209 continue; // try once more (nproc attempts in total) 1210 // no victim is ready yet to participate in stealing 1211 // because all victims are still in kmp_init_dispatch 1212 } 1213 if (victim->u.p.count + 2 > (UT)victim->u.p.ub) { 1214 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid 1215 continue; // not enough chunks to steal, goto next victim 1216 } 1217 1218 lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock; 1219 KMP_ASSERT(lck != NULL); 1220 __kmp_acquire_lock(lck, gtid); 1221 limit = victim->u.p.ub; // keep initial ub 1222 if (victim->u.p.count >= limit || 1223 (remaining = limit - victim->u.p.count) < 2) { 1224 __kmp_release_lock(lck, gtid); 1225 pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim 1226 continue; // not enough chunks to steal 1227 } 1228 // stealing succeded, reduce victim's ub by 1/4 of undone chunks or 1229 // by 1 1230 if (remaining > 3) { 1231 // steal 1/4 of remaining 1232 KMP_COUNT_VALUE(FOR_static_steal_stolen, remaining >> 2); 1233 init = (victim->u.p.ub -= (remaining >> 2)); 1234 } else { 1235 // steal 1 chunk of 2 or 3 remaining 1236 KMP_COUNT_VALUE(FOR_static_steal_stolen, 1); 1237 init = (victim->u.p.ub -= 1); 1238 } 1239 __kmp_release_lock(lck, gtid); 1240 1241 KMP_DEBUG_ASSERT(init + 1 <= limit); 1242 pr->u.p.parm4 = victimIdx; // remember victim to steal from 1243 status = 1; 1244 while_index = 0; 1245 // now update own count and ub with stolen range but init chunk 1246 __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid); 1247 pr->u.p.count = init + 1; 1248 pr->u.p.ub = limit; 1249 __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid); 1250 } // while (search for victim) 1251 } // if (try to find victim and steal) 1252 } else { 1253 // 4-byte induction variable, use 8-byte CAS for pair (count, ub) 1254 typedef union { 1255 struct { 1256 UT count; 1257 T ub; 1258 } p; 1259 kmp_int64 b; 1260 } union_i4; 1261 // All operations on 'count' or 'ub' must be combined atomically 1262 // together. 1263 { 1264 union_i4 vold, vnew; 1265 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 1266 vnew = vold; 1267 vnew.p.count++; 1268 while (!KMP_COMPARE_AND_STORE_ACQ64( 1269 (volatile kmp_int64 *)&pr->u.p.count, 1270 *VOLATILE_CAST(kmp_int64 *) & vold.b, 1271 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 1272 KMP_CPU_PAUSE(); 1273 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 1274 vnew = vold; 1275 vnew.p.count++; 1276 } 1277 vnew = vold; 1278 init = vnew.p.count; 1279 status = (init < (UT)vnew.p.ub); 1280 } 1281 1282 if (!status) { 1283 kmp_info_t **other_threads = team->t.t_threads; 1284 int while_limit = nproc; // nproc attempts to find a victim 1285 int while_index = 0; 1286 1287 // TODO: algorithm of searching for a victim 1288 // should be cleaned up and measured 1289 while ((!status) && (while_limit != ++while_index)) { 1290 union_i4 vold, vnew; 1291 kmp_int32 remaining; 1292 T victimIdx = pr->u.p.parm4; 1293 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1; 1294 dispatch_private_info_template<T> *victim = 1295 reinterpret_cast<dispatch_private_info_template<T> *>( 1296 other_threads[victimIdx] 1297 ->th.th_dispatch->th_dispatch_pr_current); 1298 while ((victim == NULL || victim == pr || 1299 (*(volatile T *)&victim->u.p.static_steal_counter != 1300 *(volatile T *)&pr->u.p.static_steal_counter)) && 1301 oldVictimIdx != victimIdx) { 1302 victimIdx = (victimIdx + 1) % nproc; 1303 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1304 other_threads[victimIdx] 1305 ->th.th_dispatch->th_dispatch_pr_current); 1306 } 1307 if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter != 1308 *(volatile T *)&pr->u.p.static_steal_counter)) { 1309 continue; // try once more (nproc attempts in total) 1310 // no victim is ready yet to participate in stealing 1311 // because all victims are still in kmp_init_dispatch 1312 } 1313 pr->u.p.parm4 = victimIdx; // new victim found 1314 while (1) { // CAS loop if victim has enough chunks to steal 1315 vold.b = *(volatile kmp_int64 *)(&victim->u.p.count); 1316 vnew = vold; 1317 1318 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1319 if (vnew.p.count >= (UT)vnew.p.ub || 1320 (remaining = vnew.p.ub - vnew.p.count) < 2) { 1321 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id 1322 break; // not enough chunks to steal, goto next victim 1323 } 1324 if (remaining > 3) { 1325 vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining 1326 } else { 1327 vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining 1328 } 1329 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1330 // TODO: Should this be acquire or release? 1331 if (KMP_COMPARE_AND_STORE_ACQ64( 1332 (volatile kmp_int64 *)&victim->u.p.count, 1333 *VOLATILE_CAST(kmp_int64 *) & vold.b, 1334 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 1335 // stealing succedded 1336 KMP_COUNT_VALUE(FOR_static_steal_stolen, vold.p.ub - vnew.p.ub); 1337 status = 1; 1338 while_index = 0; 1339 // now update own count and ub 1340 init = vnew.p.ub; 1341 vold.p.count = init + 1; 1342 #if KMP_ARCH_X86 1343 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b); 1344 #else 1345 *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b; 1346 #endif 1347 break; 1348 } // if (check CAS result) 1349 KMP_CPU_PAUSE(); // CAS failed, repeate attempt 1350 } // while (try to steal from particular victim) 1351 } // while (search for victim) 1352 } // if (try to find victim and steal) 1353 } // if (4-byte induction variable) 1354 if (!status) { 1355 *p_lb = 0; 1356 *p_ub = 0; 1357 if (p_st != NULL) 1358 *p_st = 0; 1359 } else { 1360 start = pr->u.p.parm2; 1361 init *= chunk; 1362 limit = chunk + init - 1; 1363 incr = pr->u.p.st; 1364 KMP_COUNT_VALUE(FOR_static_steal_chunks, 1); 1365 1366 KMP_DEBUG_ASSERT(init <= trip); 1367 if ((last = (limit >= trip)) != 0) 1368 limit = trip; 1369 if (p_st != NULL) 1370 *p_st = incr; 1371 1372 if (incr == 1) { 1373 *p_lb = start + init; 1374 *p_ub = start + limit; 1375 } else { 1376 *p_lb = start + init * incr; 1377 *p_ub = start + limit * incr; 1378 } 1379 1380 if (pr->flags.ordered) { 1381 pr->u.p.ordered_lower = init; 1382 pr->u.p.ordered_upper = limit; 1383 } // if 1384 } // if 1385 break; 1386 } // case 1387 #endif // ( KMP_STATIC_STEAL_ENABLED ) 1388 case kmp_sch_static_balanced: { 1389 KD_TRACE( 1390 10, 1391 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n", 1392 gtid)); 1393 /* check if thread has any iteration to do */ 1394 if ((status = !pr->u.p.count) != 0) { 1395 pr->u.p.count = 1; 1396 *p_lb = pr->u.p.lb; 1397 *p_ub = pr->u.p.ub; 1398 last = pr->u.p.parm1; 1399 if (p_st != NULL) 1400 *p_st = pr->u.p.st; 1401 } else { /* no iterations to do */ 1402 pr->u.p.lb = pr->u.p.ub + pr->u.p.st; 1403 } 1404 } // case 1405 break; 1406 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was 1407 merged here */ 1408 case kmp_sch_static_chunked: { 1409 T parm1; 1410 1411 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d " 1412 "kmp_sch_static_[affinity|chunked] case\n", 1413 gtid)); 1414 parm1 = pr->u.p.parm1; 1415 1416 trip = pr->u.p.tc - 1; 1417 init = parm1 * (pr->u.p.count + tid); 1418 1419 if ((status = (init <= trip)) != 0) { 1420 start = pr->u.p.lb; 1421 incr = pr->u.p.st; 1422 limit = parm1 + init - 1; 1423 1424 if ((last = (limit >= trip)) != 0) 1425 limit = trip; 1426 1427 if (p_st != NULL) 1428 *p_st = incr; 1429 1430 pr->u.p.count += nproc; 1431 1432 if (incr == 1) { 1433 *p_lb = start + init; 1434 *p_ub = start + limit; 1435 } else { 1436 *p_lb = start + init * incr; 1437 *p_ub = start + limit * incr; 1438 } 1439 1440 if (pr->flags.ordered) { 1441 pr->u.p.ordered_lower = init; 1442 pr->u.p.ordered_upper = limit; 1443 } // if 1444 } // if 1445 } // case 1446 break; 1447 1448 case kmp_sch_dynamic_chunked: { 1449 T chunk = pr->u.p.parm1; 1450 1451 KD_TRACE( 1452 100, 1453 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n", 1454 gtid)); 1455 1456 init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 1457 trip = pr->u.p.tc - 1; 1458 1459 if ((status = (init <= trip)) == 0) { 1460 *p_lb = 0; 1461 *p_ub = 0; 1462 if (p_st != NULL) 1463 *p_st = 0; 1464 } else { 1465 start = pr->u.p.lb; 1466 limit = chunk + init - 1; 1467 incr = pr->u.p.st; 1468 1469 if ((last = (limit >= trip)) != 0) 1470 limit = trip; 1471 1472 if (p_st != NULL) 1473 *p_st = incr; 1474 1475 if (incr == 1) { 1476 *p_lb = start + init; 1477 *p_ub = start + limit; 1478 } else { 1479 *p_lb = start + init * incr; 1480 *p_ub = start + limit * incr; 1481 } 1482 1483 if (pr->flags.ordered) { 1484 pr->u.p.ordered_lower = init; 1485 pr->u.p.ordered_upper = limit; 1486 } // if 1487 } // if 1488 } // case 1489 break; 1490 1491 case kmp_sch_guided_iterative_chunked: { 1492 T chunkspec = pr->u.p.parm1; 1493 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked " 1494 "iterative case\n", 1495 gtid)); 1496 trip = pr->u.p.tc; 1497 // Start atomic part of calculations 1498 while (1) { 1499 ST remaining; // signed, because can be < 0 1500 init = sh->u.s.iteration; // shared value 1501 remaining = trip - init; 1502 if (remaining <= 0) { // AC: need to compare with 0 first 1503 // nothing to do, don't try atomic op 1504 status = 0; 1505 break; 1506 } 1507 if ((T)remaining < 1508 pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default 1509 // use dynamic-style shcedule 1510 // atomically inrement iterations, get old value 1511 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1512 (ST)chunkspec); 1513 remaining = trip - init; 1514 if (remaining <= 0) { 1515 status = 0; // all iterations got by other threads 1516 } else { 1517 // got some iterations to work on 1518 status = 1; 1519 if ((T)remaining > chunkspec) { 1520 limit = init + chunkspec - 1; 1521 } else { 1522 last = 1; // the last chunk 1523 limit = init + remaining - 1; 1524 } // if 1525 } // if 1526 break; 1527 } // if 1528 limit = init + 1529 (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc 1530 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1531 (ST)init, (ST)limit)) { 1532 // CAS was successful, chunk obtained 1533 status = 1; 1534 --limit; 1535 break; 1536 } // if 1537 } // while 1538 if (status != 0) { 1539 start = pr->u.p.lb; 1540 incr = pr->u.p.st; 1541 if (p_st != NULL) 1542 *p_st = incr; 1543 *p_lb = start + init * incr; 1544 *p_ub = start + limit * incr; 1545 if (pr->flags.ordered) { 1546 pr->u.p.ordered_lower = init; 1547 pr->u.p.ordered_upper = limit; 1548 } // if 1549 } else { 1550 *p_lb = 0; 1551 *p_ub = 0; 1552 if (p_st != NULL) 1553 *p_st = 0; 1554 } // if 1555 } // case 1556 break; 1557 1558 #if OMP_45_ENABLED 1559 case kmp_sch_guided_simd: { 1560 // same as iterative but curr-chunk adjusted to be multiple of given 1561 // chunk 1562 T chunk = pr->u.p.parm1; 1563 KD_TRACE(100, 1564 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n", 1565 gtid)); 1566 trip = pr->u.p.tc; 1567 // Start atomic part of calculations 1568 while (1) { 1569 ST remaining; // signed, because can be < 0 1570 init = sh->u.s.iteration; // shared value 1571 remaining = trip - init; 1572 if (remaining <= 0) { // AC: need to compare with 0 first 1573 status = 0; // nothing to do, don't try atomic op 1574 break; 1575 } 1576 KMP_DEBUG_ASSERT(init % chunk == 0); 1577 // compare with K*nproc*(chunk+1), K=2 by default 1578 if ((T)remaining < pr->u.p.parm2) { 1579 // use dynamic-style shcedule 1580 // atomically inrement iterations, get old value 1581 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1582 (ST)chunk); 1583 remaining = trip - init; 1584 if (remaining <= 0) { 1585 status = 0; // all iterations got by other threads 1586 } else { 1587 // got some iterations to work on 1588 status = 1; 1589 if ((T)remaining > chunk) { 1590 limit = init + chunk - 1; 1591 } else { 1592 last = 1; // the last chunk 1593 limit = init + remaining - 1; 1594 } // if 1595 } // if 1596 break; 1597 } // if 1598 // divide by K*nproc 1599 UT span = remaining * (*(double *)&pr->u.p.parm3); 1600 UT rem = span % chunk; 1601 if (rem) // adjust so that span%chunk == 0 1602 span += chunk - rem; 1603 limit = init + span; 1604 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1605 (ST)init, (ST)limit)) { 1606 // CAS was successful, chunk obtained 1607 status = 1; 1608 --limit; 1609 break; 1610 } // if 1611 } // while 1612 if (status != 0) { 1613 start = pr->u.p.lb; 1614 incr = pr->u.p.st; 1615 if (p_st != NULL) 1616 *p_st = incr; 1617 *p_lb = start + init * incr; 1618 *p_ub = start + limit * incr; 1619 if (pr->flags.ordered) { 1620 pr->u.p.ordered_lower = init; 1621 pr->u.p.ordered_upper = limit; 1622 } // if 1623 } else { 1624 *p_lb = 0; 1625 *p_ub = 0; 1626 if (p_st != NULL) 1627 *p_st = 0; 1628 } // if 1629 } // case 1630 break; 1631 #endif // OMP_45_ENABLED 1632 1633 case kmp_sch_guided_analytical_chunked: { 1634 T chunkspec = pr->u.p.parm1; 1635 UT chunkIdx; 1636 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1637 /* for storing original FPCW value for Windows* OS on 1638 IA-32 architecture 8-byte version */ 1639 unsigned int oldFpcw; 1640 unsigned int fpcwSet = 0; 1641 #endif 1642 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d " 1643 "kmp_sch_guided_analytical_chunked case\n", 1644 gtid)); 1645 1646 trip = pr->u.p.tc; 1647 1648 KMP_DEBUG_ASSERT(nproc > 1); 1649 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip); 1650 1651 while (1) { /* this while loop is a safeguard against unexpected zero 1652 chunk sizes */ 1653 chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 1654 if (chunkIdx >= (UT)pr->u.p.parm2) { 1655 --trip; 1656 /* use dynamic-style scheduling */ 1657 init = chunkIdx * chunkspec + pr->u.p.count; 1658 /* need to verify init > 0 in case of overflow in the above 1659 * calculation */ 1660 if ((status = (init > 0 && init <= trip)) != 0) { 1661 limit = init + chunkspec - 1; 1662 1663 if ((last = (limit >= trip)) != 0) 1664 limit = trip; 1665 } 1666 break; 1667 } else { 1668 /* use exponential-style scheduling */ 1669 /* The following check is to workaround the lack of long double precision on 1670 Windows* OS. 1671 This check works around the possible effect that init != 0 for chunkIdx == 0. 1672 */ 1673 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1674 /* If we haven't already done so, save original 1675 FPCW and set precision to 64-bit, as Windows* OS 1676 on IA-32 architecture defaults to 53-bit */ 1677 if (!fpcwSet) { 1678 oldFpcw = _control87(0, 0); 1679 _control87(_PC_64, _MCW_PC); 1680 fpcwSet = 0x30000; 1681 } 1682 #endif 1683 if (chunkIdx) { 1684 init = __kmp_dispatch_guided_remaining<T>( 1685 trip, *(DBL *)&pr->u.p.parm3, chunkIdx); 1686 KMP_DEBUG_ASSERT(init); 1687 init = trip - init; 1688 } else 1689 init = 0; 1690 limit = trip - __kmp_dispatch_guided_remaining<T>( 1691 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1); 1692 KMP_ASSERT(init <= limit); 1693 if (init < limit) { 1694 KMP_DEBUG_ASSERT(limit <= trip); 1695 --limit; 1696 status = 1; 1697 break; 1698 } // if 1699 } // if 1700 } // while (1) 1701 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1702 /* restore FPCW if necessary 1703 AC: check fpcwSet flag first because oldFpcw can be uninitialized here 1704 */ 1705 if (fpcwSet && (oldFpcw & fpcwSet)) 1706 _control87(oldFpcw, _MCW_PC); 1707 #endif 1708 if (status != 0) { 1709 start = pr->u.p.lb; 1710 incr = pr->u.p.st; 1711 if (p_st != NULL) 1712 *p_st = incr; 1713 *p_lb = start + init * incr; 1714 *p_ub = start + limit * incr; 1715 if (pr->flags.ordered) { 1716 pr->u.p.ordered_lower = init; 1717 pr->u.p.ordered_upper = limit; 1718 } 1719 } else { 1720 *p_lb = 0; 1721 *p_ub = 0; 1722 if (p_st != NULL) 1723 *p_st = 0; 1724 } 1725 } // case 1726 break; 1727 1728 case kmp_sch_trapezoidal: { 1729 UT index; 1730 T parm2 = pr->u.p.parm2; 1731 T parm3 = pr->u.p.parm3; 1732 T parm4 = pr->u.p.parm4; 1733 KD_TRACE(100, 1734 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n", 1735 gtid)); 1736 1737 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration); 1738 1739 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2; 1740 trip = pr->u.p.tc - 1; 1741 1742 if ((status = ((T)index < parm3 && init <= trip)) == 0) { 1743 *p_lb = 0; 1744 *p_ub = 0; 1745 if (p_st != NULL) 1746 *p_st = 0; 1747 } else { 1748 start = pr->u.p.lb; 1749 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1; 1750 incr = pr->u.p.st; 1751 1752 if ((last = (limit >= trip)) != 0) 1753 limit = trip; 1754 1755 if (p_st != NULL) 1756 *p_st = incr; 1757 1758 if (incr == 1) { 1759 *p_lb = start + init; 1760 *p_ub = start + limit; 1761 } else { 1762 *p_lb = start + init * incr; 1763 *p_ub = start + limit * incr; 1764 } 1765 1766 if (pr->flags.ordered) { 1767 pr->u.p.ordered_lower = init; 1768 pr->u.p.ordered_upper = limit; 1769 } // if 1770 } // if 1771 } // case 1772 break; 1773 default: { 1774 status = 0; // to avoid complaints on uninitialized variable use 1775 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 1776 KMP_HNT(GetNewerLibrary), // Hint 1777 __kmp_msg_null // Variadic argument list terminator 1778 ); 1779 } break; 1780 } // switch 1781 if (p_last) 1782 *p_last = last; 1783 #ifdef KMP_DEBUG 1784 if (pr->flags.ordered) { 1785 char *buff; 1786 // create format specifiers before the debug output 1787 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d " 1788 "ordered_lower:%%%s ordered_upper:%%%s\n", 1789 traits_t<UT>::spec, traits_t<UT>::spec); 1790 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper)); 1791 __kmp_str_free(&buff); 1792 } 1793 { 1794 char *buff; 1795 // create format specifiers before the debug output 1796 buff = __kmp_str_format( 1797 "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d " 1798 "p_lb:%%%s p_ub:%%%s p_st:%%%s\n", 1799 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 1800 KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st)); 1801 __kmp_str_free(&buff); 1802 } 1803 #endif 1804 return status; 1805 } 1806 1807 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more 1808 work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini() 1809 is not called. */ 1810 #if OMPT_SUPPORT && OMPT_OPTIONAL 1811 #define OMPT_LOOP_END \ 1812 if (status == 0) { \ 1813 if (ompt_enabled.ompt_callback_work) { \ 1814 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \ 1815 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \ 1816 ompt_callbacks.ompt_callback(ompt_callback_work)( \ 1817 ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \ 1818 &(task_info->task_data), 0, codeptr); \ 1819 } \ 1820 } 1821 // TODO: implement count 1822 #else 1823 #define OMPT_LOOP_END // no-op 1824 #endif 1825 1826 template <typename T> 1827 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, 1828 T *p_lb, T *p_ub, 1829 typename traits_t<T>::signed_t *p_st 1830 #if OMPT_SUPPORT && OMPT_OPTIONAL 1831 , 1832 void *codeptr 1833 #endif 1834 ) { 1835 1836 typedef typename traits_t<T>::unsigned_t UT; 1837 typedef typename traits_t<T>::signed_t ST; 1838 typedef typename traits_t<T>::floating_t DBL; 1839 // This is potentially slightly misleading, schedule(runtime) will appear here 1840 // even if the actual runtme schedule is static. (Which points out a 1841 // disadavantage of schedule(runtime): even when static scheduling is used it 1842 // costs more than a compile time choice to use static scheduling would.) 1843 KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling); 1844 1845 int status; 1846 dispatch_private_info_template<T> *pr; 1847 kmp_info_t *th = __kmp_threads[gtid]; 1848 kmp_team_t *team = th->th.th_team; 1849 1850 KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL 1851 KD_TRACE( 1852 1000, 1853 ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n", 1854 gtid, p_lb, p_ub, p_st, p_last)); 1855 1856 if (team->t.t_serialized) { 1857 /* NOTE: serialize this dispatch becase we are not at the active level */ 1858 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 1859 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 1860 KMP_DEBUG_ASSERT(pr); 1861 1862 if ((status = (pr->u.p.tc != 0)) == 0) { 1863 *p_lb = 0; 1864 *p_ub = 0; 1865 // if ( p_last != NULL ) 1866 // *p_last = 0; 1867 if (p_st != NULL) 1868 *p_st = 0; 1869 if (__kmp_env_consistency_check) { 1870 if (pr->pushed_ws != ct_none) { 1871 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 1872 } 1873 } 1874 } else if (pr->flags.nomerge) { 1875 kmp_int32 last; 1876 T start; 1877 UT limit, trip, init; 1878 ST incr; 1879 T chunk = pr->u.p.parm1; 1880 1881 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", 1882 gtid)); 1883 1884 init = chunk * pr->u.p.count++; 1885 trip = pr->u.p.tc - 1; 1886 1887 if ((status = (init <= trip)) == 0) { 1888 *p_lb = 0; 1889 *p_ub = 0; 1890 // if ( p_last != NULL ) 1891 // *p_last = 0; 1892 if (p_st != NULL) 1893 *p_st = 0; 1894 if (__kmp_env_consistency_check) { 1895 if (pr->pushed_ws != ct_none) { 1896 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 1897 } 1898 } 1899 } else { 1900 start = pr->u.p.lb; 1901 limit = chunk + init - 1; 1902 incr = pr->u.p.st; 1903 1904 if ((last = (limit >= trip)) != 0) { 1905 limit = trip; 1906 #if KMP_OS_WINDOWS 1907 pr->u.p.last_upper = pr->u.p.ub; 1908 #endif /* KMP_OS_WINDOWS */ 1909 } 1910 if (p_last != NULL) 1911 *p_last = last; 1912 if (p_st != NULL) 1913 *p_st = incr; 1914 if (incr == 1) { 1915 *p_lb = start + init; 1916 *p_ub = start + limit; 1917 } else { 1918 *p_lb = start + init * incr; 1919 *p_ub = start + limit * incr; 1920 } 1921 1922 if (pr->flags.ordered) { 1923 pr->u.p.ordered_lower = init; 1924 pr->u.p.ordered_upper = limit; 1925 #ifdef KMP_DEBUG 1926 { 1927 char *buff; 1928 // create format specifiers before the debug output 1929 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 1930 "ordered_lower:%%%s ordered_upper:%%%s\n", 1931 traits_t<UT>::spec, traits_t<UT>::spec); 1932 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 1933 pr->u.p.ordered_upper)); 1934 __kmp_str_free(&buff); 1935 } 1936 #endif 1937 } // if 1938 } // if 1939 } else { 1940 pr->u.p.tc = 0; 1941 *p_lb = pr->u.p.lb; 1942 *p_ub = pr->u.p.ub; 1943 #if KMP_OS_WINDOWS 1944 pr->u.p.last_upper = *p_ub; 1945 #endif /* KMP_OS_WINDOWS */ 1946 if (p_last != NULL) 1947 *p_last = TRUE; 1948 if (p_st != NULL) 1949 *p_st = pr->u.p.st; 1950 } // if 1951 #ifdef KMP_DEBUG 1952 { 1953 char *buff; 1954 // create format specifiers before the debug output 1955 buff = __kmp_str_format( 1956 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " 1957 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", 1958 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 1959 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status)); 1960 __kmp_str_free(&buff); 1961 } 1962 #endif 1963 #if INCLUDE_SSC_MARKS 1964 SSC_MARK_DISPATCH_NEXT(); 1965 #endif 1966 OMPT_LOOP_END; 1967 return status; 1968 } else { 1969 kmp_int32 last = 0; 1970 dispatch_shared_info_template<T> volatile *sh; 1971 1972 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1973 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1974 1975 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 1976 th->th.th_dispatch->th_dispatch_pr_current); 1977 KMP_DEBUG_ASSERT(pr); 1978 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 1979 th->th.th_dispatch->th_dispatch_sh_current); 1980 KMP_DEBUG_ASSERT(sh); 1981 1982 #if KMP_USE_HIER_SCHED 1983 if (pr->flags.use_hier) 1984 status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st); 1985 else 1986 #endif // KMP_USE_HIER_SCHED 1987 status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub, 1988 p_st, th->th.th_team_nproc, 1989 th->th.th_info.ds.ds_tid); 1990 // status == 0: no more iterations to execute 1991 if (status == 0) { 1992 UT num_done; 1993 1994 num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done); 1995 #ifdef KMP_DEBUG 1996 { 1997 char *buff; 1998 // create format specifiers before the debug output 1999 buff = __kmp_str_format( 2000 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", 2001 traits_t<UT>::spec); 2002 KD_TRACE(10, (buff, gtid, sh->u.s.num_done)); 2003 __kmp_str_free(&buff); 2004 } 2005 #endif 2006 2007 #if KMP_USE_HIER_SCHED 2008 pr->flags.use_hier = FALSE; 2009 #endif 2010 if ((ST)num_done == th->th.th_team_nproc - 1) { 2011 #if (KMP_STATIC_STEAL_ENABLED) 2012 if (pr->schedule == kmp_sch_static_steal && 2013 traits_t<T>::type_size > 4) { 2014 int i; 2015 kmp_info_t **other_threads = team->t.t_threads; 2016 // loop complete, safe to destroy locks used for stealing 2017 for (i = 0; i < th->th.th_team_nproc; ++i) { 2018 kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock; 2019 KMP_ASSERT(lck != NULL); 2020 __kmp_destroy_lock(lck); 2021 __kmp_free(lck); 2022 other_threads[i]->th.th_dispatch->th_steal_lock = NULL; 2023 } 2024 } 2025 #endif 2026 /* NOTE: release this buffer to be reused */ 2027 2028 KMP_MB(); /* Flush all pending memory write invalidates. */ 2029 2030 sh->u.s.num_done = 0; 2031 sh->u.s.iteration = 0; 2032 2033 /* TODO replace with general release procedure? */ 2034 if (pr->flags.ordered) { 2035 sh->u.s.ordered_iteration = 0; 2036 } 2037 2038 KMP_MB(); /* Flush all pending memory write invalidates. */ 2039 2040 sh->buffer_index += __kmp_dispatch_num_buffers; 2041 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", 2042 gtid, sh->buffer_index)); 2043 2044 KMP_MB(); /* Flush all pending memory write invalidates. */ 2045 2046 } // if 2047 if (__kmp_env_consistency_check) { 2048 if (pr->pushed_ws != ct_none) { 2049 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 2050 } 2051 } 2052 2053 th->th.th_dispatch->th_deo_fcn = NULL; 2054 th->th.th_dispatch->th_dxo_fcn = NULL; 2055 th->th.th_dispatch->th_dispatch_sh_current = NULL; 2056 th->th.th_dispatch->th_dispatch_pr_current = NULL; 2057 } // if (status == 0) 2058 #if KMP_OS_WINDOWS 2059 else if (last) { 2060 pr->u.p.last_upper = pr->u.p.ub; 2061 } 2062 #endif /* KMP_OS_WINDOWS */ 2063 if (p_last != NULL && status != 0) 2064 *p_last = last; 2065 } // if 2066 2067 #ifdef KMP_DEBUG 2068 { 2069 char *buff; 2070 // create format specifiers before the debug output 2071 buff = __kmp_str_format( 2072 "__kmp_dispatch_next: T#%%d normal case: " 2073 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n", 2074 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 2075 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, 2076 (p_last ? *p_last : 0), status)); 2077 __kmp_str_free(&buff); 2078 } 2079 #endif 2080 #if INCLUDE_SSC_MARKS 2081 SSC_MARK_DISPATCH_NEXT(); 2082 #endif 2083 OMPT_LOOP_END; 2084 return status; 2085 } 2086 2087 template <typename T> 2088 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid, 2089 kmp_int32 *plastiter, T *plower, T *pupper, 2090 typename traits_t<T>::signed_t incr) { 2091 typedef typename traits_t<T>::unsigned_t UT; 2092 typedef typename traits_t<T>::signed_t ST; 2093 kmp_uint32 team_id; 2094 kmp_uint32 nteams; 2095 UT trip_count; 2096 kmp_team_t *team; 2097 kmp_info_t *th; 2098 2099 KMP_DEBUG_ASSERT(plastiter && plower && pupper); 2100 KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid)); 2101 #ifdef KMP_DEBUG 2102 { 2103 char *buff; 2104 // create format specifiers before the debug output 2105 buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d " 2106 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n", 2107 traits_t<T>::spec, traits_t<T>::spec, 2108 traits_t<ST>::spec, traits_t<T>::spec); 2109 KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr)); 2110 __kmp_str_free(&buff); 2111 } 2112 #endif 2113 2114 if (__kmp_env_consistency_check) { 2115 if (incr == 0) { 2116 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, 2117 loc); 2118 } 2119 if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) { 2120 // The loop is illegal. 2121 // Some zero-trip loops maintained by compiler, e.g.: 2122 // for(i=10;i<0;++i) // lower >= upper - run-time check 2123 // for(i=0;i>10;--i) // lower <= upper - run-time check 2124 // for(i=0;i>10;++i) // incr > 0 - compile-time check 2125 // for(i=10;i<0;--i) // incr < 0 - compile-time check 2126 // Compiler does not check the following illegal loops: 2127 // for(i=0;i<10;i+=incr) // where incr<0 2128 // for(i=10;i>0;i-=incr) // where incr<0 2129 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc); 2130 } 2131 } 2132 th = __kmp_threads[gtid]; 2133 team = th->th.th_team; 2134 #if OMP_40_ENABLED 2135 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct 2136 nteams = th->th.th_teams_size.nteams; 2137 #endif 2138 team_id = team->t.t_master_tid; 2139 KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc); 2140 2141 // compute global trip count 2142 if (incr == 1) { 2143 trip_count = *pupper - *plower + 1; 2144 } else if (incr == -1) { 2145 trip_count = *plower - *pupper + 1; 2146 } else if (incr > 0) { 2147 // upper-lower can exceed the limit of signed type 2148 trip_count = (UT)(*pupper - *plower) / incr + 1; 2149 } else { 2150 trip_count = (UT)(*plower - *pupper) / (-incr) + 1; 2151 } 2152 2153 if (trip_count <= nteams) { 2154 KMP_DEBUG_ASSERT( 2155 __kmp_static == kmp_sch_static_greedy || 2156 __kmp_static == 2157 kmp_sch_static_balanced); // Unknown static scheduling type. 2158 // only some teams get single iteration, others get nothing 2159 if (team_id < trip_count) { 2160 *pupper = *plower = *plower + team_id * incr; 2161 } else { 2162 *plower = *pupper + incr; // zero-trip loop 2163 } 2164 if (plastiter != NULL) 2165 *plastiter = (team_id == trip_count - 1); 2166 } else { 2167 if (__kmp_static == kmp_sch_static_balanced) { 2168 UT chunk = trip_count / nteams; 2169 UT extras = trip_count % nteams; 2170 *plower += 2171 incr * (team_id * chunk + (team_id < extras ? team_id : extras)); 2172 *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr); 2173 if (plastiter != NULL) 2174 *plastiter = (team_id == nteams - 1); 2175 } else { 2176 T chunk_inc_count = 2177 (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr; 2178 T upper = *pupper; 2179 KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy); 2180 // Unknown static scheduling type. 2181 *plower += team_id * chunk_inc_count; 2182 *pupper = *plower + chunk_inc_count - incr; 2183 // Check/correct bounds if needed 2184 if (incr > 0) { 2185 if (*pupper < *plower) 2186 *pupper = traits_t<T>::max_value; 2187 if (plastiter != NULL) 2188 *plastiter = *plower <= upper && *pupper > upper - incr; 2189 if (*pupper > upper) 2190 *pupper = upper; // tracker C73258 2191 } else { 2192 if (*pupper > *plower) 2193 *pupper = traits_t<T>::min_value; 2194 if (plastiter != NULL) 2195 *plastiter = *plower >= upper && *pupper < upper - incr; 2196 if (*pupper < upper) 2197 *pupper = upper; // tracker C73258 2198 } 2199 } 2200 } 2201 } 2202 2203 //----------------------------------------------------------------------------- 2204 // Dispatch routines 2205 // Transfer call to template< type T > 2206 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule, 2207 // T lb, T ub, ST st, ST chunk ) 2208 extern "C" { 2209 2210 /*! 2211 @ingroup WORK_SHARING 2212 @{ 2213 @param loc Source location 2214 @param gtid Global thread id 2215 @param schedule Schedule type 2216 @param lb Lower bound 2217 @param ub Upper bound 2218 @param st Step (or increment if you prefer) 2219 @param chunk The chunk size to block with 2220 2221 This function prepares the runtime to start a dynamically scheduled for loop, 2222 saving the loop arguments. 2223 These functions are all identical apart from the types of the arguments. 2224 */ 2225 2226 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2227 enum sched_type schedule, kmp_int32 lb, 2228 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) { 2229 KMP_DEBUG_ASSERT(__kmp_init_serial); 2230 #if OMPT_SUPPORT && OMPT_OPTIONAL 2231 OMPT_STORE_RETURN_ADDRESS(gtid); 2232 #endif 2233 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2234 } 2235 /*! 2236 See @ref __kmpc_dispatch_init_4 2237 */ 2238 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2239 enum sched_type schedule, kmp_uint32 lb, 2240 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) { 2241 KMP_DEBUG_ASSERT(__kmp_init_serial); 2242 #if OMPT_SUPPORT && OMPT_OPTIONAL 2243 OMPT_STORE_RETURN_ADDRESS(gtid); 2244 #endif 2245 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2246 } 2247 2248 /*! 2249 See @ref __kmpc_dispatch_init_4 2250 */ 2251 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2252 enum sched_type schedule, kmp_int64 lb, 2253 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) { 2254 KMP_DEBUG_ASSERT(__kmp_init_serial); 2255 #if OMPT_SUPPORT && OMPT_OPTIONAL 2256 OMPT_STORE_RETURN_ADDRESS(gtid); 2257 #endif 2258 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2259 } 2260 2261 /*! 2262 See @ref __kmpc_dispatch_init_4 2263 */ 2264 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2265 enum sched_type schedule, kmp_uint64 lb, 2266 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) { 2267 KMP_DEBUG_ASSERT(__kmp_init_serial); 2268 #if OMPT_SUPPORT && OMPT_OPTIONAL 2269 OMPT_STORE_RETURN_ADDRESS(gtid); 2270 #endif 2271 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2272 } 2273 2274 /*! 2275 See @ref __kmpc_dispatch_init_4 2276 2277 Difference from __kmpc_dispatch_init set of functions is these functions 2278 are called for composite distribute parallel for construct. Thus before 2279 regular iterations dispatching we need to calc per-team iteration space. 2280 2281 These functions are all identical apart from the types of the arguments. 2282 */ 2283 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2284 enum sched_type schedule, kmp_int32 *p_last, 2285 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, 2286 kmp_int32 chunk) { 2287 KMP_DEBUG_ASSERT(__kmp_init_serial); 2288 #if OMPT_SUPPORT && OMPT_OPTIONAL 2289 OMPT_STORE_RETURN_ADDRESS(gtid); 2290 #endif 2291 __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st); 2292 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2293 } 2294 2295 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2296 enum sched_type schedule, kmp_int32 *p_last, 2297 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, 2298 kmp_int32 chunk) { 2299 KMP_DEBUG_ASSERT(__kmp_init_serial); 2300 #if OMPT_SUPPORT && OMPT_OPTIONAL 2301 OMPT_STORE_RETURN_ADDRESS(gtid); 2302 #endif 2303 __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st); 2304 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2305 } 2306 2307 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2308 enum sched_type schedule, kmp_int32 *p_last, 2309 kmp_int64 lb, kmp_int64 ub, kmp_int64 st, 2310 kmp_int64 chunk) { 2311 KMP_DEBUG_ASSERT(__kmp_init_serial); 2312 #if OMPT_SUPPORT && OMPT_OPTIONAL 2313 OMPT_STORE_RETURN_ADDRESS(gtid); 2314 #endif 2315 __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st); 2316 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2317 } 2318 2319 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2320 enum sched_type schedule, kmp_int32 *p_last, 2321 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, 2322 kmp_int64 chunk) { 2323 KMP_DEBUG_ASSERT(__kmp_init_serial); 2324 #if OMPT_SUPPORT && OMPT_OPTIONAL 2325 OMPT_STORE_RETURN_ADDRESS(gtid); 2326 #endif 2327 __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st); 2328 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2329 } 2330 2331 /*! 2332 @param loc Source code location 2333 @param gtid Global thread id 2334 @param p_last Pointer to a flag set to one if this is the last chunk or zero 2335 otherwise 2336 @param p_lb Pointer to the lower bound for the next chunk of work 2337 @param p_ub Pointer to the upper bound for the next chunk of work 2338 @param p_st Pointer to the stride for the next chunk of work 2339 @return one if there is work to be done, zero otherwise 2340 2341 Get the next dynamically allocated chunk of work for this thread. 2342 If there is no more work, then the lb,ub and stride need not be modified. 2343 */ 2344 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2345 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) { 2346 #if OMPT_SUPPORT && OMPT_OPTIONAL 2347 OMPT_STORE_RETURN_ADDRESS(gtid); 2348 #endif 2349 return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st 2350 #if OMPT_SUPPORT && OMPT_OPTIONAL 2351 , 2352 OMPT_LOAD_RETURN_ADDRESS(gtid) 2353 #endif 2354 ); 2355 } 2356 2357 /*! 2358 See @ref __kmpc_dispatch_next_4 2359 */ 2360 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2361 kmp_uint32 *p_lb, kmp_uint32 *p_ub, 2362 kmp_int32 *p_st) { 2363 #if OMPT_SUPPORT && OMPT_OPTIONAL 2364 OMPT_STORE_RETURN_ADDRESS(gtid); 2365 #endif 2366 return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st 2367 #if OMPT_SUPPORT && OMPT_OPTIONAL 2368 , 2369 OMPT_LOAD_RETURN_ADDRESS(gtid) 2370 #endif 2371 ); 2372 } 2373 2374 /*! 2375 See @ref __kmpc_dispatch_next_4 2376 */ 2377 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2378 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) { 2379 #if OMPT_SUPPORT && OMPT_OPTIONAL 2380 OMPT_STORE_RETURN_ADDRESS(gtid); 2381 #endif 2382 return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st 2383 #if OMPT_SUPPORT && OMPT_OPTIONAL 2384 , 2385 OMPT_LOAD_RETURN_ADDRESS(gtid) 2386 #endif 2387 ); 2388 } 2389 2390 /*! 2391 See @ref __kmpc_dispatch_next_4 2392 */ 2393 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2394 kmp_uint64 *p_lb, kmp_uint64 *p_ub, 2395 kmp_int64 *p_st) { 2396 #if OMPT_SUPPORT && OMPT_OPTIONAL 2397 OMPT_STORE_RETURN_ADDRESS(gtid); 2398 #endif 2399 return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st 2400 #if OMPT_SUPPORT && OMPT_OPTIONAL 2401 , 2402 OMPT_LOAD_RETURN_ADDRESS(gtid) 2403 #endif 2404 ); 2405 } 2406 2407 /*! 2408 @param loc Source code location 2409 @param gtid Global thread id 2410 2411 Mark the end of a dynamic loop. 2412 */ 2413 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) { 2414 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2415 } 2416 2417 /*! 2418 See @ref __kmpc_dispatch_fini_4 2419 */ 2420 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) { 2421 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2422 } 2423 2424 /*! 2425 See @ref __kmpc_dispatch_fini_4 2426 */ 2427 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) { 2428 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2429 } 2430 2431 /*! 2432 See @ref __kmpc_dispatch_fini_4 2433 */ 2434 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) { 2435 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2436 } 2437 /*! @} */ 2438 2439 //----------------------------------------------------------------------------- 2440 // Non-template routines from kmp_dispatch.cpp used in other sources 2441 2442 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) { 2443 return value == checker; 2444 } 2445 2446 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) { 2447 return value != checker; 2448 } 2449 2450 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) { 2451 return value < checker; 2452 } 2453 2454 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) { 2455 return value >= checker; 2456 } 2457 2458 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) { 2459 return value <= checker; 2460 } 2461 2462 kmp_uint32 2463 __kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker, 2464 kmp_uint32 (*pred)(kmp_uint32, kmp_uint32), 2465 void *obj // Higher-level synchronization object, or NULL. 2466 ) { 2467 // note: we may not belong to a team at this point 2468 volatile kmp_uint32 *spin = spinner; 2469 kmp_uint32 check = checker; 2470 kmp_uint32 spins; 2471 kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred; 2472 kmp_uint32 r; 2473 2474 KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin)); 2475 KMP_INIT_YIELD(spins); 2476 // main wait spin loop 2477 while (!f(r = TCR_4(*spin), check)) { 2478 KMP_FSYNC_SPIN_PREPARE(obj); 2479 /* GEH - remove this since it was accidentally introduced when kmp_wait was 2480 split. It causes problems with infinite recursion because of exit lock */ 2481 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 2482 __kmp_abort_thread(); */ 2483 2484 /* if we have waited a bit, or are oversubscribed, yield */ 2485 /* pause is in the following code */ 2486 KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc); 2487 KMP_YIELD_SPIN(spins); 2488 } 2489 KMP_FSYNC_SPIN_ACQUIRED(obj); 2490 return r; 2491 } 2492 2493 void __kmp_wait_yield_4_ptr( 2494 void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32), 2495 void *obj // Higher-level synchronization object, or NULL. 2496 ) { 2497 // note: we may not belong to a team at this point 2498 void *spin = spinner; 2499 kmp_uint32 check = checker; 2500 kmp_uint32 spins; 2501 kmp_uint32 (*f)(void *, kmp_uint32) = pred; 2502 2503 KMP_FSYNC_SPIN_INIT(obj, spin); 2504 KMP_INIT_YIELD(spins); 2505 // main wait spin loop 2506 while (!f(spin, check)) { 2507 KMP_FSYNC_SPIN_PREPARE(obj); 2508 /* if we have waited a bit, or are oversubscribed, yield */ 2509 /* pause is in the following code */ 2510 KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc); 2511 KMP_YIELD_SPIN(spins); 2512 } 2513 KMP_FSYNC_SPIN_ACQUIRED(obj); 2514 } 2515 2516 } // extern "C" 2517 2518 #ifdef KMP_GOMP_COMPAT 2519 2520 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2521 enum sched_type schedule, kmp_int32 lb, 2522 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk, 2523 int push_ws) { 2524 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, 2525 push_ws); 2526 } 2527 2528 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2529 enum sched_type schedule, kmp_uint32 lb, 2530 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk, 2531 int push_ws) { 2532 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, 2533 push_ws); 2534 } 2535 2536 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2537 enum sched_type schedule, kmp_int64 lb, 2538 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk, 2539 int push_ws) { 2540 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, 2541 push_ws); 2542 } 2543 2544 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2545 enum sched_type schedule, kmp_uint64 lb, 2546 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk, 2547 int push_ws) { 2548 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, 2549 push_ws); 2550 } 2551 2552 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) { 2553 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc); 2554 } 2555 2556 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) { 2557 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc); 2558 } 2559 2560 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) { 2561 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc); 2562 } 2563 2564 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) { 2565 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc); 2566 } 2567 2568 #endif /* KMP_GOMP_COMPAT */ 2569 2570 /* ------------------------------------------------------------------------ */ 2571