1 /* 2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch. 3 */ 4 5 //===----------------------------------------------------------------------===// 6 // 7 // The LLVM Compiler Infrastructure 8 // 9 // This file is dual licensed under the MIT and the University of Illinois Open 10 // Source Licenses. See LICENSE.txt for details. 11 // 12 //===----------------------------------------------------------------------===// 13 14 /* Dynamic scheduling initialization and dispatch. 15 * 16 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however 17 * it may change values between parallel regions. __kmp_max_nth 18 * is the largest value __kmp_nth may take, 1 is the smallest. 19 */ 20 21 #include "kmp.h" 22 #include "kmp_error.h" 23 #include "kmp_i18n.h" 24 #include "kmp_itt.h" 25 #include "kmp_stats.h" 26 #include "kmp_str.h" 27 #if KMP_USE_X87CONTROL 28 #include <float.h> 29 #endif 30 #include "kmp_lock.h" 31 #include "kmp_dispatch.h" 32 #if KMP_USE_HIER_SCHED 33 #include "kmp_dispatch_hier.h" 34 #endif 35 36 #if OMPT_SUPPORT 37 #include "ompt-specific.h" 38 #endif 39 40 /* ------------------------------------------------------------------------ */ 41 /* ------------------------------------------------------------------------ */ 42 43 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 44 kmp_info_t *th; 45 46 KMP_DEBUG_ASSERT(gtid_ref); 47 48 if (__kmp_env_consistency_check) { 49 th = __kmp_threads[*gtid_ref]; 50 if (th->th.th_root->r.r_active && 51 (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) { 52 #if KMP_USE_DYNAMIC_LOCK 53 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0); 54 #else 55 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL); 56 #endif 57 } 58 } 59 } 60 61 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 62 kmp_info_t *th; 63 64 if (__kmp_env_consistency_check) { 65 th = __kmp_threads[*gtid_ref]; 66 if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) { 67 __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref); 68 } 69 } 70 } 71 72 // Initialize a dispatch_private_info_template<T> buffer for a particular 73 // type of schedule,chunk. The loop description is found in lb (lower bound), 74 // ub (upper bound), and st (stride). nproc is the number of threads relevant 75 // to the scheduling (often the number of threads in a team, but not always if 76 // hierarchical scheduling is used). tid is the id of the thread calling 77 // the function within the group of nproc threads. It will have a value 78 // between 0 and nproc - 1. This is often just the thread id within a team, but 79 // is not necessarily the case when using hierarchical scheduling. 80 // loc is the source file location of the corresponding loop 81 // gtid is the global thread id 82 template <typename T> 83 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, 84 dispatch_private_info_template<T> *pr, 85 enum sched_type schedule, T lb, T ub, 86 typename traits_t<T>::signed_t st, 87 #if USE_ITT_BUILD 88 kmp_uint64 *cur_chunk, 89 #endif 90 typename traits_t<T>::signed_t chunk, 91 T nproc, T tid) { 92 typedef typename traits_t<T>::unsigned_t UT; 93 typedef typename traits_t<T>::floating_t DBL; 94 95 int active; 96 T tc; 97 kmp_info_t *th; 98 kmp_team_t *team; 99 100 #ifdef KMP_DEBUG 101 typedef typename traits_t<T>::signed_t ST; 102 { 103 char *buff; 104 // create format specifiers before the debug output 105 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called " 106 "pr:%%p lb:%%%s ub:%%%s st:%%%s " 107 "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n", 108 traits_t<T>::spec, traits_t<T>::spec, 109 traits_t<ST>::spec, traits_t<ST>::spec, 110 traits_t<T>::spec, traits_t<T>::spec); 111 KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid)); 112 __kmp_str_free(&buff); 113 } 114 #endif 115 /* setup data */ 116 th = __kmp_threads[gtid]; 117 team = th->th.th_team; 118 active = !team->t.t_serialized; 119 120 #if USE_ITT_BUILD 121 int itt_need_metadata_reporting = __itt_metadata_add_ptr && 122 __kmp_forkjoin_frames_mode == 3 && 123 KMP_MASTER_GTID(gtid) && 124 #if OMP_40_ENABLED 125 th->th.th_teams_microtask == NULL && 126 #endif 127 team->t.t_active_level == 1; 128 #endif 129 #if (KMP_STATIC_STEAL_ENABLED) 130 if (SCHEDULE_HAS_NONMONOTONIC(schedule)) 131 // AC: we now have only one implementation of stealing, so use it 132 schedule = kmp_sch_static_steal; 133 else 134 #endif 135 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 136 137 /* Pick up the nomerge/ordered bits from the scheduling type */ 138 if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) { 139 pr->flags.nomerge = TRUE; 140 schedule = 141 (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); 142 } else { 143 pr->flags.nomerge = FALSE; 144 } 145 pr->type_size = traits_t<T>::type_size; // remember the size of variables 146 if (kmp_ord_lower & schedule) { 147 pr->flags.ordered = TRUE; 148 schedule = 149 (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); 150 } else { 151 pr->flags.ordered = FALSE; 152 } 153 154 if (schedule == kmp_sch_static) { 155 schedule = __kmp_static; 156 } else { 157 if (schedule == kmp_sch_runtime) { 158 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if 159 // not specified) 160 schedule = team->t.t_sched.r_sched_type; 161 // Detail the schedule if needed (global controls are differentiated 162 // appropriately) 163 if (schedule == kmp_sch_guided_chunked) { 164 schedule = __kmp_guided; 165 } else if (schedule == kmp_sch_static) { 166 schedule = __kmp_static; 167 } 168 // Use the chunk size specified by OMP_SCHEDULE (or default if not 169 // specified) 170 chunk = team->t.t_sched.chunk; 171 #if USE_ITT_BUILD 172 if (cur_chunk) 173 *cur_chunk = chunk; 174 #endif 175 #ifdef KMP_DEBUG 176 { 177 char *buff; 178 // create format specifiers before the debug output 179 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: " 180 "schedule:%%d chunk:%%%s\n", 181 traits_t<ST>::spec); 182 KD_TRACE(10, (buff, gtid, schedule, chunk)); 183 __kmp_str_free(&buff); 184 } 185 #endif 186 } else { 187 if (schedule == kmp_sch_guided_chunked) { 188 schedule = __kmp_guided; 189 } 190 if (chunk <= 0) { 191 chunk = KMP_DEFAULT_CHUNK; 192 } 193 } 194 195 if (schedule == kmp_sch_auto) { 196 // mapping and differentiation: in the __kmp_do_serial_initialize() 197 schedule = __kmp_auto; 198 #ifdef KMP_DEBUG 199 { 200 char *buff; 201 // create format specifiers before the debug output 202 buff = __kmp_str_format( 203 "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: " 204 "schedule:%%d chunk:%%%s\n", 205 traits_t<ST>::spec); 206 KD_TRACE(10, (buff, gtid, schedule, chunk)); 207 __kmp_str_free(&buff); 208 } 209 #endif 210 } 211 212 /* guided analytical not safe for too many threads */ 213 if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) { 214 schedule = kmp_sch_guided_iterative_chunked; 215 KMP_WARNING(DispatchManyThreads); 216 } 217 #if OMP_45_ENABLED 218 if (schedule == kmp_sch_runtime_simd) { 219 // compiler provides simd_width in the chunk parameter 220 schedule = team->t.t_sched.r_sched_type; 221 // Detail the schedule if needed (global controls are differentiated 222 // appropriately) 223 if (schedule == kmp_sch_static || schedule == kmp_sch_auto || 224 schedule == __kmp_static) { 225 schedule = kmp_sch_static_balanced_chunked; 226 } else { 227 if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) { 228 schedule = kmp_sch_guided_simd; 229 } 230 chunk = team->t.t_sched.chunk * chunk; 231 } 232 #if USE_ITT_BUILD 233 if (cur_chunk) 234 *cur_chunk = chunk; 235 #endif 236 #ifdef KMP_DEBUG 237 { 238 char *buff; 239 // create format specifiers before the debug output 240 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d" 241 " chunk:%%%s\n", 242 traits_t<ST>::spec); 243 KD_TRACE(10, (buff, gtid, schedule, chunk)); 244 __kmp_str_free(&buff); 245 } 246 #endif 247 } 248 #endif // OMP_45_ENABLED 249 pr->u.p.parm1 = chunk; 250 } 251 KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper), 252 "unknown scheduling type"); 253 254 pr->u.p.count = 0; 255 256 if (__kmp_env_consistency_check) { 257 if (st == 0) { 258 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, 259 (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc); 260 } 261 } 262 // compute trip count 263 if (st == 1) { // most common case 264 if (ub >= lb) { 265 tc = ub - lb + 1; 266 } else { // ub < lb 267 tc = 0; // zero-trip 268 } 269 } else if (st < 0) { 270 if (lb >= ub) { 271 // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B), 272 // where the division needs to be unsigned regardless of the result type 273 tc = (UT)(lb - ub) / (-st) + 1; 274 } else { // lb < ub 275 tc = 0; // zero-trip 276 } 277 } else { // st > 0 278 if (ub >= lb) { 279 // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B), 280 // where the division needs to be unsigned regardless of the result type 281 tc = (UT)(ub - lb) / st + 1; 282 } else { // ub < lb 283 tc = 0; // zero-trip 284 } 285 } 286 287 pr->u.p.lb = lb; 288 pr->u.p.ub = ub; 289 pr->u.p.st = st; 290 pr->u.p.tc = tc; 291 292 #if KMP_OS_WINDOWS 293 pr->u.p.last_upper = ub + st; 294 #endif /* KMP_OS_WINDOWS */ 295 296 /* NOTE: only the active parallel region(s) has active ordered sections */ 297 298 if (active) { 299 if (pr->flags.ordered) { 300 pr->ordered_bumped = 0; 301 pr->u.p.ordered_lower = 1; 302 pr->u.p.ordered_upper = 0; 303 } 304 } 305 306 switch (schedule) { 307 #if (KMP_STATIC_STEAL_ENABLED) 308 case kmp_sch_static_steal: { 309 T ntc, init; 310 311 KD_TRACE(100, 312 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n", 313 gtid)); 314 315 ntc = (tc % chunk ? 1 : 0) + tc / chunk; 316 if (nproc > 1 && ntc >= nproc) { 317 KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL); 318 T id = tid; 319 T small_chunk, extras; 320 321 small_chunk = ntc / nproc; 322 extras = ntc % nproc; 323 324 init = id * small_chunk + (id < extras ? id : extras); 325 pr->u.p.count = init; 326 pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0); 327 328 pr->u.p.parm2 = lb; 329 // pr->pfields.parm3 = 0; // it's not used in static_steal 330 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid 331 pr->u.p.st = st; 332 if (traits_t<T>::type_size > 4) { 333 // AC: TODO: check if 16-byte CAS available and use it to 334 // improve performance (probably wait for explicit request 335 // before spending time on this). 336 // For now use dynamically allocated per-thread lock, 337 // free memory in __kmp_dispatch_next when status==0. 338 KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL); 339 th->th.th_dispatch->th_steal_lock = 340 (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t)); 341 __kmp_init_lock(th->th.th_dispatch->th_steal_lock); 342 } 343 break; 344 } else { 345 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 346 "kmp_sch_static_balanced\n", 347 gtid)); 348 schedule = kmp_sch_static_balanced; 349 /* too few iterations: fall-through to kmp_sch_static_balanced */ 350 } // if 351 /* FALL-THROUGH to static balanced */ 352 } // case 353 #endif 354 case kmp_sch_static_balanced: { 355 T init, limit; 356 357 KD_TRACE( 358 100, 359 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n", 360 gtid)); 361 362 if (nproc > 1) { 363 T id = tid; 364 365 if (tc < nproc) { 366 if (id < tc) { 367 init = id; 368 limit = id; 369 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ 370 } else { 371 pr->u.p.count = 1; /* means no more chunks to execute */ 372 pr->u.p.parm1 = FALSE; 373 break; 374 } 375 } else { 376 T small_chunk = tc / nproc; 377 T extras = tc % nproc; 378 init = id * small_chunk + (id < extras ? id : extras); 379 limit = init + small_chunk - (id < extras ? 0 : 1); 380 pr->u.p.parm1 = (id == nproc - 1); 381 } 382 } else { 383 if (tc > 0) { 384 init = 0; 385 limit = tc - 1; 386 pr->u.p.parm1 = TRUE; 387 } else { 388 // zero trip count 389 pr->u.p.count = 1; /* means no more chunks to execute */ 390 pr->u.p.parm1 = FALSE; 391 break; 392 } 393 } 394 #if USE_ITT_BUILD 395 // Calculate chunk for metadata report 396 if (itt_need_metadata_reporting) 397 if (cur_chunk) 398 *cur_chunk = limit - init + 1; 399 #endif 400 if (st == 1) { 401 pr->u.p.lb = lb + init; 402 pr->u.p.ub = lb + limit; 403 } else { 404 // calculated upper bound, "ub" is user-defined upper bound 405 T ub_tmp = lb + limit * st; 406 pr->u.p.lb = lb + init * st; 407 // adjust upper bound to "ub" if needed, so that MS lastprivate will match 408 // it exactly 409 if (st > 0) { 410 pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp); 411 } else { 412 pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp); 413 } 414 } 415 if (pr->flags.ordered) { 416 pr->u.p.ordered_lower = init; 417 pr->u.p.ordered_upper = limit; 418 } 419 break; 420 } // case 421 #if OMP_45_ENABLED 422 case kmp_sch_static_balanced_chunked: { 423 // similar to balanced, but chunk adjusted to multiple of simd width 424 T nth = nproc; 425 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)" 426 " -> falling-through to static_greedy\n", 427 gtid)); 428 schedule = kmp_sch_static_greedy; 429 if (nth > 1) 430 pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1); 431 else 432 pr->u.p.parm1 = tc; 433 break; 434 } // case 435 case kmp_sch_guided_simd: 436 #endif // OMP_45_ENABLED 437 case kmp_sch_guided_iterative_chunked: { 438 KD_TRACE( 439 100, 440 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked" 441 " case\n", 442 gtid)); 443 444 if (nproc > 1) { 445 if ((2L * chunk + 1) * nproc >= tc) { 446 /* chunk size too large, switch to dynamic */ 447 schedule = kmp_sch_dynamic_chunked; 448 } else { 449 // when remaining iters become less than parm2 - switch to dynamic 450 pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1); 451 *(double *)&pr->u.p.parm3 = 452 guided_flt_param / nproc; // may occupy parm3 and parm4 453 } 454 } else { 455 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 456 "kmp_sch_static_greedy\n", 457 gtid)); 458 schedule = kmp_sch_static_greedy; 459 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 460 KD_TRACE( 461 100, 462 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n", 463 gtid)); 464 pr->u.p.parm1 = tc; 465 } // if 466 } // case 467 break; 468 case kmp_sch_guided_analytical_chunked: { 469 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d " 470 "kmp_sch_guided_analytical_chunked case\n", 471 gtid)); 472 473 if (nproc > 1) { 474 if ((2L * chunk + 1) * nproc >= tc) { 475 /* chunk size too large, switch to dynamic */ 476 schedule = kmp_sch_dynamic_chunked; 477 } else { 478 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 479 DBL x; 480 481 #if KMP_USE_X87CONTROL 482 /* Linux* OS already has 64-bit computation by default for long double, 483 and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On 484 Windows* OS on IA-32 architecture, we need to set precision to 64-bit 485 instead of the default 53-bit. Even though long double doesn't work 486 on Windows* OS on Intel(R) 64, the resulting lack of precision is not 487 expected to impact the correctness of the algorithm, but this has not 488 been mathematically proven. */ 489 // save original FPCW and set precision to 64-bit, as 490 // Windows* OS on IA-32 architecture defaults to 53-bit 491 unsigned int oldFpcw = _control87(0, 0); 492 _control87(_PC_64, _MCW_PC); // 0,0x30000 493 #endif 494 /* value used for comparison in solver for cross-over point */ 495 long double target = ((long double)chunk * 2 + 1) * nproc / tc; 496 497 /* crossover point--chunk indexes equal to or greater than 498 this point switch to dynamic-style scheduling */ 499 UT cross; 500 501 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 502 x = (long double)1.0 - (long double)0.5 / nproc; 503 504 #ifdef KMP_DEBUG 505 { // test natural alignment 506 struct _test_a { 507 char a; 508 union { 509 char b; 510 DBL d; 511 }; 512 } t; 513 ptrdiff_t natural_alignment = 514 (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; 515 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long 516 // long)natural_alignment ); 517 KMP_DEBUG_ASSERT( 518 (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0); 519 } 520 #endif // KMP_DEBUG 521 522 /* save the term in thread private dispatch structure */ 523 *(DBL *)&pr->u.p.parm3 = x; 524 525 /* solve for the crossover point to the nearest integer i for which C_i 526 <= chunk */ 527 { 528 UT left, right, mid; 529 long double p; 530 531 /* estimate initial upper and lower bound */ 532 533 /* doesn't matter what value right is as long as it is positive, but 534 it affects performance of the solver */ 535 right = 229; 536 p = __kmp_pow<UT>(x, right); 537 if (p > target) { 538 do { 539 p *= p; 540 right <<= 1; 541 } while (p > target && right < (1 << 27)); 542 /* lower bound is previous (failed) estimate of upper bound */ 543 left = right >> 1; 544 } else { 545 left = 0; 546 } 547 548 /* bisection root-finding method */ 549 while (left + 1 < right) { 550 mid = (left + right) / 2; 551 if (__kmp_pow<UT>(x, mid) > target) { 552 left = mid; 553 } else { 554 right = mid; 555 } 556 } // while 557 cross = right; 558 } 559 /* assert sanity of computed crossover point */ 560 KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target && 561 __kmp_pow<UT>(x, cross) <= target); 562 563 /* save the crossover point in thread private dispatch structure */ 564 pr->u.p.parm2 = cross; 565 566 // C75803 567 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8)) 568 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3) 569 #else 570 #define GUIDED_ANALYTICAL_WORKAROUND (x) 571 #endif 572 /* dynamic-style scheduling offset */ 573 pr->u.p.count = tc - __kmp_dispatch_guided_remaining( 574 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - 575 cross * chunk; 576 #if KMP_USE_X87CONTROL 577 // restore FPCW 578 _control87(oldFpcw, _MCW_PC); 579 #endif 580 } // if 581 } else { 582 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 583 "kmp_sch_static_greedy\n", 584 gtid)); 585 schedule = kmp_sch_static_greedy; 586 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 587 pr->u.p.parm1 = tc; 588 } // if 589 } // case 590 break; 591 case kmp_sch_static_greedy: 592 KD_TRACE( 593 100, 594 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n", 595 gtid)); 596 pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc; 597 break; 598 case kmp_sch_static_chunked: 599 case kmp_sch_dynamic_chunked: 600 if (pr->u.p.parm1 <= 0) { 601 pr->u.p.parm1 = KMP_DEFAULT_CHUNK; 602 } 603 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d " 604 "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", 605 gtid)); 606 break; 607 case kmp_sch_trapezoidal: { 608 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ 609 610 T parm1, parm2, parm3, parm4; 611 KD_TRACE(100, 612 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n", 613 gtid)); 614 615 parm1 = chunk; 616 617 /* F : size of the first cycle */ 618 parm2 = (tc / (2 * nproc)); 619 620 if (parm2 < 1) { 621 parm2 = 1; 622 } 623 624 /* L : size of the last cycle. Make sure the last cycle is not larger 625 than the first cycle. */ 626 if (parm1 < 1) { 627 parm1 = 1; 628 } else if (parm1 > parm2) { 629 parm1 = parm2; 630 } 631 632 /* N : number of cycles */ 633 parm3 = (parm2 + parm1); 634 parm3 = (2 * tc + parm3 - 1) / parm3; 635 636 if (parm3 < 2) { 637 parm3 = 2; 638 } 639 640 /* sigma : decreasing incr of the trapezoid */ 641 parm4 = (parm3 - 1); 642 parm4 = (parm2 - parm1) / parm4; 643 644 // pointless check, because parm4 >= 0 always 645 // if ( parm4 < 0 ) { 646 // parm4 = 0; 647 //} 648 649 pr->u.p.parm1 = parm1; 650 pr->u.p.parm2 = parm2; 651 pr->u.p.parm3 = parm3; 652 pr->u.p.parm4 = parm4; 653 } // case 654 break; 655 656 default: { 657 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 658 KMP_HNT(GetNewerLibrary), // Hint 659 __kmp_msg_null // Variadic argument list terminator 660 ); 661 } break; 662 } // switch 663 pr->schedule = schedule; 664 } 665 666 #if KMP_USE_HIER_SCHED 667 template <typename T> 668 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub, 669 typename traits_t<T>::signed_t st); 670 template <> 671 inline void 672 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb, 673 kmp_int32 ub, kmp_int32 st) { 674 __kmp_dispatch_init_hierarchy<kmp_int32>( 675 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 676 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st); 677 } 678 template <> 679 inline void 680 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb, 681 kmp_uint32 ub, kmp_int32 st) { 682 __kmp_dispatch_init_hierarchy<kmp_uint32>( 683 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 684 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st); 685 } 686 template <> 687 inline void 688 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb, 689 kmp_int64 ub, kmp_int64 st) { 690 __kmp_dispatch_init_hierarchy<kmp_int64>( 691 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 692 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st); 693 } 694 template <> 695 inline void 696 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb, 697 kmp_uint64 ub, kmp_int64 st) { 698 __kmp_dispatch_init_hierarchy<kmp_uint64>( 699 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 700 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st); 701 } 702 703 // free all the hierarchy scheduling memory associated with the team 704 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) { 705 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2; 706 for (int i = 0; i < num_disp_buff; ++i) { 707 // type does not matter here so use kmp_int32 708 auto sh = 709 reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>( 710 &team->t.t_disp_buffer[i]); 711 if (sh->hier) { 712 sh->hier->deallocate(); 713 __kmp_free(sh->hier); 714 } 715 } 716 } 717 #endif 718 719 // UT - unsigned flavor of T, ST - signed flavor of T, 720 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8 721 template <typename T> 722 static void 723 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb, 724 T ub, typename traits_t<T>::signed_t st, 725 typename traits_t<T>::signed_t chunk, int push_ws) { 726 typedef typename traits_t<T>::unsigned_t UT; 727 728 int active; 729 kmp_info_t *th; 730 kmp_team_t *team; 731 kmp_uint32 my_buffer_index; 732 dispatch_private_info_template<T> *pr; 733 dispatch_shared_info_template<T> volatile *sh; 734 735 KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) == 736 sizeof(dispatch_private_info)); 737 KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) == 738 sizeof(dispatch_shared_info)); 739 740 if (!TCR_4(__kmp_init_parallel)) 741 __kmp_parallel_initialize(); 742 743 #if INCLUDE_SSC_MARKS 744 SSC_MARK_DISPATCH_INIT(); 745 #endif 746 #ifdef KMP_DEBUG 747 typedef typename traits_t<T>::signed_t ST; 748 { 749 char *buff; 750 // create format specifiers before the debug output 751 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d " 752 "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n", 753 traits_t<ST>::spec, traits_t<T>::spec, 754 traits_t<T>::spec, traits_t<ST>::spec); 755 KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st)); 756 __kmp_str_free(&buff); 757 } 758 #endif 759 /* setup data */ 760 th = __kmp_threads[gtid]; 761 team = th->th.th_team; 762 active = !team->t.t_serialized; 763 th->th.th_ident = loc; 764 765 // Any half-decent optimizer will remove this test when the blocks are empty 766 // since the macros expand to nothing 767 // when statistics are disabled. 768 if (schedule == __kmp_static) { 769 KMP_COUNT_BLOCK(OMP_LOOP_STATIC); 770 } else { 771 KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC); 772 } 773 774 #if KMP_USE_HIER_SCHED 775 // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable 776 // Hierarchical scheduling does not work with ordered, so if ordered is 777 // detected, then revert back to threaded scheduling. 778 bool ordered; 779 enum sched_type my_sched = schedule; 780 my_buffer_index = th->th.th_dispatch->th_disp_index; 781 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 782 &th->th.th_dispatch 783 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 784 my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched); 785 if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper)) 786 my_sched = 787 (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower)); 788 ordered = (kmp_ord_lower & my_sched); 789 if (pr->flags.use_hier) { 790 if (ordered) { 791 KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. " 792 "Disabling hierarchical scheduling.\n", 793 gtid)); 794 pr->flags.use_hier = FALSE; 795 } 796 } 797 if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) { 798 // Don't use hierarchical for ordered parallel loops and don't 799 // use the runtime hierarchy if one was specified in the program 800 if (!ordered && !pr->flags.use_hier) 801 __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st); 802 } 803 #endif // KMP_USE_HIER_SCHED 804 805 #if USE_ITT_BUILD 806 kmp_uint64 cur_chunk = chunk; 807 int itt_need_metadata_reporting = __itt_metadata_add_ptr && 808 __kmp_forkjoin_frames_mode == 3 && 809 KMP_MASTER_GTID(gtid) && 810 #if OMP_40_ENABLED 811 th->th.th_teams_microtask == NULL && 812 #endif 813 team->t.t_active_level == 1; 814 #endif 815 if (!active) { 816 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 817 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 818 } else { 819 KMP_DEBUG_ASSERT(th->th.th_dispatch == 820 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 821 822 my_buffer_index = th->th.th_dispatch->th_disp_index++; 823 824 /* What happens when number of threads changes, need to resize buffer? */ 825 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 826 &th->th.th_dispatch 827 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 828 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 829 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 830 KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid, 831 my_buffer_index)); 832 } 833 834 __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st, 835 #if USE_ITT_BUILD 836 &cur_chunk, 837 #endif 838 chunk, (T)th->th.th_team_nproc, 839 (T)th->th.th_info.ds.ds_tid); 840 if (active) { 841 if (pr->flags.ordered == 0) { 842 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error; 843 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error; 844 } else { 845 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>; 846 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>; 847 } 848 } 849 850 if (active) { 851 /* The name of this buffer should be my_buffer_index when it's free to use 852 * it */ 853 854 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d " 855 "sh->buffer_index:%d\n", 856 gtid, my_buffer_index, sh->buffer_index)); 857 __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index, 858 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL)); 859 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and 860 // my_buffer_index are *always* 32-bit integers. 861 KMP_MB(); /* is this necessary? */ 862 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d " 863 "sh->buffer_index:%d\n", 864 gtid, my_buffer_index, sh->buffer_index)); 865 866 th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr; 867 th->th.th_dispatch->th_dispatch_sh_current = 868 CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh); 869 #if USE_ITT_BUILD 870 if (pr->flags.ordered) { 871 __kmp_itt_ordered_init(gtid); 872 } 873 // Report loop metadata 874 if (itt_need_metadata_reporting) { 875 // Only report metadata by master of active team at level 1 876 kmp_uint64 schedtype = 0; 877 switch (schedule) { 878 case kmp_sch_static_chunked: 879 case kmp_sch_static_balanced: // Chunk is calculated in the switch above 880 break; 881 case kmp_sch_static_greedy: 882 cur_chunk = pr->u.p.parm1; 883 break; 884 case kmp_sch_dynamic_chunked: 885 schedtype = 1; 886 break; 887 case kmp_sch_guided_iterative_chunked: 888 case kmp_sch_guided_analytical_chunked: 889 #if OMP_45_ENABLED 890 case kmp_sch_guided_simd: 891 #endif 892 schedtype = 2; 893 break; 894 default: 895 // Should we put this case under "static"? 896 // case kmp_sch_static_steal: 897 schedtype = 3; 898 break; 899 } 900 __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk); 901 } 902 #if KMP_USE_HIER_SCHED 903 if (pr->flags.use_hier) { 904 pr->u.p.count = 0; 905 pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0; 906 } 907 #endif // KMP_USER_HIER_SCHED 908 #endif /* USE_ITT_BUILD */ 909 } 910 911 #ifdef KMP_DEBUG 912 { 913 char *buff; 914 // create format specifiers before the debug output 915 buff = __kmp_str_format( 916 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s " 917 "lb:%%%s ub:%%%s" 918 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" 919 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n", 920 traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec, 921 traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec, 922 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec, 923 traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec); 924 KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb, 925 pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count, 926 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1, 927 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4)); 928 __kmp_str_free(&buff); 929 } 930 #endif 931 #if (KMP_STATIC_STEAL_ENABLED) 932 // It cannot be guaranteed that after execution of a loop with some other 933 // schedule kind all the parm3 variables will contain the same value. Even if 934 // all parm3 will be the same, it still exists a bad case like using 0 and 1 935 // rather than program life-time increment. So the dedicated variable is 936 // required. The 'static_steal_counter' is used. 937 if (schedule == kmp_sch_static_steal) { 938 // Other threads will inspect this variable when searching for a victim. 939 // This is a flag showing that other threads may steal from this thread 940 // since then. 941 volatile T *p = &pr->u.p.static_steal_counter; 942 *p = *p + 1; 943 } 944 #endif // ( KMP_STATIC_STEAL_ENABLED ) 945 946 #if OMPT_SUPPORT && OMPT_OPTIONAL 947 if (ompt_enabled.ompt_callback_work) { 948 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 949 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 950 ompt_callbacks.ompt_callback(ompt_callback_work)( 951 ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data), 952 &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid)); 953 } 954 #endif 955 KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic); 956 } 957 958 /* For ordered loops, either __kmp_dispatch_finish() should be called after 959 * every iteration, or __kmp_dispatch_finish_chunk() should be called after 960 * every chunk of iterations. If the ordered section(s) were not executed 961 * for this iteration (or every iteration in this chunk), we need to set the 962 * ordered iteration counters so that the next thread can proceed. */ 963 template <typename UT> 964 static void __kmp_dispatch_finish(int gtid, ident_t *loc) { 965 typedef typename traits_t<UT>::signed_t ST; 966 kmp_info_t *th = __kmp_threads[gtid]; 967 968 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid)); 969 if (!th->th.th_team->t.t_serialized) { 970 971 dispatch_private_info_template<UT> *pr = 972 reinterpret_cast<dispatch_private_info_template<UT> *>( 973 th->th.th_dispatch->th_dispatch_pr_current); 974 dispatch_shared_info_template<UT> volatile *sh = 975 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 976 th->th.th_dispatch->th_dispatch_sh_current); 977 KMP_DEBUG_ASSERT(pr); 978 KMP_DEBUG_ASSERT(sh); 979 KMP_DEBUG_ASSERT(th->th.th_dispatch == 980 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 981 982 if (pr->ordered_bumped) { 983 KD_TRACE( 984 1000, 985 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 986 gtid)); 987 pr->ordered_bumped = 0; 988 } else { 989 UT lower = pr->u.p.ordered_lower; 990 991 #ifdef KMP_DEBUG 992 { 993 char *buff; 994 // create format specifiers before the debug output 995 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: " 996 "ordered_iteration:%%%s lower:%%%s\n", 997 traits_t<UT>::spec, traits_t<UT>::spec); 998 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 999 __kmp_str_free(&buff); 1000 } 1001 #endif 1002 1003 __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower, 1004 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 1005 KMP_MB(); /* is this necessary? */ 1006 #ifdef KMP_DEBUG 1007 { 1008 char *buff; 1009 // create format specifiers before the debug output 1010 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: " 1011 "ordered_iteration:%%%s lower:%%%s\n", 1012 traits_t<UT>::spec, traits_t<UT>::spec); 1013 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 1014 __kmp_str_free(&buff); 1015 } 1016 #endif 1017 1018 test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration); 1019 } // if 1020 } // if 1021 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid)); 1022 } 1023 1024 #ifdef KMP_GOMP_COMPAT 1025 1026 template <typename UT> 1027 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) { 1028 typedef typename traits_t<UT>::signed_t ST; 1029 kmp_info_t *th = __kmp_threads[gtid]; 1030 1031 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid)); 1032 if (!th->th.th_team->t.t_serialized) { 1033 // int cid; 1034 dispatch_private_info_template<UT> *pr = 1035 reinterpret_cast<dispatch_private_info_template<UT> *>( 1036 th->th.th_dispatch->th_dispatch_pr_current); 1037 dispatch_shared_info_template<UT> volatile *sh = 1038 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 1039 th->th.th_dispatch->th_dispatch_sh_current); 1040 KMP_DEBUG_ASSERT(pr); 1041 KMP_DEBUG_ASSERT(sh); 1042 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1043 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1044 1045 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) { 1046 UT lower = pr->u.p.ordered_lower; 1047 UT upper = pr->u.p.ordered_upper; 1048 UT inc = upper - lower + 1; 1049 1050 if (pr->ordered_bumped == inc) { 1051 KD_TRACE( 1052 1000, 1053 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1054 gtid)); 1055 pr->ordered_bumped = 0; 1056 } else { 1057 inc -= pr->ordered_bumped; 1058 1059 #ifdef KMP_DEBUG 1060 { 1061 char *buff; 1062 // create format specifiers before the debug output 1063 buff = __kmp_str_format( 1064 "__kmp_dispatch_finish_chunk: T#%%d before wait: " 1065 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n", 1066 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec); 1067 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper)); 1068 __kmp_str_free(&buff); 1069 } 1070 #endif 1071 1072 __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower, 1073 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 1074 1075 KMP_MB(); /* is this necessary? */ 1076 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting " 1077 "ordered_bumped to zero\n", 1078 gtid)); 1079 pr->ordered_bumped = 0; 1080 //!!!!! TODO check if the inc should be unsigned, or signed??? 1081 #ifdef KMP_DEBUG 1082 { 1083 char *buff; 1084 // create format specifiers before the debug output 1085 buff = __kmp_str_format( 1086 "__kmp_dispatch_finish_chunk: T#%%d after wait: " 1087 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n", 1088 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec, 1089 traits_t<UT>::spec); 1090 KD_TRACE(1000, 1091 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper)); 1092 __kmp_str_free(&buff); 1093 } 1094 #endif 1095 1096 test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc); 1097 } 1098 // } 1099 } 1100 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid)); 1101 } 1102 1103 #endif /* KMP_GOMP_COMPAT */ 1104 1105 template <typename T> 1106 int __kmp_dispatch_next_algorithm(int gtid, 1107 dispatch_private_info_template<T> *pr, 1108 dispatch_shared_info_template<T> volatile *sh, 1109 kmp_int32 *p_last, T *p_lb, T *p_ub, 1110 typename traits_t<T>::signed_t *p_st, T nproc, 1111 T tid) { 1112 typedef typename traits_t<T>::unsigned_t UT; 1113 typedef typename traits_t<T>::signed_t ST; 1114 typedef typename traits_t<T>::floating_t DBL; 1115 int status = 0; 1116 kmp_int32 last = 0; 1117 T start; 1118 ST incr; 1119 UT limit, trip, init; 1120 kmp_info_t *th = __kmp_threads[gtid]; 1121 kmp_team_t *team = th->th.th_team; 1122 1123 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1124 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1125 KMP_DEBUG_ASSERT(pr); 1126 KMP_DEBUG_ASSERT(sh); 1127 KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc); 1128 #ifdef KMP_DEBUG 1129 { 1130 char *buff; 1131 // create format specifiers before the debug output 1132 buff = 1133 __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p " 1134 "sh:%%p nproc:%%%s tid:%%%s\n", 1135 traits_t<T>::spec, traits_t<T>::spec); 1136 KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid)); 1137 __kmp_str_free(&buff); 1138 } 1139 #endif 1140 1141 // zero trip count 1142 if (pr->u.p.tc == 0) { 1143 KD_TRACE(10, 1144 ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is " 1145 "zero status:%d\n", 1146 gtid, status)); 1147 return 0; 1148 } 1149 1150 switch (pr->schedule) { 1151 #if (KMP_STATIC_STEAL_ENABLED) 1152 case kmp_sch_static_steal: { 1153 T chunk = pr->u.p.parm1; 1154 1155 KD_TRACE(100, 1156 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n", 1157 gtid)); 1158 1159 trip = pr->u.p.tc - 1; 1160 1161 if (traits_t<T>::type_size > 4) { 1162 // use lock for 8-byte and CAS for 4-byte induction 1163 // variable. TODO (optional): check and use 16-byte CAS 1164 kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock; 1165 KMP_DEBUG_ASSERT(lck != NULL); 1166 if (pr->u.p.count < (UT)pr->u.p.ub) { 1167 __kmp_acquire_lock(lck, gtid); 1168 // try to get own chunk of iterations 1169 init = (pr->u.p.count)++; 1170 status = (init < (UT)pr->u.p.ub); 1171 __kmp_release_lock(lck, gtid); 1172 } else { 1173 status = 0; // no own chunks 1174 } 1175 if (!status) { // try to steal 1176 kmp_info_t **other_threads = team->t.t_threads; 1177 int while_limit = nproc; // nproc attempts to find a victim 1178 int while_index = 0; 1179 // TODO: algorithm of searching for a victim 1180 // should be cleaned up and measured 1181 while ((!status) && (while_limit != ++while_index)) { 1182 T remaining; 1183 T victimIdx = pr->u.p.parm4; 1184 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1; 1185 dispatch_private_info_template<T> *victim = 1186 reinterpret_cast<dispatch_private_info_template<T> *>( 1187 other_threads[victimIdx] 1188 ->th.th_dispatch->th_dispatch_pr_current); 1189 while ((victim == NULL || victim == pr || 1190 (*(volatile T *)&victim->u.p.static_steal_counter != 1191 *(volatile T *)&pr->u.p.static_steal_counter)) && 1192 oldVictimIdx != victimIdx) { 1193 victimIdx = (victimIdx + 1) % nproc; 1194 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1195 other_threads[victimIdx] 1196 ->th.th_dispatch->th_dispatch_pr_current); 1197 } 1198 if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter != 1199 *(volatile T *)&pr->u.p.static_steal_counter)) { 1200 continue; // try once more (nproc attempts in total) 1201 // no victim is ready yet to participate in stealing 1202 // because all victims are still in kmp_init_dispatch 1203 } 1204 if (victim->u.p.count + 2 > (UT)victim->u.p.ub) { 1205 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid 1206 continue; // not enough chunks to steal, goto next victim 1207 } 1208 1209 lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock; 1210 KMP_ASSERT(lck != NULL); 1211 __kmp_acquire_lock(lck, gtid); 1212 limit = victim->u.p.ub; // keep initial ub 1213 if (victim->u.p.count >= limit || 1214 (remaining = limit - victim->u.p.count) < 2) { 1215 __kmp_release_lock(lck, gtid); 1216 pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim 1217 continue; // not enough chunks to steal 1218 } 1219 // stealing succeded, reduce victim's ub by 1/4 of undone chunks or 1220 // by 1 1221 if (remaining > 3) { 1222 // steal 1/4 of remaining 1223 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2); 1224 init = (victim->u.p.ub -= (remaining >> 2)); 1225 } else { 1226 // steal 1 chunk of 2 or 3 remaining 1227 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1); 1228 init = (victim->u.p.ub -= 1); 1229 } 1230 __kmp_release_lock(lck, gtid); 1231 1232 KMP_DEBUG_ASSERT(init + 1 <= limit); 1233 pr->u.p.parm4 = victimIdx; // remember victim to steal from 1234 status = 1; 1235 while_index = 0; 1236 // now update own count and ub with stolen range but init chunk 1237 __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid); 1238 pr->u.p.count = init + 1; 1239 pr->u.p.ub = limit; 1240 __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid); 1241 } // while (search for victim) 1242 } // if (try to find victim and steal) 1243 } else { 1244 // 4-byte induction variable, use 8-byte CAS for pair (count, ub) 1245 typedef union { 1246 struct { 1247 UT count; 1248 T ub; 1249 } p; 1250 kmp_int64 b; 1251 } union_i4; 1252 // All operations on 'count' or 'ub' must be combined atomically 1253 // together. 1254 { 1255 union_i4 vold, vnew; 1256 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 1257 vnew = vold; 1258 vnew.p.count++; 1259 while (!KMP_COMPARE_AND_STORE_ACQ64( 1260 (volatile kmp_int64 *)&pr->u.p.count, 1261 *VOLATILE_CAST(kmp_int64 *) & vold.b, 1262 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 1263 KMP_CPU_PAUSE(); 1264 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 1265 vnew = vold; 1266 vnew.p.count++; 1267 } 1268 vnew = vold; 1269 init = vnew.p.count; 1270 status = (init < (UT)vnew.p.ub); 1271 } 1272 1273 if (!status) { 1274 kmp_info_t **other_threads = team->t.t_threads; 1275 int while_limit = nproc; // nproc attempts to find a victim 1276 int while_index = 0; 1277 1278 // TODO: algorithm of searching for a victim 1279 // should be cleaned up and measured 1280 while ((!status) && (while_limit != ++while_index)) { 1281 union_i4 vold, vnew; 1282 kmp_int32 remaining; 1283 T victimIdx = pr->u.p.parm4; 1284 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1; 1285 dispatch_private_info_template<T> *victim = 1286 reinterpret_cast<dispatch_private_info_template<T> *>( 1287 other_threads[victimIdx] 1288 ->th.th_dispatch->th_dispatch_pr_current); 1289 while ((victim == NULL || victim == pr || 1290 (*(volatile T *)&victim->u.p.static_steal_counter != 1291 *(volatile T *)&pr->u.p.static_steal_counter)) && 1292 oldVictimIdx != victimIdx) { 1293 victimIdx = (victimIdx + 1) % nproc; 1294 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1295 other_threads[victimIdx] 1296 ->th.th_dispatch->th_dispatch_pr_current); 1297 } 1298 if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter != 1299 *(volatile T *)&pr->u.p.static_steal_counter)) { 1300 continue; // try once more (nproc attempts in total) 1301 // no victim is ready yet to participate in stealing 1302 // because all victims are still in kmp_init_dispatch 1303 } 1304 pr->u.p.parm4 = victimIdx; // new victim found 1305 while (1) { // CAS loop if victim has enough chunks to steal 1306 vold.b = *(volatile kmp_int64 *)(&victim->u.p.count); 1307 vnew = vold; 1308 1309 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1310 if (vnew.p.count >= (UT)vnew.p.ub || 1311 (remaining = vnew.p.ub - vnew.p.count) < 2) { 1312 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id 1313 break; // not enough chunks to steal, goto next victim 1314 } 1315 if (remaining > 3) { 1316 vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining 1317 } else { 1318 vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining 1319 } 1320 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1321 // TODO: Should this be acquire or release? 1322 if (KMP_COMPARE_AND_STORE_ACQ64( 1323 (volatile kmp_int64 *)&victim->u.p.count, 1324 *VOLATILE_CAST(kmp_int64 *) & vold.b, 1325 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 1326 // stealing succedded 1327 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1328 vold.p.ub - vnew.p.ub); 1329 status = 1; 1330 while_index = 0; 1331 // now update own count and ub 1332 init = vnew.p.ub; 1333 vold.p.count = init + 1; 1334 #if KMP_ARCH_X86 1335 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b); 1336 #else 1337 *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b; 1338 #endif 1339 break; 1340 } // if (check CAS result) 1341 KMP_CPU_PAUSE(); // CAS failed, repeate attempt 1342 } // while (try to steal from particular victim) 1343 } // while (search for victim) 1344 } // if (try to find victim and steal) 1345 } // if (4-byte induction variable) 1346 if (!status) { 1347 *p_lb = 0; 1348 *p_ub = 0; 1349 if (p_st != NULL) 1350 *p_st = 0; 1351 } else { 1352 start = pr->u.p.parm2; 1353 init *= chunk; 1354 limit = chunk + init - 1; 1355 incr = pr->u.p.st; 1356 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1); 1357 1358 KMP_DEBUG_ASSERT(init <= trip); 1359 if ((last = (limit >= trip)) != 0) 1360 limit = trip; 1361 if (p_st != NULL) 1362 *p_st = incr; 1363 1364 if (incr == 1) { 1365 *p_lb = start + init; 1366 *p_ub = start + limit; 1367 } else { 1368 *p_lb = start + init * incr; 1369 *p_ub = start + limit * incr; 1370 } 1371 1372 if (pr->flags.ordered) { 1373 pr->u.p.ordered_lower = init; 1374 pr->u.p.ordered_upper = limit; 1375 } // if 1376 } // if 1377 break; 1378 } // case 1379 #endif // ( KMP_STATIC_STEAL_ENABLED ) 1380 case kmp_sch_static_balanced: { 1381 KD_TRACE( 1382 10, 1383 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n", 1384 gtid)); 1385 /* check if thread has any iteration to do */ 1386 if ((status = !pr->u.p.count) != 0) { 1387 pr->u.p.count = 1; 1388 *p_lb = pr->u.p.lb; 1389 *p_ub = pr->u.p.ub; 1390 last = pr->u.p.parm1; 1391 if (p_st != NULL) 1392 *p_st = pr->u.p.st; 1393 } else { /* no iterations to do */ 1394 pr->u.p.lb = pr->u.p.ub + pr->u.p.st; 1395 } 1396 } // case 1397 break; 1398 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was 1399 merged here */ 1400 case kmp_sch_static_chunked: { 1401 T parm1; 1402 1403 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d " 1404 "kmp_sch_static_[affinity|chunked] case\n", 1405 gtid)); 1406 parm1 = pr->u.p.parm1; 1407 1408 trip = pr->u.p.tc - 1; 1409 init = parm1 * (pr->u.p.count + tid); 1410 1411 if ((status = (init <= trip)) != 0) { 1412 start = pr->u.p.lb; 1413 incr = pr->u.p.st; 1414 limit = parm1 + init - 1; 1415 1416 if ((last = (limit >= trip)) != 0) 1417 limit = trip; 1418 1419 if (p_st != NULL) 1420 *p_st = incr; 1421 1422 pr->u.p.count += nproc; 1423 1424 if (incr == 1) { 1425 *p_lb = start + init; 1426 *p_ub = start + limit; 1427 } else { 1428 *p_lb = start + init * incr; 1429 *p_ub = start + limit * incr; 1430 } 1431 1432 if (pr->flags.ordered) { 1433 pr->u.p.ordered_lower = init; 1434 pr->u.p.ordered_upper = limit; 1435 } // if 1436 } // if 1437 } // case 1438 break; 1439 1440 case kmp_sch_dynamic_chunked: { 1441 T chunk = pr->u.p.parm1; 1442 1443 KD_TRACE( 1444 100, 1445 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n", 1446 gtid)); 1447 1448 init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 1449 trip = pr->u.p.tc - 1; 1450 1451 if ((status = (init <= trip)) == 0) { 1452 *p_lb = 0; 1453 *p_ub = 0; 1454 if (p_st != NULL) 1455 *p_st = 0; 1456 } else { 1457 start = pr->u.p.lb; 1458 limit = chunk + init - 1; 1459 incr = pr->u.p.st; 1460 1461 if ((last = (limit >= trip)) != 0) 1462 limit = trip; 1463 1464 if (p_st != NULL) 1465 *p_st = incr; 1466 1467 if (incr == 1) { 1468 *p_lb = start + init; 1469 *p_ub = start + limit; 1470 } else { 1471 *p_lb = start + init * incr; 1472 *p_ub = start + limit * incr; 1473 } 1474 1475 if (pr->flags.ordered) { 1476 pr->u.p.ordered_lower = init; 1477 pr->u.p.ordered_upper = limit; 1478 } // if 1479 } // if 1480 } // case 1481 break; 1482 1483 case kmp_sch_guided_iterative_chunked: { 1484 T chunkspec = pr->u.p.parm1; 1485 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked " 1486 "iterative case\n", 1487 gtid)); 1488 trip = pr->u.p.tc; 1489 // Start atomic part of calculations 1490 while (1) { 1491 ST remaining; // signed, because can be < 0 1492 init = sh->u.s.iteration; // shared value 1493 remaining = trip - init; 1494 if (remaining <= 0) { // AC: need to compare with 0 first 1495 // nothing to do, don't try atomic op 1496 status = 0; 1497 break; 1498 } 1499 if ((T)remaining < 1500 pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default 1501 // use dynamic-style shcedule 1502 // atomically inrement iterations, get old value 1503 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1504 (ST)chunkspec); 1505 remaining = trip - init; 1506 if (remaining <= 0) { 1507 status = 0; // all iterations got by other threads 1508 } else { 1509 // got some iterations to work on 1510 status = 1; 1511 if ((T)remaining > chunkspec) { 1512 limit = init + chunkspec - 1; 1513 } else { 1514 last = 1; // the last chunk 1515 limit = init + remaining - 1; 1516 } // if 1517 } // if 1518 break; 1519 } // if 1520 limit = init + 1521 (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc 1522 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1523 (ST)init, (ST)limit)) { 1524 // CAS was successful, chunk obtained 1525 status = 1; 1526 --limit; 1527 break; 1528 } // if 1529 } // while 1530 if (status != 0) { 1531 start = pr->u.p.lb; 1532 incr = pr->u.p.st; 1533 if (p_st != NULL) 1534 *p_st = incr; 1535 *p_lb = start + init * incr; 1536 *p_ub = start + limit * incr; 1537 if (pr->flags.ordered) { 1538 pr->u.p.ordered_lower = init; 1539 pr->u.p.ordered_upper = limit; 1540 } // if 1541 } else { 1542 *p_lb = 0; 1543 *p_ub = 0; 1544 if (p_st != NULL) 1545 *p_st = 0; 1546 } // if 1547 } // case 1548 break; 1549 1550 #if OMP_45_ENABLED 1551 case kmp_sch_guided_simd: { 1552 // same as iterative but curr-chunk adjusted to be multiple of given 1553 // chunk 1554 T chunk = pr->u.p.parm1; 1555 KD_TRACE(100, 1556 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n", 1557 gtid)); 1558 trip = pr->u.p.tc; 1559 // Start atomic part of calculations 1560 while (1) { 1561 ST remaining; // signed, because can be < 0 1562 init = sh->u.s.iteration; // shared value 1563 remaining = trip - init; 1564 if (remaining <= 0) { // AC: need to compare with 0 first 1565 status = 0; // nothing to do, don't try atomic op 1566 break; 1567 } 1568 KMP_DEBUG_ASSERT(init % chunk == 0); 1569 // compare with K*nproc*(chunk+1), K=2 by default 1570 if ((T)remaining < pr->u.p.parm2) { 1571 // use dynamic-style shcedule 1572 // atomically inrement iterations, get old value 1573 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1574 (ST)chunk); 1575 remaining = trip - init; 1576 if (remaining <= 0) { 1577 status = 0; // all iterations got by other threads 1578 } else { 1579 // got some iterations to work on 1580 status = 1; 1581 if ((T)remaining > chunk) { 1582 limit = init + chunk - 1; 1583 } else { 1584 last = 1; // the last chunk 1585 limit = init + remaining - 1; 1586 } // if 1587 } // if 1588 break; 1589 } // if 1590 // divide by K*nproc 1591 UT span = remaining * (*(double *)&pr->u.p.parm3); 1592 UT rem = span % chunk; 1593 if (rem) // adjust so that span%chunk == 0 1594 span += chunk - rem; 1595 limit = init + span; 1596 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1597 (ST)init, (ST)limit)) { 1598 // CAS was successful, chunk obtained 1599 status = 1; 1600 --limit; 1601 break; 1602 } // if 1603 } // while 1604 if (status != 0) { 1605 start = pr->u.p.lb; 1606 incr = pr->u.p.st; 1607 if (p_st != NULL) 1608 *p_st = incr; 1609 *p_lb = start + init * incr; 1610 *p_ub = start + limit * incr; 1611 if (pr->flags.ordered) { 1612 pr->u.p.ordered_lower = init; 1613 pr->u.p.ordered_upper = limit; 1614 } // if 1615 } else { 1616 *p_lb = 0; 1617 *p_ub = 0; 1618 if (p_st != NULL) 1619 *p_st = 0; 1620 } // if 1621 } // case 1622 break; 1623 #endif // OMP_45_ENABLED 1624 1625 case kmp_sch_guided_analytical_chunked: { 1626 T chunkspec = pr->u.p.parm1; 1627 UT chunkIdx; 1628 #if KMP_USE_X87CONTROL 1629 /* for storing original FPCW value for Windows* OS on 1630 IA-32 architecture 8-byte version */ 1631 unsigned int oldFpcw; 1632 unsigned int fpcwSet = 0; 1633 #endif 1634 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d " 1635 "kmp_sch_guided_analytical_chunked case\n", 1636 gtid)); 1637 1638 trip = pr->u.p.tc; 1639 1640 KMP_DEBUG_ASSERT(nproc > 1); 1641 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip); 1642 1643 while (1) { /* this while loop is a safeguard against unexpected zero 1644 chunk sizes */ 1645 chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 1646 if (chunkIdx >= (UT)pr->u.p.parm2) { 1647 --trip; 1648 /* use dynamic-style scheduling */ 1649 init = chunkIdx * chunkspec + pr->u.p.count; 1650 /* need to verify init > 0 in case of overflow in the above 1651 * calculation */ 1652 if ((status = (init > 0 && init <= trip)) != 0) { 1653 limit = init + chunkspec - 1; 1654 1655 if ((last = (limit >= trip)) != 0) 1656 limit = trip; 1657 } 1658 break; 1659 } else { 1660 /* use exponential-style scheduling */ 1661 /* The following check is to workaround the lack of long double precision on 1662 Windows* OS. 1663 This check works around the possible effect that init != 0 for chunkIdx == 0. 1664 */ 1665 #if KMP_USE_X87CONTROL 1666 /* If we haven't already done so, save original 1667 FPCW and set precision to 64-bit, as Windows* OS 1668 on IA-32 architecture defaults to 53-bit */ 1669 if (!fpcwSet) { 1670 oldFpcw = _control87(0, 0); 1671 _control87(_PC_64, _MCW_PC); 1672 fpcwSet = 0x30000; 1673 } 1674 #endif 1675 if (chunkIdx) { 1676 init = __kmp_dispatch_guided_remaining<T>( 1677 trip, *(DBL *)&pr->u.p.parm3, chunkIdx); 1678 KMP_DEBUG_ASSERT(init); 1679 init = trip - init; 1680 } else 1681 init = 0; 1682 limit = trip - __kmp_dispatch_guided_remaining<T>( 1683 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1); 1684 KMP_ASSERT(init <= limit); 1685 if (init < limit) { 1686 KMP_DEBUG_ASSERT(limit <= trip); 1687 --limit; 1688 status = 1; 1689 break; 1690 } // if 1691 } // if 1692 } // while (1) 1693 #if KMP_USE_X87CONTROL 1694 /* restore FPCW if necessary 1695 AC: check fpcwSet flag first because oldFpcw can be uninitialized here 1696 */ 1697 if (fpcwSet && (oldFpcw & fpcwSet)) 1698 _control87(oldFpcw, _MCW_PC); 1699 #endif 1700 if (status != 0) { 1701 start = pr->u.p.lb; 1702 incr = pr->u.p.st; 1703 if (p_st != NULL) 1704 *p_st = incr; 1705 *p_lb = start + init * incr; 1706 *p_ub = start + limit * incr; 1707 if (pr->flags.ordered) { 1708 pr->u.p.ordered_lower = init; 1709 pr->u.p.ordered_upper = limit; 1710 } 1711 } else { 1712 *p_lb = 0; 1713 *p_ub = 0; 1714 if (p_st != NULL) 1715 *p_st = 0; 1716 } 1717 } // case 1718 break; 1719 1720 case kmp_sch_trapezoidal: { 1721 UT index; 1722 T parm2 = pr->u.p.parm2; 1723 T parm3 = pr->u.p.parm3; 1724 T parm4 = pr->u.p.parm4; 1725 KD_TRACE(100, 1726 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n", 1727 gtid)); 1728 1729 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration); 1730 1731 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2; 1732 trip = pr->u.p.tc - 1; 1733 1734 if ((status = ((T)index < parm3 && init <= trip)) == 0) { 1735 *p_lb = 0; 1736 *p_ub = 0; 1737 if (p_st != NULL) 1738 *p_st = 0; 1739 } else { 1740 start = pr->u.p.lb; 1741 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1; 1742 incr = pr->u.p.st; 1743 1744 if ((last = (limit >= trip)) != 0) 1745 limit = trip; 1746 1747 if (p_st != NULL) 1748 *p_st = incr; 1749 1750 if (incr == 1) { 1751 *p_lb = start + init; 1752 *p_ub = start + limit; 1753 } else { 1754 *p_lb = start + init * incr; 1755 *p_ub = start + limit * incr; 1756 } 1757 1758 if (pr->flags.ordered) { 1759 pr->u.p.ordered_lower = init; 1760 pr->u.p.ordered_upper = limit; 1761 } // if 1762 } // if 1763 } // case 1764 break; 1765 default: { 1766 status = 0; // to avoid complaints on uninitialized variable use 1767 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 1768 KMP_HNT(GetNewerLibrary), // Hint 1769 __kmp_msg_null // Variadic argument list terminator 1770 ); 1771 } break; 1772 } // switch 1773 if (p_last) 1774 *p_last = last; 1775 #ifdef KMP_DEBUG 1776 if (pr->flags.ordered) { 1777 char *buff; 1778 // create format specifiers before the debug output 1779 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d " 1780 "ordered_lower:%%%s ordered_upper:%%%s\n", 1781 traits_t<UT>::spec, traits_t<UT>::spec); 1782 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper)); 1783 __kmp_str_free(&buff); 1784 } 1785 { 1786 char *buff; 1787 // create format specifiers before the debug output 1788 buff = __kmp_str_format( 1789 "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d " 1790 "p_lb:%%%s p_ub:%%%s p_st:%%%s\n", 1791 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 1792 KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st)); 1793 __kmp_str_free(&buff); 1794 } 1795 #endif 1796 return status; 1797 } 1798 1799 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more 1800 work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini() 1801 is not called. */ 1802 #if OMPT_SUPPORT && OMPT_OPTIONAL 1803 #define OMPT_LOOP_END \ 1804 if (status == 0) { \ 1805 if (ompt_enabled.ompt_callback_work) { \ 1806 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \ 1807 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \ 1808 ompt_callbacks.ompt_callback(ompt_callback_work)( \ 1809 ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \ 1810 &(task_info->task_data), 0, codeptr); \ 1811 } \ 1812 } 1813 // TODO: implement count 1814 #else 1815 #define OMPT_LOOP_END // no-op 1816 #endif 1817 1818 #if KMP_STATS_ENABLED 1819 #define KMP_STATS_LOOP_END \ 1820 { \ 1821 kmp_int64 u, l, t, i; \ 1822 l = (kmp_int64)(*p_lb); \ 1823 u = (kmp_int64)(*p_ub); \ 1824 i = (kmp_int64)(pr->u.p.st); \ 1825 if (status == 0) { \ 1826 t = 0; \ 1827 KMP_POP_PARTITIONED_TIMER(); \ 1828 } else if (i == 1) { \ 1829 if (u >= l) \ 1830 t = u - l + 1; \ 1831 else \ 1832 t = 0; \ 1833 } else if (i < 0) { \ 1834 if (l >= u) \ 1835 t = (l - u) / (-i) + 1; \ 1836 else \ 1837 t = 0; \ 1838 } else { \ 1839 if (u >= l) \ 1840 t = (u - l) / i + 1; \ 1841 else \ 1842 t = 0; \ 1843 } \ 1844 KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \ 1845 } 1846 #else 1847 #define KMP_STATS_LOOP_END /* Nothing */ 1848 #endif 1849 1850 template <typename T> 1851 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, 1852 T *p_lb, T *p_ub, 1853 typename traits_t<T>::signed_t *p_st 1854 #if OMPT_SUPPORT && OMPT_OPTIONAL 1855 , 1856 void *codeptr 1857 #endif 1858 ) { 1859 1860 typedef typename traits_t<T>::unsigned_t UT; 1861 typedef typename traits_t<T>::signed_t ST; 1862 // This is potentially slightly misleading, schedule(runtime) will appear here 1863 // even if the actual runtme schedule is static. (Which points out a 1864 // disadavantage of schedule(runtime): even when static scheduling is used it 1865 // costs more than a compile time choice to use static scheduling would.) 1866 KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling); 1867 1868 int status; 1869 dispatch_private_info_template<T> *pr; 1870 kmp_info_t *th = __kmp_threads[gtid]; 1871 kmp_team_t *team = th->th.th_team; 1872 1873 KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL 1874 KD_TRACE( 1875 1000, 1876 ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n", 1877 gtid, p_lb, p_ub, p_st, p_last)); 1878 1879 if (team->t.t_serialized) { 1880 /* NOTE: serialize this dispatch becase we are not at the active level */ 1881 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 1882 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 1883 KMP_DEBUG_ASSERT(pr); 1884 1885 if ((status = (pr->u.p.tc != 0)) == 0) { 1886 *p_lb = 0; 1887 *p_ub = 0; 1888 // if ( p_last != NULL ) 1889 // *p_last = 0; 1890 if (p_st != NULL) 1891 *p_st = 0; 1892 if (__kmp_env_consistency_check) { 1893 if (pr->pushed_ws != ct_none) { 1894 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 1895 } 1896 } 1897 } else if (pr->flags.nomerge) { 1898 kmp_int32 last; 1899 T start; 1900 UT limit, trip, init; 1901 ST incr; 1902 T chunk = pr->u.p.parm1; 1903 1904 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", 1905 gtid)); 1906 1907 init = chunk * pr->u.p.count++; 1908 trip = pr->u.p.tc - 1; 1909 1910 if ((status = (init <= trip)) == 0) { 1911 *p_lb = 0; 1912 *p_ub = 0; 1913 // if ( p_last != NULL ) 1914 // *p_last = 0; 1915 if (p_st != NULL) 1916 *p_st = 0; 1917 if (__kmp_env_consistency_check) { 1918 if (pr->pushed_ws != ct_none) { 1919 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 1920 } 1921 } 1922 } else { 1923 start = pr->u.p.lb; 1924 limit = chunk + init - 1; 1925 incr = pr->u.p.st; 1926 1927 if ((last = (limit >= trip)) != 0) { 1928 limit = trip; 1929 #if KMP_OS_WINDOWS 1930 pr->u.p.last_upper = pr->u.p.ub; 1931 #endif /* KMP_OS_WINDOWS */ 1932 } 1933 if (p_last != NULL) 1934 *p_last = last; 1935 if (p_st != NULL) 1936 *p_st = incr; 1937 if (incr == 1) { 1938 *p_lb = start + init; 1939 *p_ub = start + limit; 1940 } else { 1941 *p_lb = start + init * incr; 1942 *p_ub = start + limit * incr; 1943 } 1944 1945 if (pr->flags.ordered) { 1946 pr->u.p.ordered_lower = init; 1947 pr->u.p.ordered_upper = limit; 1948 #ifdef KMP_DEBUG 1949 { 1950 char *buff; 1951 // create format specifiers before the debug output 1952 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 1953 "ordered_lower:%%%s ordered_upper:%%%s\n", 1954 traits_t<UT>::spec, traits_t<UT>::spec); 1955 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 1956 pr->u.p.ordered_upper)); 1957 __kmp_str_free(&buff); 1958 } 1959 #endif 1960 } // if 1961 } // if 1962 } else { 1963 pr->u.p.tc = 0; 1964 *p_lb = pr->u.p.lb; 1965 *p_ub = pr->u.p.ub; 1966 #if KMP_OS_WINDOWS 1967 pr->u.p.last_upper = *p_ub; 1968 #endif /* KMP_OS_WINDOWS */ 1969 if (p_last != NULL) 1970 *p_last = TRUE; 1971 if (p_st != NULL) 1972 *p_st = pr->u.p.st; 1973 } // if 1974 #ifdef KMP_DEBUG 1975 { 1976 char *buff; 1977 // create format specifiers before the debug output 1978 buff = __kmp_str_format( 1979 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " 1980 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", 1981 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 1982 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status)); 1983 __kmp_str_free(&buff); 1984 } 1985 #endif 1986 #if INCLUDE_SSC_MARKS 1987 SSC_MARK_DISPATCH_NEXT(); 1988 #endif 1989 OMPT_LOOP_END; 1990 KMP_STATS_LOOP_END; 1991 return status; 1992 } else { 1993 kmp_int32 last = 0; 1994 dispatch_shared_info_template<T> volatile *sh; 1995 1996 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1997 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1998 1999 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 2000 th->th.th_dispatch->th_dispatch_pr_current); 2001 KMP_DEBUG_ASSERT(pr); 2002 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 2003 th->th.th_dispatch->th_dispatch_sh_current); 2004 KMP_DEBUG_ASSERT(sh); 2005 2006 #if KMP_USE_HIER_SCHED 2007 if (pr->flags.use_hier) 2008 status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st); 2009 else 2010 #endif // KMP_USE_HIER_SCHED 2011 status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub, 2012 p_st, th->th.th_team_nproc, 2013 th->th.th_info.ds.ds_tid); 2014 // status == 0: no more iterations to execute 2015 if (status == 0) { 2016 UT num_done; 2017 2018 num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done); 2019 #ifdef KMP_DEBUG 2020 { 2021 char *buff; 2022 // create format specifiers before the debug output 2023 buff = __kmp_str_format( 2024 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", 2025 traits_t<UT>::spec); 2026 KD_TRACE(10, (buff, gtid, sh->u.s.num_done)); 2027 __kmp_str_free(&buff); 2028 } 2029 #endif 2030 2031 #if KMP_USE_HIER_SCHED 2032 pr->flags.use_hier = FALSE; 2033 #endif 2034 if ((ST)num_done == th->th.th_team_nproc - 1) { 2035 #if (KMP_STATIC_STEAL_ENABLED) 2036 if (pr->schedule == kmp_sch_static_steal && 2037 traits_t<T>::type_size > 4) { 2038 int i; 2039 kmp_info_t **other_threads = team->t.t_threads; 2040 // loop complete, safe to destroy locks used for stealing 2041 for (i = 0; i < th->th.th_team_nproc; ++i) { 2042 kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock; 2043 KMP_ASSERT(lck != NULL); 2044 __kmp_destroy_lock(lck); 2045 __kmp_free(lck); 2046 other_threads[i]->th.th_dispatch->th_steal_lock = NULL; 2047 } 2048 } 2049 #endif 2050 /* NOTE: release this buffer to be reused */ 2051 2052 KMP_MB(); /* Flush all pending memory write invalidates. */ 2053 2054 sh->u.s.num_done = 0; 2055 sh->u.s.iteration = 0; 2056 2057 /* TODO replace with general release procedure? */ 2058 if (pr->flags.ordered) { 2059 sh->u.s.ordered_iteration = 0; 2060 } 2061 2062 KMP_MB(); /* Flush all pending memory write invalidates. */ 2063 2064 sh->buffer_index += __kmp_dispatch_num_buffers; 2065 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", 2066 gtid, sh->buffer_index)); 2067 2068 KMP_MB(); /* Flush all pending memory write invalidates. */ 2069 2070 } // if 2071 if (__kmp_env_consistency_check) { 2072 if (pr->pushed_ws != ct_none) { 2073 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 2074 } 2075 } 2076 2077 th->th.th_dispatch->th_deo_fcn = NULL; 2078 th->th.th_dispatch->th_dxo_fcn = NULL; 2079 th->th.th_dispatch->th_dispatch_sh_current = NULL; 2080 th->th.th_dispatch->th_dispatch_pr_current = NULL; 2081 } // if (status == 0) 2082 #if KMP_OS_WINDOWS 2083 else if (last) { 2084 pr->u.p.last_upper = pr->u.p.ub; 2085 } 2086 #endif /* KMP_OS_WINDOWS */ 2087 if (p_last != NULL && status != 0) 2088 *p_last = last; 2089 } // if 2090 2091 #ifdef KMP_DEBUG 2092 { 2093 char *buff; 2094 // create format specifiers before the debug output 2095 buff = __kmp_str_format( 2096 "__kmp_dispatch_next: T#%%d normal case: " 2097 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n", 2098 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 2099 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, 2100 (p_last ? *p_last : 0), status)); 2101 __kmp_str_free(&buff); 2102 } 2103 #endif 2104 #if INCLUDE_SSC_MARKS 2105 SSC_MARK_DISPATCH_NEXT(); 2106 #endif 2107 OMPT_LOOP_END; 2108 KMP_STATS_LOOP_END; 2109 return status; 2110 } 2111 2112 template <typename T> 2113 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid, 2114 kmp_int32 *plastiter, T *plower, T *pupper, 2115 typename traits_t<T>::signed_t incr) { 2116 typedef typename traits_t<T>::unsigned_t UT; 2117 kmp_uint32 team_id; 2118 kmp_uint32 nteams; 2119 UT trip_count; 2120 kmp_team_t *team; 2121 kmp_info_t *th; 2122 2123 KMP_DEBUG_ASSERT(plastiter && plower && pupper); 2124 KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid)); 2125 #ifdef KMP_DEBUG 2126 typedef typename traits_t<T>::signed_t ST; 2127 { 2128 char *buff; 2129 // create format specifiers before the debug output 2130 buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d " 2131 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n", 2132 traits_t<T>::spec, traits_t<T>::spec, 2133 traits_t<ST>::spec, traits_t<T>::spec); 2134 KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr)); 2135 __kmp_str_free(&buff); 2136 } 2137 #endif 2138 2139 if (__kmp_env_consistency_check) { 2140 if (incr == 0) { 2141 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, 2142 loc); 2143 } 2144 if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) { 2145 // The loop is illegal. 2146 // Some zero-trip loops maintained by compiler, e.g.: 2147 // for(i=10;i<0;++i) // lower >= upper - run-time check 2148 // for(i=0;i>10;--i) // lower <= upper - run-time check 2149 // for(i=0;i>10;++i) // incr > 0 - compile-time check 2150 // for(i=10;i<0;--i) // incr < 0 - compile-time check 2151 // Compiler does not check the following illegal loops: 2152 // for(i=0;i<10;i+=incr) // where incr<0 2153 // for(i=10;i>0;i-=incr) // where incr<0 2154 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc); 2155 } 2156 } 2157 th = __kmp_threads[gtid]; 2158 team = th->th.th_team; 2159 #if OMP_40_ENABLED 2160 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct 2161 nteams = th->th.th_teams_size.nteams; 2162 #endif 2163 team_id = team->t.t_master_tid; 2164 KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc); 2165 2166 // compute global trip count 2167 if (incr == 1) { 2168 trip_count = *pupper - *plower + 1; 2169 } else if (incr == -1) { 2170 trip_count = *plower - *pupper + 1; 2171 } else if (incr > 0) { 2172 // upper-lower can exceed the limit of signed type 2173 trip_count = (UT)(*pupper - *plower) / incr + 1; 2174 } else { 2175 trip_count = (UT)(*plower - *pupper) / (-incr) + 1; 2176 } 2177 2178 if (trip_count <= nteams) { 2179 KMP_DEBUG_ASSERT( 2180 __kmp_static == kmp_sch_static_greedy || 2181 __kmp_static == 2182 kmp_sch_static_balanced); // Unknown static scheduling type. 2183 // only some teams get single iteration, others get nothing 2184 if (team_id < trip_count) { 2185 *pupper = *plower = *plower + team_id * incr; 2186 } else { 2187 *plower = *pupper + incr; // zero-trip loop 2188 } 2189 if (plastiter != NULL) 2190 *plastiter = (team_id == trip_count - 1); 2191 } else { 2192 if (__kmp_static == kmp_sch_static_balanced) { 2193 UT chunk = trip_count / nteams; 2194 UT extras = trip_count % nteams; 2195 *plower += 2196 incr * (team_id * chunk + (team_id < extras ? team_id : extras)); 2197 *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr); 2198 if (plastiter != NULL) 2199 *plastiter = (team_id == nteams - 1); 2200 } else { 2201 T chunk_inc_count = 2202 (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr; 2203 T upper = *pupper; 2204 KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy); 2205 // Unknown static scheduling type. 2206 *plower += team_id * chunk_inc_count; 2207 *pupper = *plower + chunk_inc_count - incr; 2208 // Check/correct bounds if needed 2209 if (incr > 0) { 2210 if (*pupper < *plower) 2211 *pupper = traits_t<T>::max_value; 2212 if (plastiter != NULL) 2213 *plastiter = *plower <= upper && *pupper > upper - incr; 2214 if (*pupper > upper) 2215 *pupper = upper; // tracker C73258 2216 } else { 2217 if (*pupper > *plower) 2218 *pupper = traits_t<T>::min_value; 2219 if (plastiter != NULL) 2220 *plastiter = *plower >= upper && *pupper < upper - incr; 2221 if (*pupper < upper) 2222 *pupper = upper; // tracker C73258 2223 } 2224 } 2225 } 2226 } 2227 2228 //----------------------------------------------------------------------------- 2229 // Dispatch routines 2230 // Transfer call to template< type T > 2231 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule, 2232 // T lb, T ub, ST st, ST chunk ) 2233 extern "C" { 2234 2235 /*! 2236 @ingroup WORK_SHARING 2237 @{ 2238 @param loc Source location 2239 @param gtid Global thread id 2240 @param schedule Schedule type 2241 @param lb Lower bound 2242 @param ub Upper bound 2243 @param st Step (or increment if you prefer) 2244 @param chunk The chunk size to block with 2245 2246 This function prepares the runtime to start a dynamically scheduled for loop, 2247 saving the loop arguments. 2248 These functions are all identical apart from the types of the arguments. 2249 */ 2250 2251 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2252 enum sched_type schedule, kmp_int32 lb, 2253 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) { 2254 KMP_DEBUG_ASSERT(__kmp_init_serial); 2255 #if OMPT_SUPPORT && OMPT_OPTIONAL 2256 OMPT_STORE_RETURN_ADDRESS(gtid); 2257 #endif 2258 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2259 } 2260 /*! 2261 See @ref __kmpc_dispatch_init_4 2262 */ 2263 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2264 enum sched_type schedule, kmp_uint32 lb, 2265 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) { 2266 KMP_DEBUG_ASSERT(__kmp_init_serial); 2267 #if OMPT_SUPPORT && OMPT_OPTIONAL 2268 OMPT_STORE_RETURN_ADDRESS(gtid); 2269 #endif 2270 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2271 } 2272 2273 /*! 2274 See @ref __kmpc_dispatch_init_4 2275 */ 2276 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2277 enum sched_type schedule, kmp_int64 lb, 2278 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) { 2279 KMP_DEBUG_ASSERT(__kmp_init_serial); 2280 #if OMPT_SUPPORT && OMPT_OPTIONAL 2281 OMPT_STORE_RETURN_ADDRESS(gtid); 2282 #endif 2283 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2284 } 2285 2286 /*! 2287 See @ref __kmpc_dispatch_init_4 2288 */ 2289 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2290 enum sched_type schedule, kmp_uint64 lb, 2291 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) { 2292 KMP_DEBUG_ASSERT(__kmp_init_serial); 2293 #if OMPT_SUPPORT && OMPT_OPTIONAL 2294 OMPT_STORE_RETURN_ADDRESS(gtid); 2295 #endif 2296 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2297 } 2298 2299 /*! 2300 See @ref __kmpc_dispatch_init_4 2301 2302 Difference from __kmpc_dispatch_init set of functions is these functions 2303 are called for composite distribute parallel for construct. Thus before 2304 regular iterations dispatching we need to calc per-team iteration space. 2305 2306 These functions are all identical apart from the types of the arguments. 2307 */ 2308 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2309 enum sched_type schedule, kmp_int32 *p_last, 2310 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, 2311 kmp_int32 chunk) { 2312 KMP_DEBUG_ASSERT(__kmp_init_serial); 2313 #if OMPT_SUPPORT && OMPT_OPTIONAL 2314 OMPT_STORE_RETURN_ADDRESS(gtid); 2315 #endif 2316 __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st); 2317 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2318 } 2319 2320 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2321 enum sched_type schedule, kmp_int32 *p_last, 2322 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, 2323 kmp_int32 chunk) { 2324 KMP_DEBUG_ASSERT(__kmp_init_serial); 2325 #if OMPT_SUPPORT && OMPT_OPTIONAL 2326 OMPT_STORE_RETURN_ADDRESS(gtid); 2327 #endif 2328 __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st); 2329 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2330 } 2331 2332 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2333 enum sched_type schedule, kmp_int32 *p_last, 2334 kmp_int64 lb, kmp_int64 ub, kmp_int64 st, 2335 kmp_int64 chunk) { 2336 KMP_DEBUG_ASSERT(__kmp_init_serial); 2337 #if OMPT_SUPPORT && OMPT_OPTIONAL 2338 OMPT_STORE_RETURN_ADDRESS(gtid); 2339 #endif 2340 __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st); 2341 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2342 } 2343 2344 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2345 enum sched_type schedule, kmp_int32 *p_last, 2346 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, 2347 kmp_int64 chunk) { 2348 KMP_DEBUG_ASSERT(__kmp_init_serial); 2349 #if OMPT_SUPPORT && OMPT_OPTIONAL 2350 OMPT_STORE_RETURN_ADDRESS(gtid); 2351 #endif 2352 __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st); 2353 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2354 } 2355 2356 /*! 2357 @param loc Source code location 2358 @param gtid Global thread id 2359 @param p_last Pointer to a flag set to one if this is the last chunk or zero 2360 otherwise 2361 @param p_lb Pointer to the lower bound for the next chunk of work 2362 @param p_ub Pointer to the upper bound for the next chunk of work 2363 @param p_st Pointer to the stride for the next chunk of work 2364 @return one if there is work to be done, zero otherwise 2365 2366 Get the next dynamically allocated chunk of work for this thread. 2367 If there is no more work, then the lb,ub and stride need not be modified. 2368 */ 2369 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2370 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) { 2371 #if OMPT_SUPPORT && OMPT_OPTIONAL 2372 OMPT_STORE_RETURN_ADDRESS(gtid); 2373 #endif 2374 return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st 2375 #if OMPT_SUPPORT && OMPT_OPTIONAL 2376 , 2377 OMPT_LOAD_RETURN_ADDRESS(gtid) 2378 #endif 2379 ); 2380 } 2381 2382 /*! 2383 See @ref __kmpc_dispatch_next_4 2384 */ 2385 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2386 kmp_uint32 *p_lb, kmp_uint32 *p_ub, 2387 kmp_int32 *p_st) { 2388 #if OMPT_SUPPORT && OMPT_OPTIONAL 2389 OMPT_STORE_RETURN_ADDRESS(gtid); 2390 #endif 2391 return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st 2392 #if OMPT_SUPPORT && OMPT_OPTIONAL 2393 , 2394 OMPT_LOAD_RETURN_ADDRESS(gtid) 2395 #endif 2396 ); 2397 } 2398 2399 /*! 2400 See @ref __kmpc_dispatch_next_4 2401 */ 2402 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2403 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) { 2404 #if OMPT_SUPPORT && OMPT_OPTIONAL 2405 OMPT_STORE_RETURN_ADDRESS(gtid); 2406 #endif 2407 return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st 2408 #if OMPT_SUPPORT && OMPT_OPTIONAL 2409 , 2410 OMPT_LOAD_RETURN_ADDRESS(gtid) 2411 #endif 2412 ); 2413 } 2414 2415 /*! 2416 See @ref __kmpc_dispatch_next_4 2417 */ 2418 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2419 kmp_uint64 *p_lb, kmp_uint64 *p_ub, 2420 kmp_int64 *p_st) { 2421 #if OMPT_SUPPORT && OMPT_OPTIONAL 2422 OMPT_STORE_RETURN_ADDRESS(gtid); 2423 #endif 2424 return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st 2425 #if OMPT_SUPPORT && OMPT_OPTIONAL 2426 , 2427 OMPT_LOAD_RETURN_ADDRESS(gtid) 2428 #endif 2429 ); 2430 } 2431 2432 /*! 2433 @param loc Source code location 2434 @param gtid Global thread id 2435 2436 Mark the end of a dynamic loop. 2437 */ 2438 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) { 2439 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2440 } 2441 2442 /*! 2443 See @ref __kmpc_dispatch_fini_4 2444 */ 2445 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) { 2446 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2447 } 2448 2449 /*! 2450 See @ref __kmpc_dispatch_fini_4 2451 */ 2452 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) { 2453 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2454 } 2455 2456 /*! 2457 See @ref __kmpc_dispatch_fini_4 2458 */ 2459 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) { 2460 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2461 } 2462 /*! @} */ 2463 2464 //----------------------------------------------------------------------------- 2465 // Non-template routines from kmp_dispatch.cpp used in other sources 2466 2467 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) { 2468 return value == checker; 2469 } 2470 2471 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) { 2472 return value != checker; 2473 } 2474 2475 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) { 2476 return value < checker; 2477 } 2478 2479 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) { 2480 return value >= checker; 2481 } 2482 2483 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) { 2484 return value <= checker; 2485 } 2486 2487 kmp_uint32 2488 __kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker, 2489 kmp_uint32 (*pred)(kmp_uint32, kmp_uint32), 2490 void *obj // Higher-level synchronization object, or NULL. 2491 ) { 2492 // note: we may not belong to a team at this point 2493 volatile kmp_uint32 *spin = spinner; 2494 kmp_uint32 check = checker; 2495 kmp_uint32 spins; 2496 kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred; 2497 kmp_uint32 r; 2498 2499 KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin)); 2500 KMP_INIT_YIELD(spins); 2501 // main wait spin loop 2502 while (!f(r = TCR_4(*spin), check)) { 2503 KMP_FSYNC_SPIN_PREPARE(obj); 2504 /* GEH - remove this since it was accidentally introduced when kmp_wait was 2505 split. It causes problems with infinite recursion because of exit lock */ 2506 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 2507 __kmp_abort_thread(); */ 2508 2509 /* if we have waited a bit, or are oversubscribed, yield */ 2510 /* pause is in the following code */ 2511 KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc); 2512 KMP_YIELD_SPIN(spins); 2513 } 2514 KMP_FSYNC_SPIN_ACQUIRED(obj); 2515 return r; 2516 } 2517 2518 void __kmp_wait_yield_4_ptr( 2519 void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32), 2520 void *obj // Higher-level synchronization object, or NULL. 2521 ) { 2522 // note: we may not belong to a team at this point 2523 void *spin = spinner; 2524 kmp_uint32 check = checker; 2525 kmp_uint32 spins; 2526 kmp_uint32 (*f)(void *, kmp_uint32) = pred; 2527 2528 KMP_FSYNC_SPIN_INIT(obj, spin); 2529 KMP_INIT_YIELD(spins); 2530 // main wait spin loop 2531 while (!f(spin, check)) { 2532 KMP_FSYNC_SPIN_PREPARE(obj); 2533 /* if we have waited a bit, or are oversubscribed, yield */ 2534 /* pause is in the following code */ 2535 KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc); 2536 KMP_YIELD_SPIN(spins); 2537 } 2538 KMP_FSYNC_SPIN_ACQUIRED(obj); 2539 } 2540 2541 } // extern "C" 2542 2543 #ifdef KMP_GOMP_COMPAT 2544 2545 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2546 enum sched_type schedule, kmp_int32 lb, 2547 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk, 2548 int push_ws) { 2549 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, 2550 push_ws); 2551 } 2552 2553 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2554 enum sched_type schedule, kmp_uint32 lb, 2555 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk, 2556 int push_ws) { 2557 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, 2558 push_ws); 2559 } 2560 2561 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2562 enum sched_type schedule, kmp_int64 lb, 2563 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk, 2564 int push_ws) { 2565 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, 2566 push_ws); 2567 } 2568 2569 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2570 enum sched_type schedule, kmp_uint64 lb, 2571 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk, 2572 int push_ws) { 2573 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, 2574 push_ws); 2575 } 2576 2577 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) { 2578 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc); 2579 } 2580 2581 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) { 2582 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc); 2583 } 2584 2585 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) { 2586 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc); 2587 } 2588 2589 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) { 2590 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc); 2591 } 2592 2593 #endif /* KMP_GOMP_COMPAT */ 2594 2595 /* ------------------------------------------------------------------------ */ 2596