/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 */

//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//

/* Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
 *       it may change values between parallel regions.  __kmp_max_nth
 *       is the largest value __kmp_nth may take, 1 is the smallest.
 */

// Need to raise Win version from XP to Vista here for support of
// InterlockedExchange64
#if defined(_WIN32_WINNT) && defined(_M_IX86)
#undef _WIN32_WINNT
#define _WIN32_WINNT 0x0502
#endif

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
#include <float.h>
#endif
#include "kmp_lock.h"
#include "kmp_dispatch.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(gtid_ref);

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_root->r.r_active &&
        (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }
}

void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
      __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
    }
  }
}

// Initialize a dispatch_private_info_template<T> buffer for a particular
// type of schedule,chunk. The loop description is found in lb (lower bound),
// ub (upper bound), and st (stride). nproc is the number of threads relevant
// to the scheduling (often the number of threads in a team, but not always if
// hierarchical scheduling is used). tid is the id of the thread calling
// the function within the group of nproc threads. It will have a value
// between 0 and nproc - 1. This is often just the thread id within a team, but
// is not necessarily the case when using hierarchical scheduling.
// loc is the source file location of the corresponding loop
// gtid is the global thread id
template <typename T>
void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                   dispatch_private_info_template<T> *pr,
                                   enum sched_type schedule, T lb, T ub,
                                   typename traits_t<T>::signed_t st,
#if USE_ITT_BUILD
                                   kmp_uint64 *cur_chunk,
#endif
                                   typename traits_t<T>::signed_t chunk,
                                   T nproc, T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::floating_t DBL;

  int active;
  T tc;
  kmp_info_t *th;
  kmp_team_t *team;

#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
                            "pr:%%p lb:%%%s ub:%%%s st:%%%s "
                            "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<ST>::spec,
                            traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;

#if USE_ITT_BUILD
  int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
                                    __kmp_forkjoin_frames_mode == 3 &&
                                    KMP_MASTER_GTID(gtid) &&
#if OMP_40_ENABLED
                                    th->th.th_teams_microtask == NULL &&
#endif
                                    team->t.t_active_level == 1;
#endif
#if (KMP_STATIC_STEAL_ENABLED)
  if (SCHEDULE_HAS_NONMONOTONIC(schedule))
    // AC: we now have only one implementation of stealing, so use it
    schedule = kmp_sch_static_steal;
  else
#endif
    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

  /* Pick up the nomerge/ordered bits from the scheduling type */
  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
    pr->flags.nomerge = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
  } else {
    pr->flags.nomerge = FALSE;
  }
  pr->type_size = traits_t<T>::type_size; // remember the size of variables
  if (kmp_ord_lower & schedule) {
    pr->flags.ordered = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
  } else {
    pr->flags.ordered = FALSE;
  }

  if (schedule == kmp_sch_static) {
    schedule = __kmp_static;
  } else {
    if (schedule == kmp_sch_runtime) {
      // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
      // not specified)
      schedule = team->t.t_sched.r_sched_type;
      // Detail the schedule if needed (global controls are differentiated
      // appropriately)
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      } else if (schedule == kmp_sch_static) {
        schedule = __kmp_static;
      }
      // Use the chunk size specified by OMP_SCHEDULE (or default if not
      // specified)
      chunk = team->t.t_sched.chunk;
#if USE_ITT_BUILD
      if (cur_chunk)
        *cur_chunk = chunk;
#endif
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
                                "schedule:%%d chunk:%%%s\n",
                                traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    } else {
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      }
      if (chunk <= 0) {
        chunk = KMP_DEFAULT_CHUNK;
      }
    }
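
    // Illustration of the kmp_sch_runtime path above (added note, assumes the
    // usual OMP_SCHEDULE parsing): with OMP_SCHEDULE="guided,4" a
    // schedule(runtime) loop arrives here as kmp_sch_runtime, picks up
    // kmp_sch_guided_chunked from team->t.t_sched.r_sched_type (detailed to
    // __kmp_guided) and takes chunk = 4 from team->t.t_sched.chunk.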

    if (schedule == kmp_sch_auto) {
      // mapping and differentiation: in the __kmp_do_serial_initialize()
      schedule = __kmp_auto;
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
            "schedule:%%d chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }

    /* guided analytical not safe for too many threads */
    if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
      schedule = kmp_sch_guided_iterative_chunked;
      KMP_WARNING(DispatchManyThreads);
    }
#if OMP_45_ENABLED
    if (schedule == kmp_sch_runtime_simd) {
      // compiler provides simd_width in the chunk parameter
      schedule = team->t.t_sched.r_sched_type;
      // Detail the schedule if needed (global controls are differentiated
      // appropriately)
      if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
          schedule == __kmp_static) {
        schedule = kmp_sch_static_balanced_chunked;
      } else {
        if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
          schedule = kmp_sch_guided_simd;
        }
        chunk = team->t.t_sched.chunk * chunk;
      }
#if USE_ITT_BUILD
      if (cur_chunk)
        *cur_chunk = chunk;
#endif
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d"
                                " chunk:%%%s\n",
                                traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }
#endif // OMP_45_ENABLED
    pr->u.p.parm1 = chunk;
  }
  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
              "unknown scheduling type");

  pr->u.p.count = 0;

  if (__kmp_env_consistency_check) {
    if (st == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
                            (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
    }
  }
  // compute trip count
  if (st == 1) { // most common case
    if (ub >= lb) {
      tc = ub - lb + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  } else if (st < 0) {
    if (lb >= ub) {
      // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(lb - ub) / (-st) + 1;
    } else { // lb < ub
      tc = 0; // zero-trip
    }
  } else { // st > 0
    if (ub >= lb) {
      // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(ub - lb) / st + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  }
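
  // Worked example of the unsigned trip-count formula above (added note, not
  // from the original source): with T = kmp_int32, lb = 2e9, ub = -2e9 + 1 and
  // st = -1e9, lb - ub would overflow a signed 32-bit value, but evaluated as
  // unsigned it is 3999999999, so tc = 3999999999 / 1000000000 + 1 = 4, which
  // matches the four iterations 2e9, 1e9, 0, -1e9 of (i=2B; i>-2B; i-=1B).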

  pr->u.p.lb = lb;
  pr->u.p.ub = ub;
  pr->u.p.st = st;
  pr->u.p.tc = tc;

#if KMP_OS_WINDOWS
  pr->u.p.last_upper = ub + st;
#endif /* KMP_OS_WINDOWS */

  /* NOTE: only the active parallel region(s) has active ordered sections */

  if (active) {
    if (pr->flags.ordered) {
      pr->ordered_bumped = 0;
      pr->u.p.ordered_lower = 1;
      pr->u.p.ordered_upper = 0;
    }
  }

  switch (schedule) {
#if (KMP_STATIC_STEAL_ENABLED)
  case kmp_sch_static_steal: {
    T ntc, init;

    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    ntc = (tc % chunk ? 1 : 0) + tc / chunk;
    if (nproc > 1 && ntc >= nproc) {
      KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
      T id = tid;
      T small_chunk, extras;

      small_chunk = ntc / nproc;
      extras = ntc % nproc;

      init = id * small_chunk + (id < extras ? id : extras);
      pr->u.p.count = init;
      pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);

      pr->u.p.parm2 = lb;
      // pr->pfields.parm3 = 0; // it's not used in static_steal
      pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
      pr->u.p.st = st;
      if (traits_t<T>::type_size > 4) {
        // AC: TODO: check if 16-byte CAS available and use it to
        // improve performance (probably wait for explicit request
        // before spending time on this).
        // For now use dynamically allocated per-thread lock,
        // free memory in __kmp_dispatch_next when status==0.
        KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
        th->th.th_dispatch->th_steal_lock =
            (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
        __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
      }
      break;
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_balanced\n",
                     gtid));
      schedule = kmp_sch_static_balanced;
      /* too few iterations: fall-through to kmp_sch_static_balanced */
    } // if
    /* FALL-THROUGH to static balanced */
  } // case
#endif
  case kmp_sch_static_balanced: {
    T init, limit;

    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));

    if (nproc > 1) {
      T id = tid;

      if (tc < nproc) {
        if (id < tc) {
          init = id;
          limit = id;
          pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
        } else {
          pr->u.p.count = 1; /* means no more chunks to execute */
          pr->u.p.parm1 = FALSE;
          break;
        }
      } else {
        T small_chunk = tc / nproc;
        T extras = tc % nproc;
        init = id * small_chunk + (id < extras ? id : extras);
        limit = init + small_chunk - (id < extras ? 0 : 1);
        pr->u.p.parm1 = (id == nproc - 1);
      }
    } else {
      if (tc > 0) {
        init = 0;
        limit = tc - 1;
        pr->u.p.parm1 = TRUE;
      } else {
        // zero trip count
        pr->u.p.count = 1; /* means no more chunks to execute */
        pr->u.p.parm1 = FALSE;
        break;
      }
    }
#if USE_ITT_BUILD
    // Calculate chunk for metadata report
    if (itt_need_metadata_reporting)
      if (cur_chunk)
        *cur_chunk = limit - init + 1;
#endif
    if (st == 1) {
      pr->u.p.lb = lb + init;
      pr->u.p.ub = lb + limit;
    } else {
      // calculated upper bound, "ub" is user-defined upper bound
      T ub_tmp = lb + limit * st;
      pr->u.p.lb = lb + init * st;
      // adjust upper bound to "ub" if needed, so that MS lastprivate will match
      // it exactly
      if (st > 0) {
        pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
      } else {
        pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
      }
    }
    if (pr->flags.ordered) {
      pr->u.p.ordered_lower = init;
      pr->u.p.ordered_upper = limit;
    }
    break;
  } // case
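  // Added note (not in the original source): kmp_sch_static_balanced gives each
  // thread at most one contiguous range of floor(tc/nproc) or ceil(tc/nproc)
  // iterations, with the first tc%nproc threads taking the larger size; e.g.
  // tc=10, nproc=4 yields per-thread counts 3,3,2,2. parm1 records whether this
  // thread owns the last iteration (plastiter).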
#if OMP_45_ENABLED
  case kmp_sch_static_balanced_chunked: {
    // similar to balanced, but chunk adjusted to multiple of simd width
    T nth = nproc;
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
                   " -> falling-through to static_greedy\n",
                   gtid));
    schedule = kmp_sch_static_greedy;
    if (nth > 1)
      pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
    else
      pr->u.p.parm1 = tc;
    break;
  } // case
  case kmp_sch_guided_simd:
#endif // OMP_45_ENABLED
  case kmp_sch_guided_iterative_chunked: {
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
         " case\n",
         gtid));

    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
      } else {
        // when remaining iters become less than parm2 - switch to dynamic
        pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
        *(double *)&pr->u.p.parm3 =
            guided_flt_param / nproc; // may occupy parm3 and parm4
      }
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
      KD_TRACE(
          100,
          ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
           gtid));
      pr->u.p.parm1 = tc;
    } // if
  } // case
  break;
  case kmp_sch_guided_analytical_chunked: {
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
                   gtid));

    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
      } else {
        /* commonly used term: (2 nproc - 1)/(2 nproc) */
        DBL x;

#if KMP_OS_WINDOWS && KMP_ARCH_X86
        /* Linux* OS already has 64-bit computation by default for long double,
           and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
           Windows* OS on IA-32 architecture, we need to set precision to 64-bit
           instead of the default 53-bit. Even though long double doesn't work
           on Windows* OS on Intel(R) 64, the resulting lack of precision is not
           expected to impact the correctness of the algorithm, but this has not
           been mathematically proven. */
        // save original FPCW and set precision to 64-bit, as
        // Windows* OS on IA-32 architecture defaults to 53-bit
        unsigned int oldFpcw = _control87(0, 0);
        _control87(_PC_64, _MCW_PC); // 0,0x30000
#endif
        /* value used for comparison in solver for cross-over point */
        long double target = ((long double)chunk * 2 + 1) * nproc / tc;

        /* crossover point--chunk indexes equal to or greater than
           this point switch to dynamic-style scheduling */
        UT cross;

        /* commonly used term: (2 nproc - 1)/(2 nproc) */
        x = (long double)1.0 - (long double)0.5 / nproc;

#ifdef KMP_DEBUG
        { // test natural alignment
          struct _test_a {
            char a;
            union {
              char b;
              DBL d;
            };
          } t;
          ptrdiff_t natural_alignment =
              (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
          //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
          // long)natural_alignment );
          KMP_DEBUG_ASSERT(
              (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
        }
#endif // KMP_DEBUG

        /* save the term in thread private dispatch structure */
        *(DBL *)&pr->u.p.parm3 = x;

        /* solve for the crossover point to the nearest integer i for which C_i
           <= chunk */
        {
          UT left, right, mid;
          long double p;

          /* estimate initial upper and lower bound */

          /* doesn't matter what value right is as long as it is positive, but
             it affects performance of the solver */
          right = 229;
          p = __kmp_pow<UT>(x, right);
          if (p > target) {
            do {
              p *= p;
              right <<= 1;
            } while (p > target && right < (1 << 27));
            /* lower bound is previous (failed) estimate of upper bound */
            left = right >> 1;
          } else {
            left = 0;
          }

          /* bisection root-finding method */
          while (left + 1 < right) {
            mid = (left + right) / 2;
            if (__kmp_pow<UT>(x, mid) > target) {
              left = mid;
            } else {
              right = mid;
            }
          } // while
          cross = right;
        }
        /* assert sanity of computed crossover point */
        KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
                   __kmp_pow<UT>(x, cross) <= target);

        /* save the crossover point in thread private dispatch structure */
        pr->u.p.parm2 = cross;

// C75803
#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
#else
#define GUIDED_ANALYTICAL_WORKAROUND (x)
#endif
        /* dynamic-style scheduling offset */
        pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
                                 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
                        cross * chunk;
#if KMP_OS_WINDOWS && KMP_ARCH_X86
        // restore FPCW
        _control87(oldFpcw, _MCW_PC);
#endif
      } // if
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
      pr->u.p.parm1 = tc;
    } // if
  } // case
  break;
  case kmp_sch_static_greedy:
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
         gtid));
    pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
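    // Illustration (added note, not from the original source): static_greedy
    // hands each thread a single chunk of ceil(tc/nproc) iterations; e.g.
    // tc=10, nproc=4 gives parm1 = 3, so threads 0..3 cover 3, 3, 3 and 1
    // iterations respectively.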
    break;
  case kmp_sch_static_chunked:
  case kmp_sch_dynamic_chunked:
    if (pr->u.p.parm1 <= 0) {
      pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
    }
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
                   gtid));
    break;
  case kmp_sch_trapezoidal: {
    /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */

    T parm1, parm2, parm3, parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
              gtid));

    parm1 = chunk;

    /* F : size of the first cycle */
    parm2 = (tc / (2 * nproc));

    if (parm2 < 1) {
      parm2 = 1;
    }

    /* L : size of the last cycle.  Make sure the last cycle is not larger
       than the first cycle. */
    if (parm1 < 1) {
      parm1 = 1;
    } else if (parm1 > parm2) {
      parm1 = parm2;
    }

    /* N : number of cycles */
    parm3 = (parm2 + parm1);
    parm3 = (2 * tc + parm3 - 1) / parm3;

    if (parm3 < 2) {
      parm3 = 2;
    }

    /* sigma : decreasing incr of the trapezoid */
    parm4 = (parm3 - 1);
    parm4 = (parm2 - parm1) / parm4;

    // pointless check, because parm4 >= 0 always
    // if ( parm4 < 0 ) {
    //   parm4 = 0;
    //}

    pr->u.p.parm1 = parm1;
    pr->u.p.parm2 = parm2;
    pr->u.p.parm3 = parm3;
    pr->u.p.parm4 = parm4;
  } // case
  break;
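
  // Worked example for the trapezoid case above (added note, not from the
  // original source): tc=100, nproc=2, chunk=1 gives F=parm2=25, L=parm1=1,
  // N=parm3=(200+26-1)/26=8 cycles and sigma=parm4=(25-1)/7=3, i.e. chunk
  // sizes 25,22,19,16,13,10,7,4 whose sum (116) covers the 100 iterations,
  // with the final chunk clipped to the trip count at dispatch time.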

  default: {
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                KMP_HNT(GetNewerLibrary), // Hint
                __kmp_msg_null // Variadic argument list terminator
                );
  } break;
  } // switch
  pr->schedule = schedule;
}

#if KMP_USE_HIER_SCHED
template <typename T>
inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
                                             typename traits_t<T>::signed_t st);
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
                                            kmp_int32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_int32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
                                             kmp_uint32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
                                            kmp_int64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_int64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
                                             kmp_uint64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}

// free all the hierarchy scheduling memory associated with the team
void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
  for (int i = 0; i < num_disp_buff; ++i) {
    // type does not matter here so use kmp_int32
    auto sh =
        reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
            &team->t.t_disp_buffer[i]);
    if (sh->hier) {
      sh->hier->deallocate();
      __kmp_free(sh->hier);
    }
  }
}
#endif

// UT - unsigned flavor of T, ST - signed flavor of T,
// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
template <typename T>
static void
__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
                    T ub, typename traits_t<T>::signed_t st,
                    typename traits_t<T>::signed_t chunk, int push_ws) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::floating_t DBL;

  int active;
  kmp_info_t *th;
  kmp_team_t *team;
  kmp_uint32 my_buffer_index;
  dispatch_private_info_template<T> *pr;
  dispatch_shared_info_template<T> volatile *sh;

  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
                   sizeof(dispatch_private_info));
  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
                   sizeof(dispatch_shared_info));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_INIT();
#endif
#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
                            "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
                            traits_t<ST>::spec, traits_t<T>::spec,
                            traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;
  th->th.th_ident = loc;

  // Any half-decent optimizer will remove this test when the blocks are empty
  // since the macros expand to nothing
  // when statistics are disabled.
  if (schedule == __kmp_static) {
    KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
  } else {
    KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
  }

#if KMP_USE_HIER_SCHED
  // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable
  // Hierarchical scheduling does not work with ordered, so if ordered is
  // detected, then revert back to threaded scheduling.
  bool ordered;
  enum sched_type my_sched = schedule;
  my_buffer_index = th->th.th_dispatch->th_disp_index;
  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
      &th->th.th_dispatch
           ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
  if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
    my_sched =
        (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
  ordered = (kmp_ord_lower & my_sched);
  if (pr->flags.use_hier) {
    if (ordered) {
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
" 800 "Disabling hierarchical scheduling.\n", 801 gtid)); 802 pr->flags.use_hier = FALSE; 803 } 804 } 805 if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) { 806 // Don't use hierarchical for ordered parallel loops and don't 807 // use the runtime hierarchy if one was specified in the program 808 if (!ordered && !pr->flags.use_hier) 809 __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st); 810 } 811 #endif // KMP_USE_HIER_SCHED 812 813 #if USE_ITT_BUILD 814 kmp_uint64 cur_chunk = chunk; 815 int itt_need_metadata_reporting = __itt_metadata_add_ptr && 816 __kmp_forkjoin_frames_mode == 3 && 817 KMP_MASTER_GTID(gtid) && 818 #if OMP_40_ENABLED 819 th->th.th_teams_microtask == NULL && 820 #endif 821 team->t.t_active_level == 1; 822 #endif 823 if (!active) { 824 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 825 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 826 } else { 827 KMP_DEBUG_ASSERT(th->th.th_dispatch == 828 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 829 830 my_buffer_index = th->th.th_dispatch->th_disp_index++; 831 832 /* What happens when number of threads changes, need to resize buffer? */ 833 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 834 &th->th.th_dispatch 835 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 836 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 837 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 838 KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid, 839 my_buffer_index)); 840 } 841 842 __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st, 843 #if USE_ITT_BUILD 844 &cur_chunk, 845 #endif 846 chunk, (T)th->th.th_team_nproc, 847 (T)th->th.th_info.ds.ds_tid); 848 if (active) { 849 if (pr->flags.ordered == 0) { 850 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error; 851 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error; 852 } else { 853 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>; 854 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>; 855 } 856 } 857 858 if (active) { 859 /* The name of this buffer should be my_buffer_index when it's free to use 860 * it */ 861 862 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d " 863 "sh->buffer_index:%d\n", 864 gtid, my_buffer_index, sh->buffer_index)); 865 __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index, 866 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL)); 867 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and 868 // my_buffer_index are *always* 32-bit integers. 869 KMP_MB(); /* is this necessary? 
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));

    th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
    th->th.th_dispatch->th_dispatch_sh_current =
        CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
#if USE_ITT_BUILD
    if (pr->flags.ordered) {
      __kmp_itt_ordered_init(gtid);
    }
    // Report loop metadata
    if (itt_need_metadata_reporting) {
      // Only report metadata by master of active team at level 1
      kmp_uint64 schedtype = 0;
      switch (schedule) {
      case kmp_sch_static_chunked:
      case kmp_sch_static_balanced: // Chunk is calculated in the switch above
        break;
      case kmp_sch_static_greedy:
        cur_chunk = pr->u.p.parm1;
        break;
      case kmp_sch_dynamic_chunked:
        schedtype = 1;
        break;
      case kmp_sch_guided_iterative_chunked:
      case kmp_sch_guided_analytical_chunked:
#if OMP_45_ENABLED
      case kmp_sch_guided_simd:
#endif
        schedtype = 2;
        break;
      default:
        // Should we put this case under "static"?
        // case kmp_sch_static_steal:
        schedtype = 3;
        break;
      }
      __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
    }
#if KMP_USE_HIER_SCHED
    if (pr->flags.use_hier) {
      pr->u.p.count = 0;
      pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
    }
#endif // KMP_USE_HIER_SCHED
#endif /* USE_ITT_BUILD */
  }

#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
        "lb:%%%s ub:%%%s"
        " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
        " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
        traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
        traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
        traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
        traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
                  pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
                  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
                  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
    __kmp_str_free(&buff);
  }
#endif
#if (KMP_STATIC_STEAL_ENABLED)
  // It cannot be guaranteed that after execution of a loop with some other
  // schedule kind all the parm3 variables will contain the same value. Even if
  // all parm3 values were the same, a bad case could still arise, e.g.
  // alternating between 0 and 1 rather than a program-lifetime increment.
  // So a dedicated variable is required; 'static_steal_counter' is used.
  if (schedule == kmp_sch_static_steal) {
    // Other threads will inspect this variable when searching for a victim.
    // This is a flag showing that other threads may steal from this thread
    // from now on.
    volatile T *p = &pr->u.p.static_steal_counter;
    *p = *p + 1;
  }
#endif // ( KMP_STATIC_STEAL_ENABLED )

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
        &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
  }
#endif
  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
}

/* For ordered loops, either __kmp_dispatch_finish() should be called after
 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
 * every chunk of iterations.  If the ordered section(s) were not executed
 * for this iteration (or every iteration in this chunk), we need to set the
 * ordered iteration counters so that the next thread can proceed. */
template <typename UT>
static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {

    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    if (pr->ordered_bumped) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      UT lower = pr->u.p.ordered_lower;

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
                           __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
      KMP_MB(); /* is this necessary? */
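      // Added note (not in the original source): the wait above blocks until
      // the shared ordered_iteration counter reaches this thread's
      // ordered_lower; the test_then_inc below then advances the counter so
      // the thread owning the next iteration in logical order can proceed.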
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
    } // if
  } // if
  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
}

#ifdef KMP_GOMP_COMPAT

template <typename UT>
static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {
    // int cid;
    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
    UT lower = pr->u.p.ordered_lower;
    UT upper = pr->u.p.ordered_upper;
    UT inc = upper - lower + 1;

    if (pr->ordered_bumped == inc) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      inc -= pr->ordered_bumped;

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d before wait: "
            "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
                           __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));

      KMP_MB(); /* is this necessary? */
      KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
                      "ordered_bumped to zero\n",
                      gtid));
      pr->ordered_bumped = 0;
//!!!!! TODO check if the inc should be unsigned, or signed???
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d after wait: "
            "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
            traits_t<UT>::spec);
        KD_TRACE(1000,
                 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
    }
    // }
  }
  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
}

#endif /* KMP_GOMP_COMPAT */

template <typename T>
int __kmp_dispatch_next_algorithm(int gtid,
                                  dispatch_private_info_template<T> *pr,
                                  dispatch_shared_info_template<T> volatile *sh,
                                  kmp_int32 *p_last, T *p_lb, T *p_ub,
                                  typename traits_t<T>::signed_t *p_st, T nproc,
                                  T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::floating_t DBL;
  int status = 0;
  kmp_int32 last = 0;
  T start;
  ST incr;
  UT limit, trip, init;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
  KMP_DEBUG_ASSERT(pr);
  KMP_DEBUG_ASSERT(sh);
  KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff =
        __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
                         "sh:%%p nproc:%%%s tid:%%%s\n",
                         traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
    __kmp_str_free(&buff);
  }
#endif

  // zero trip count
  if (pr->u.p.tc == 0) {
    KD_TRACE(10,
             ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
              "zero status:%d\n",
              gtid, status));
    return 0;
  }

  switch (pr->schedule) {
#if (KMP_STATIC_STEAL_ENABLED)
  case kmp_sch_static_steal: {
    T chunk = pr->u.p.parm1;

    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    trip = pr->u.p.tc - 1;

    if (traits_t<T>::type_size > 4) {
      // use lock for 8-byte and CAS for 4-byte induction
      // variable. TODO (optional): check and use 16-byte CAS
      kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
      KMP_DEBUG_ASSERT(lck != NULL);
      if (pr->u.p.count < (UT)pr->u.p.ub) {
        __kmp_acquire_lock(lck, gtid);
        // try to get own chunk of iterations
        init = (pr->u.p.count)++;
        status = (init < (UT)pr->u.p.ub);
        __kmp_release_lock(lck, gtid);
      } else {
        status = 0; // no own chunks
      }
      if (!status) { // try to steal
        kmp_info_t **other_threads = team->t.t_threads;
        int while_limit = nproc; // nproc attempts to find a victim
        int while_index = 0;
        // TODO: algorithm of searching for a victim
        // should be cleaned up and measured
        while ((!status) && (while_limit != ++while_index)) {
          T remaining;
          T victimIdx = pr->u.p.parm4;
          T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
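          // Added note (not in the original source): the search starts at the
          // previously remembered victim (parm4) and walks the team
          // round-robin; a candidate only qualifies once its
          // static_steal_counter matches ours, i.e. it has initialized the
          // same static_steal loop instance.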
          dispatch_private_info_template<T> *victim =
              reinterpret_cast<dispatch_private_info_template<T> *>(
                  other_threads[victimIdx]
                      ->th.th_dispatch->th_dispatch_pr_current);
          while ((victim == NULL || victim == pr ||
                  (*(volatile T *)&victim->u.p.static_steal_counter !=
                   *(volatile T *)&pr->u.p.static_steal_counter)) &&
                 oldVictimIdx != victimIdx) {
            victimIdx = (victimIdx + 1) % nproc;
            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
                other_threads[victimIdx]
                    ->th.th_dispatch->th_dispatch_pr_current);
          }
          if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
                          *(volatile T *)&pr->u.p.static_steal_counter)) {
            continue; // try once more (nproc attempts in total)
            // no victim is ready yet to participate in stealing
            // because all victims are still in kmp_init_dispatch
          }
          if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
            pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
            continue; // not enough chunks to steal, goto next victim
          }

          lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
          KMP_ASSERT(lck != NULL);
          __kmp_acquire_lock(lck, gtid);
          limit = victim->u.p.ub; // keep initial ub
          if (victim->u.p.count >= limit ||
              (remaining = limit - victim->u.p.count) < 2) {
            __kmp_release_lock(lck, gtid);
            pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
            continue; // not enough chunks to steal
          }
          // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or
          // by 1
          if (remaining > 3) {
            // steal 1/4 of remaining
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
            init = (victim->u.p.ub -= (remaining >> 2));
          } else {
            // steal 1 chunk of 2 or 3 remaining
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
            init = (victim->u.p.ub -= 1);
          }
          __kmp_release_lock(lck, gtid);

          KMP_DEBUG_ASSERT(init + 1 <= limit);
          pr->u.p.parm4 = victimIdx; // remember victim to steal from
          status = 1;
          while_index = 0;
          // now update own count and ub with stolen range, excluding the init
          // chunk returned by this call
          __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
          pr->u.p.count = init + 1;
          pr->u.p.ub = limit;
          __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
        } // while (search for victim)
      } // if (try to find victim and steal)
    } else {
      // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
      typedef union {
        struct {
          UT count;
          T ub;
        } p;
        kmp_int64 b;
      } union_i4;
      // All operations on 'count' or 'ub' must be combined atomically
      // together.
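      // Added note (not in the original source): packing {count, ub} into one
      // 64-bit word lets a single CAS claim a chunk (count++) while observing
      // an ub value that is consistent with concurrent thieves lowering it.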
      {
        union_i4 vold, vnew;
        vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
        vnew = vold;
        vnew.p.count++;
        while (!KMP_COMPARE_AND_STORE_ACQ64(
            (volatile kmp_int64 *)&pr->u.p.count,
            *VOLATILE_CAST(kmp_int64 *) & vold.b,
            *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
          KMP_CPU_PAUSE();
          vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
          vnew = vold;
          vnew.p.count++;
        }
        vnew = vold;
        init = vnew.p.count;
        status = (init < (UT)vnew.p.ub);
      }

      if (!status) {
        kmp_info_t **other_threads = team->t.t_threads;
        int while_limit = nproc; // nproc attempts to find a victim
        int while_index = 0;

        // TODO: algorithm of searching for a victim
        // should be cleaned up and measured
        while ((!status) && (while_limit != ++while_index)) {
          union_i4 vold, vnew;
          kmp_int32 remaining;
          T victimIdx = pr->u.p.parm4;
          T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
          dispatch_private_info_template<T> *victim =
              reinterpret_cast<dispatch_private_info_template<T> *>(
                  other_threads[victimIdx]
                      ->th.th_dispatch->th_dispatch_pr_current);
          while ((victim == NULL || victim == pr ||
                  (*(volatile T *)&victim->u.p.static_steal_counter !=
                   *(volatile T *)&pr->u.p.static_steal_counter)) &&
                 oldVictimIdx != victimIdx) {
            victimIdx = (victimIdx + 1) % nproc;
            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
                other_threads[victimIdx]
                    ->th.th_dispatch->th_dispatch_pr_current);
          }
          if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
                          *(volatile T *)&pr->u.p.static_steal_counter)) {
            continue; // try once more (nproc attempts in total)
            // no victim is ready yet to participate in stealing
            // because all victims are still in kmp_init_dispatch
          }
          pr->u.p.parm4 = victimIdx; // new victim found
          while (1) { // CAS loop if victim has enough chunks to steal
            vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
            vnew = vold;

            KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
            if (vnew.p.count >= (UT)vnew.p.ub ||
                (remaining = vnew.p.ub - vnew.p.count) < 2) {
              pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
              break; // not enough chunks to steal, goto next victim
            }
            if (remaining > 3) {
              vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
            } else {
              vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
            }
            KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
            // TODO: Should this be acquire or release?
            if (KMP_COMPARE_AND_STORE_ACQ64(
                    (volatile kmp_int64 *)&victim->u.p.count,
                    *VOLATILE_CAST(kmp_int64 *) & vold.b,
                    *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
              // stealing succeeded
              KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
                                        vold.p.ub - vnew.p.ub);
              status = 1;
              while_index = 0;
              // now update own count and ub
              init = vnew.p.ub;
              vold.p.count = init + 1;
#if KMP_ARCH_X86
              KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
#else
              *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
#endif
              break;
            } // if (check CAS result)
            KMP_CPU_PAUSE(); // CAS failed, repeat attempt
          } // while (try to steal from particular victim)
        } // while (search for victim)
      } // if (try to find victim and steal)
    } // if (4-byte induction variable)
    if (!status) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.parm2;
      init *= chunk;
      limit = chunk + init - 1;
      incr = pr->u.p.st;
      KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);

      KMP_DEBUG_ASSERT(init <= trip);
      if ((last = (limit >= trip)) != 0)
        limit = trip;
      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
    break;
  } // case
#endif // ( KMP_STATIC_STEAL_ENABLED )
  case kmp_sch_static_balanced: {
    KD_TRACE(
        10,
        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));
    /* check if thread has any iteration to do */
    if ((status = !pr->u.p.count) != 0) {
      pr->u.p.count = 1;
      *p_lb = pr->u.p.lb;
      *p_ub = pr->u.p.ub;
      last = pr->u.p.parm1;
      if (p_st != NULL)
        *p_st = pr->u.p.st;
    } else { /* no iterations to do */
      pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
    }
  } // case
  break;
  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
                                 merged here */
  case kmp_sch_static_chunked: {
    T parm1;

    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_static_[affinity|chunked] case\n",
                   gtid));
    parm1 = pr->u.p.parm1;

    trip = pr->u.p.tc - 1;
    init = parm1 * (pr->u.p.count + tid);

    if ((status = (init <= trip)) != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      limit = parm1 + init - 1;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      pr->u.p.count += nproc;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
  } // case
  break;

  case kmp_sch_dynamic_chunked: {
    T chunk = pr->u.p.parm1;

    KD_TRACE(
        100,
        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
         gtid));

    init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
    trip = pr->u.p.tc - 1;
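    // Added note (not in the original source): sh->u.s.iteration is a shared
    // chunk counter; each call atomically claims the next chunk index, so with
    // chunk=4 successive calls across the team obtain init = 0, 4, 8, ...
    // until init exceeds trip.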

    if ((status = (init <= trip)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.lb;
      limit = chunk + init - 1;
      incr = pr->u.p.st;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
  } // case
  break;

  case kmp_sch_guided_iterative_chunked: {
    T chunkspec = pr->u.p.parm1;
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
                   "iterative case\n",
                   gtid));
    trip = pr->u.p.tc;
    // Start atomic part of calculations
    while (1) {
      ST remaining; // signed, because can be < 0
      init = sh->u.s.iteration; // shared value
      remaining = trip - init;
      if (remaining <= 0) { // AC: need to compare with 0 first
        // nothing to do, don't try atomic op
        status = 0;
        break;
      }
      if ((T)remaining <
          pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
        // use dynamic-style schedule
        // atomically increment iterations, get old value
        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                 (ST)chunkspec);
        remaining = trip - init;
        if (remaining <= 0) {
          status = 0; // all iterations got by other threads
        } else {
          // got some iterations to work on
          status = 1;
          if ((T)remaining > chunkspec) {
            limit = init + chunkspec - 1;
          } else {
            last = 1; // the last chunk
            limit = init + remaining - 1;
          } // if
        } // if
        break;
      } // if
      limit = init +
              (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                               (ST)init, (ST)limit)) {
        // CAS was successful, chunk obtained
        status = 1;
        --limit;
        break;
      } // if
    } // while
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } // if
  } // case
  break;

#if OMP_45_ENABLED
  case kmp_sch_guided_simd: {
    // same as iterative but curr-chunk adjusted to be multiple of given
    // chunk
    T chunk = pr->u.p.parm1;
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
              gtid));
    trip = pr->u.p.tc;
    // Start atomic part of calculations
    while (1) {
      ST remaining; // signed, because can be < 0
      init = sh->u.s.iteration; // shared value
      remaining = trip - init;
      if (remaining <= 0) { // AC: need to compare with 0 first
        status = 0; // nothing to do, don't try atomic op
        break;
      }
      KMP_DEBUG_ASSERT(init % chunk == 0);
      // compare with K*nproc*(chunk+1), K=2 by default
      if ((T)remaining < pr->u.p.parm2) {
        // use dynamic-style schedule
        // atomically increment iterations, get old value
        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                 (ST)chunk);
        remaining = trip - init;
        if (remaining <= 0) {
          status = 0; // all iterations got by other threads
        } else {
          // got some iterations to work on
          status = 1;
          if ((T)remaining > chunk) {
            limit = init + chunk - 1;
          } else {
            last = 1; // the last chunk
            limit = init + remaining - 1;
          } // if
        } // if
        break;
      } // if
      // divide by K*nproc
      UT span = remaining * (*(double *)&pr->u.p.parm3);
      UT rem = span % chunk;
      if (rem) // adjust so that span%chunk == 0
        span += chunk - rem;
      limit = init + span;
      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                               (ST)init, (ST)limit)) {
        // CAS was successful, chunk obtained
        status = 1;
        --limit;
        break;
      } // if
    } // while
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } // if
  } // case
  break;
#endif // OMP_45_ENABLED

  case kmp_sch_guided_analytical_chunked: {
    T chunkspec = pr->u.p.parm1;
    UT chunkIdx;
#if KMP_OS_WINDOWS && KMP_ARCH_X86
    /* for storing original FPCW value for Windows* OS on
       IA-32 architecture 8-byte version */
    unsigned int oldFpcw;
    unsigned int fpcwSet = 0;
#endif
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
                   gtid));

    trip = pr->u.p.tc;

    KMP_DEBUG_ASSERT(nproc > 1);
    KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);

    while (1) { /* this while loop is a safeguard against unexpected zero
                   chunk sizes */
      chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
      if (chunkIdx >= (UT)pr->u.p.parm2) {
        --trip;
        /* use dynamic-style scheduling */
        init = chunkIdx * chunkspec + pr->u.p.count;
        /* need to verify init > 0 in case of overflow in the above
         * calculation */
        if ((status = (init > 0 && init <= trip)) != 0) {
          limit = init + chunkspec - 1;

          if ((last = (limit >= trip)) != 0)
            limit = trip;
        }
        break;
      } else {
        /* use exponential-style scheduling */
        /* The following check is to workaround the lack of long double precision on
           Windows* OS.
           This check works around the possible effect that init != 0 for chunkIdx == 0.
        */
#if KMP_OS_WINDOWS && KMP_ARCH_X86
        /* If we haven't already done so, save original
           FPCW and set precision to 64-bit, as Windows* OS
           on IA-32 architecture defaults to 53-bit */
        if (!fpcwSet) {
          oldFpcw = _control87(0, 0);
          _control87(_PC_64, _MCW_PC);
          fpcwSet = 0x30000;
        }
#endif
        if (chunkIdx) {
          init = __kmp_dispatch_guided_remaining<T>(
              trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
          KMP_DEBUG_ASSERT(init);
          init = trip - init;
        } else
          init = 0;
        limit = trip - __kmp_dispatch_guided_remaining<T>(
                           trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
        KMP_ASSERT(init <= limit);
        if (init < limit) {
          KMP_DEBUG_ASSERT(limit <= trip);
          --limit;
          status = 1;
          break;
        } // if
      } // if
    } // while (1)
#if KMP_OS_WINDOWS && KMP_ARCH_X86
    /* restore FPCW if necessary
       AC: check fpcwSet flag first because oldFpcw can be uninitialized here
    */
    if (fpcwSet && (oldFpcw & fpcwSet))
      _control87(oldFpcw, _MCW_PC);
#endif
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      }
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    }
  } // case
  break;

  case kmp_sch_trapezoidal: {
    UT index;
    T parm2 = pr->u.p.parm2;
    T parm3 = pr->u.p.parm3;
    T parm4 = pr->u.p.parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
              gtid));

    index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);

    init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
    trip = pr->u.p.tc - 1;

    if ((status = ((T)index < parm3 && init <= trip)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.lb;
      limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
      incr = pr->u.p.st;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
  } // case
  break;
  default: {
    status = 0; // to avoid complaints on uninitialized variable use
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                KMP_HNT(GetNewerLibrary), // Hint
                __kmp_msg_null // Variadic argument list terminator
                );
  } break;
  } // switch
  if (p_last)
    *p_last = last;
#ifdef KMP_DEBUG
  if (pr->flags.ordered) {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
                            "ordered_lower:%%%s ordered_upper:%%%s\n",
                            traits_t<UT>::spec, traits_t<UT>::spec);
    KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
    __kmp_str_free(&buff);
  }
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
        "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
p_st:%%%s\n", 1799 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 1800 KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st)); 1801 __kmp_str_free(&buff); 1802 } 1803 #endif 1804 return status; 1805 } 1806 1807 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more 1808 work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini() 1809 is not called. */ 1810 #if OMPT_SUPPORT && OMPT_OPTIONAL 1811 #define OMPT_LOOP_END \ 1812 if (status == 0) { \ 1813 if (ompt_enabled.ompt_callback_work) { \ 1814 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \ 1815 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \ 1816 ompt_callbacks.ompt_callback(ompt_callback_work)( \ 1817 ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \ 1818 &(task_info->task_data), 0, codeptr); \ 1819 } \ 1820 } 1821 // TODO: implement count 1822 #else 1823 #define OMPT_LOOP_END // no-op 1824 #endif 1825 1826 #if KMP_STATS_ENABLED 1827 #define KMP_STATS_LOOP_END \ 1828 { \ 1829 kmp_int64 u, l, t, i; \ 1830 l = (kmp_int64)(*p_lb); \ 1831 u = (kmp_int64)(*p_ub); \ 1832 i = (kmp_int64)(pr->u.p.st); \ 1833 if (status == 0) { \ 1834 t = 0; \ 1835 KMP_POP_PARTITIONED_TIMER(); \ 1836 } else if (i == 1) { \ 1837 if (u >= l) \ 1838 t = u - l + 1; \ 1839 else \ 1840 t = 0; \ 1841 } else if (i < 0) { \ 1842 if (l >= u) \ 1843 t = (l - u) / (-i) + 1; \ 1844 else \ 1845 t = 0; \ 1846 } else { \ 1847 if (u >= l) \ 1848 t = (u - l) / i + 1; \ 1849 else \ 1850 t = 0; \ 1851 } \ 1852 KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \ 1853 } 1854 #else 1855 #define KMP_STATS_LOOP_END /* Nothing */ 1856 #endif 1857 1858 template <typename T> 1859 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, 1860 T *p_lb, T *p_ub, 1861 typename traits_t<T>::signed_t *p_st 1862 #if OMPT_SUPPORT && OMPT_OPTIONAL 1863 , 1864 void *codeptr 1865 #endif 1866 ) { 1867 1868 typedef typename traits_t<T>::unsigned_t UT; 1869 typedef typename traits_t<T>::signed_t ST; 1870 typedef typename traits_t<T>::floating_t DBL; 1871 // This is potentially slightly misleading: schedule(runtime) will appear here 1872 // even if the actual runtime schedule is static. (Which points out a 1873 // disadvantage of schedule(runtime): even when static scheduling is used, it 1874 // costs more than a compile-time choice to use static scheduling would.)
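// Illustrative sketch (not part of the library; hypothetical driver code with
// an assumed trip count N and loop body()): the note above matters because a
// compiler lowers schedule(runtime) to this dynamic dispatch protocol, so even
// when OMP_SCHEDULE resolves to a static schedule, every chunk still costs a
// __kmpc_dispatch_next_* call:
//
//   __kmpc_dispatch_init_4(loc, gtid, kmp_sch_runtime, 0, N - 1, 1, 1);
//   kmp_int32 lb, ub, st, last;
//   while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
//     for (kmp_int32 i = lb; i <= ub; i += st)
//       body(i); // chunk bounds returned in lb/ub are inclusive
//   }
//
// whereas a compile-time static schedule needs only a single
// __kmpc_for_static_init_4 / __kmpc_for_static_fini pair.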
1875 KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling); 1876 1877 int status; 1878 dispatch_private_info_template<T> *pr; 1879 kmp_info_t *th = __kmp_threads[gtid]; 1880 kmp_team_t *team = th->th.th_team; 1881 1882 KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL 1883 KD_TRACE( 1884 1000, 1885 ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n", 1886 gtid, p_lb, p_ub, p_st, p_last)); 1887 1888 if (team->t.t_serialized) { 1889 /* NOTE: serialize this dispatch because we are not at the active level */ 1890 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 1891 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 1892 KMP_DEBUG_ASSERT(pr); 1893 1894 if ((status = (pr->u.p.tc != 0)) == 0) { 1895 *p_lb = 0; 1896 *p_ub = 0; 1897 // if ( p_last != NULL ) 1898 // *p_last = 0; 1899 if (p_st != NULL) 1900 *p_st = 0; 1901 if (__kmp_env_consistency_check) { 1902 if (pr->pushed_ws != ct_none) { 1903 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 1904 } 1905 } 1906 } else if (pr->flags.nomerge) { 1907 kmp_int32 last; 1908 T start; 1909 UT limit, trip, init; 1910 ST incr; 1911 T chunk = pr->u.p.parm1; 1912 1913 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", 1914 gtid)); 1915 1916 init = chunk * pr->u.p.count++; 1917 trip = pr->u.p.tc - 1; 1918 1919 if ((status = (init <= trip)) == 0) { 1920 *p_lb = 0; 1921 *p_ub = 0; 1922 // if ( p_last != NULL ) 1923 // *p_last = 0; 1924 if (p_st != NULL) 1925 *p_st = 0; 1926 if (__kmp_env_consistency_check) { 1927 if (pr->pushed_ws != ct_none) { 1928 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 1929 } 1930 } 1931 } else { 1932 start = pr->u.p.lb; 1933 limit = chunk + init - 1; 1934 incr = pr->u.p.st; 1935 1936 if ((last = (limit >= trip)) != 0) { 1937 limit = trip; 1938 #if KMP_OS_WINDOWS 1939 pr->u.p.last_upper = pr->u.p.ub; 1940 #endif /* KMP_OS_WINDOWS */ 1941 } 1942 if (p_last != NULL) 1943 *p_last = last; 1944 if (p_st != NULL) 1945 *p_st = incr; 1946 if (incr == 1) { 1947 *p_lb = start + init; 1948 *p_ub = start + limit; 1949 } else { 1950 *p_lb = start + init * incr; 1951 *p_ub = start + limit * incr; 1952 } 1953 1954 if (pr->flags.ordered) { 1955 pr->u.p.ordered_lower = init; 1956 pr->u.p.ordered_upper = limit; 1957 #ifdef KMP_DEBUG 1958 { 1959 char *buff; 1960 // create format specifiers before the debug output 1961 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 1962 "ordered_lower:%%%s ordered_upper:%%%s\n", 1963 traits_t<UT>::spec, traits_t<UT>::spec); 1964 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 1965 pr->u.p.ordered_upper)); 1966 __kmp_str_free(&buff); 1967 } 1968 #endif 1969 } // if 1970 } // if 1971 } else { 1972 pr->u.p.tc = 0; 1973 *p_lb = pr->u.p.lb; 1974 *p_ub = pr->u.p.ub; 1975 #if KMP_OS_WINDOWS 1976 pr->u.p.last_upper = *p_ub; 1977 #endif /* KMP_OS_WINDOWS */ 1978 if (p_last != NULL) 1979 *p_last = TRUE; 1980 if (p_st != NULL) 1981 *p_st = pr->u.p.st; 1982 } // if 1983 #ifdef KMP_DEBUG 1984 { 1985 char *buff; 1986 // create format specifiers before the debug output 1987 buff = __kmp_str_format( 1988 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " 1989 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", 1990 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 1991 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status)); 1992 __kmp_str_free(&buff); 1993 } 1994 #endif 1995 #if INCLUDE_SSC_MARKS 1996 SSC_MARK_DISPATCH_NEXT(); 1997 #endif 1998 OMPT_LOOP_END; 1999
KMP_STATS_LOOP_END; 2000 return status; 2001 } else { 2002 kmp_int32 last = 0; 2003 dispatch_shared_info_template<T> volatile *sh; 2004 2005 KMP_DEBUG_ASSERT(th->th.th_dispatch == 2006 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 2007 2008 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 2009 th->th.th_dispatch->th_dispatch_pr_current); 2010 KMP_DEBUG_ASSERT(pr); 2011 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 2012 th->th.th_dispatch->th_dispatch_sh_current); 2013 KMP_DEBUG_ASSERT(sh); 2014 2015 #if KMP_USE_HIER_SCHED 2016 if (pr->flags.use_hier) 2017 status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st); 2018 else 2019 #endif // KMP_USE_HIER_SCHED 2020 status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub, 2021 p_st, th->th.th_team_nproc, 2022 th->th.th_info.ds.ds_tid); 2023 // status == 0: no more iterations to execute 2024 if (status == 0) { 2025 UT num_done; 2026 2027 num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done); 2028 #ifdef KMP_DEBUG 2029 { 2030 char *buff; 2031 // create format specifiers before the debug output 2032 buff = __kmp_str_format( 2033 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", 2034 traits_t<UT>::spec); 2035 KD_TRACE(10, (buff, gtid, sh->u.s.num_done)); 2036 __kmp_str_free(&buff); 2037 } 2038 #endif 2039 2040 #if KMP_USE_HIER_SCHED 2041 pr->flags.use_hier = FALSE; 2042 #endif 2043 if ((ST)num_done == th->th.th_team_nproc - 1) { 2044 #if (KMP_STATIC_STEAL_ENABLED) 2045 if (pr->schedule == kmp_sch_static_steal && 2046 traits_t<T>::type_size > 4) { 2047 int i; 2048 kmp_info_t **other_threads = team->t.t_threads; 2049 // loop complete, safe to destroy locks used for stealing 2050 for (i = 0; i < th->th.th_team_nproc; ++i) { 2051 kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock; 2052 KMP_ASSERT(lck != NULL); 2053 __kmp_destroy_lock(lck); 2054 __kmp_free(lck); 2055 other_threads[i]->th.th_dispatch->th_steal_lock = NULL; 2056 } 2057 } 2058 #endif 2059 /* NOTE: release this buffer to be reused */ 2060 2061 KMP_MB(); /* Flush all pending memory write invalidates. */ 2062 2063 sh->u.s.num_done = 0; 2064 sh->u.s.iteration = 0; 2065 2066 /* TODO replace with general release procedure? */ 2067 if (pr->flags.ordered) { 2068 sh->u.s.ordered_iteration = 0; 2069 } 2070 2071 KMP_MB(); /* Flush all pending memory write invalidates. */ 2072 2073 sh->buffer_index += __kmp_dispatch_num_buffers; 2074 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", 2075 gtid, sh->buffer_index)); 2076 2077 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 2078 2079 } // if 2080 if (__kmp_env_consistency_check) { 2081 if (pr->pushed_ws != ct_none) { 2082 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 2083 } 2084 } 2085 2086 th->th.th_dispatch->th_deo_fcn = NULL; 2087 th->th.th_dispatch->th_dxo_fcn = NULL; 2088 th->th.th_dispatch->th_dispatch_sh_current = NULL; 2089 th->th.th_dispatch->th_dispatch_pr_current = NULL; 2090 } // if (status == 0) 2091 #if KMP_OS_WINDOWS 2092 else if (last) { 2093 pr->u.p.last_upper = pr->u.p.ub; 2094 } 2095 #endif /* KMP_OS_WINDOWS */ 2096 if (p_last != NULL && status != 0) 2097 *p_last = last; 2098 } // if 2099 2100 #ifdef KMP_DEBUG 2101 { 2102 char *buff; 2103 // create format specifiers before the debug output 2104 buff = __kmp_str_format( 2105 "__kmp_dispatch_next: T#%%d normal case: " 2106 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n", 2107 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 2108 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, 2109 (p_last ? *p_last : 0), status)); 2110 __kmp_str_free(&buff); 2111 } 2112 #endif 2113 #if INCLUDE_SSC_MARKS 2114 SSC_MARK_DISPATCH_NEXT(); 2115 #endif 2116 OMPT_LOOP_END; 2117 KMP_STATS_LOOP_END; 2118 return status; 2119 } 2120 2121 template <typename T> 2122 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid, 2123 kmp_int32 *plastiter, T *plower, T *pupper, 2124 typename traits_t<T>::signed_t incr) { 2125 typedef typename traits_t<T>::unsigned_t UT; 2126 typedef typename traits_t<T>::signed_t ST; 2127 kmp_uint32 team_id; 2128 kmp_uint32 nteams; 2129 UT trip_count; 2130 kmp_team_t *team; 2131 kmp_info_t *th; 2132 2133 KMP_DEBUG_ASSERT(plastiter && plower && pupper); 2134 KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid)); 2135 #ifdef KMP_DEBUG 2136 { 2137 char *buff; 2138 // create format specifiers before the debug output 2139 buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d " 2140 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n", 2141 traits_t<T>::spec, traits_t<T>::spec, 2142 traits_t<ST>::spec, traits_t<T>::spec); 2143 KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr)); 2144 __kmp_str_free(&buff); 2145 } 2146 #endif 2147 2148 if (__kmp_env_consistency_check) { 2149 if (incr == 0) { 2150 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, 2151 loc); 2152 } 2153 if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) { 2154 // The loop is illegal. 
2155 // Some zero-trip loops maintained by compiler, e.g.: 2156 // for(i=10;i<0;++i) // lower >= upper - run-time check 2157 // for(i=0;i>10;--i) // lower <= upper - run-time check 2158 // for(i=0;i>10;++i) // incr > 0 - compile-time check 2159 // for(i=10;i<0;--i) // incr < 0 - compile-time check 2160 // Compiler does not check the following illegal loops: 2161 // for(i=0;i<10;i+=incr) // where incr<0 2162 // for(i=10;i>0;i-=incr) // where incr<0 2163 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc); 2164 } 2165 } 2166 th = __kmp_threads[gtid]; 2167 team = th->th.th_team; 2168 #if OMP_40_ENABLED 2169 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct 2170 nteams = th->th.th_teams_size.nteams; 2171 #endif 2172 team_id = team->t.t_master_tid; 2173 KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc); 2174 2175 // compute global trip count 2176 if (incr == 1) { 2177 trip_count = *pupper - *plower + 1; 2178 } else if (incr == -1) { 2179 trip_count = *plower - *pupper + 1; 2180 } else if (incr > 0) { 2181 // upper-lower can exceed the limit of signed type 2182 trip_count = (UT)(*pupper - *plower) / incr + 1; 2183 } else { 2184 trip_count = (UT)(*plower - *pupper) / (-incr) + 1; 2185 } 2186 2187 if (trip_count <= nteams) { 2188 KMP_DEBUG_ASSERT( 2189 __kmp_static == kmp_sch_static_greedy || 2190 __kmp_static == 2191 kmp_sch_static_balanced); // Unknown static scheduling type. 2192 // only some teams get single iteration, others get nothing 2193 if (team_id < trip_count) { 2194 *pupper = *plower = *plower + team_id * incr; 2195 } else { 2196 *plower = *pupper + incr; // zero-trip loop 2197 } 2198 if (plastiter != NULL) 2199 *plastiter = (team_id == trip_count - 1); 2200 } else { 2201 if (__kmp_static == kmp_sch_static_balanced) { 2202 UT chunk = trip_count / nteams; 2203 UT extras = trip_count % nteams; 2204 *plower += 2205 incr * (team_id * chunk + (team_id < extras ? team_id : extras)); 2206 *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr); 2207 if (plastiter != NULL) 2208 *plastiter = (team_id == nteams - 1); 2209 } else { 2210 T chunk_inc_count = 2211 (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr; 2212 T upper = *pupper; 2213 KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy); 2214 // Unknown static scheduling type. 2215 *plower += team_id * chunk_inc_count; 2216 *pupper = *plower + chunk_inc_count - incr; 2217 // Check/correct bounds if needed 2218 if (incr > 0) { 2219 if (*pupper < *plower) 2220 *pupper = traits_t<T>::max_value; 2221 if (plastiter != NULL) 2222 *plastiter = *plower <= upper && *pupper > upper - incr; 2223 if (*pupper > upper) 2224 *pupper = upper; // tracker C73258 2225 } else { 2226 if (*pupper > *plower) 2227 *pupper = traits_t<T>::min_value; 2228 if (plastiter != NULL) 2229 *plastiter = *plower >= upper && *pupper < upper - incr; 2230 if (*pupper < upper) 2231 *pupper = upper; // tracker C73258 2232 } 2233 } 2234 } 2235 } 2236 2237 //----------------------------------------------------------------------------- 2238 // Dispatch routines 2239 // Transfer call to template< type T > 2240 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule, 2241 // T lb, T ub, ST st, ST chunk ) 2242 extern "C" { 2243 2244 /*! 
2245 @ingroup WORK_SHARING 2246 @{ 2247 @param loc Source location 2248 @param gtid Global thread id 2249 @param schedule Schedule type 2250 @param lb Lower bound 2251 @param ub Upper bound 2252 @param st Step (or increment if you prefer) 2253 @param chunk The chunk size to block with 2254 2255 This function prepares the runtime to start a dynamically scheduled for loop, 2256 saving the loop arguments. 2257 These functions are all identical apart from the types of the arguments. 2258 */ 2259 2260 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2261 enum sched_type schedule, kmp_int32 lb, 2262 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) { 2263 KMP_DEBUG_ASSERT(__kmp_init_serial); 2264 #if OMPT_SUPPORT && OMPT_OPTIONAL 2265 OMPT_STORE_RETURN_ADDRESS(gtid); 2266 #endif 2267 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2268 } 2269 /*! 2270 See @ref __kmpc_dispatch_init_4 2271 */ 2272 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2273 enum sched_type schedule, kmp_uint32 lb, 2274 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) { 2275 KMP_DEBUG_ASSERT(__kmp_init_serial); 2276 #if OMPT_SUPPORT && OMPT_OPTIONAL 2277 OMPT_STORE_RETURN_ADDRESS(gtid); 2278 #endif 2279 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2280 } 2281 2282 /*! 2283 See @ref __kmpc_dispatch_init_4 2284 */ 2285 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2286 enum sched_type schedule, kmp_int64 lb, 2287 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) { 2288 KMP_DEBUG_ASSERT(__kmp_init_serial); 2289 #if OMPT_SUPPORT && OMPT_OPTIONAL 2290 OMPT_STORE_RETURN_ADDRESS(gtid); 2291 #endif 2292 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2293 } 2294 2295 /*! 2296 See @ref __kmpc_dispatch_init_4 2297 */ 2298 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2299 enum sched_type schedule, kmp_uint64 lb, 2300 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) { 2301 KMP_DEBUG_ASSERT(__kmp_init_serial); 2302 #if OMPT_SUPPORT && OMPT_OPTIONAL 2303 OMPT_STORE_RETURN_ADDRESS(gtid); 2304 #endif 2305 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2306 } 2307 2308 /*! 2309 See @ref __kmpc_dispatch_init_4 2310 2311 These differ from the __kmpc_dispatch_init set of functions in that they 2312 are called for the composite distribute parallel for construct. Thus, before 2313 regular iterations are dispatched, the per-team iteration space must be calculated. 2314 2315 These functions are all identical apart from the types of the arguments.
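As an illustrative sketch only (assumed compiler-generated driver code, with a
hypothetical trip count N and loop body()), a team executing
#pragma omp distribute parallel for schedule(dynamic, chunk)
might drive these entry points roughly as follows:
@code
  kmp_int32 last = 0, lb = 0, ub = N - 1, st = 1;
  __kmpc_dist_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, &last,
                              lb, ub, st, chunk);
  while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
    for (kmp_int32 i = lb; i <= ub; i += st)
      body(i); // chunk bounds within this team's sub-range; inclusive
  }
@endcode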
2316 */ 2317 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2318 enum sched_type schedule, kmp_int32 *p_last, 2319 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, 2320 kmp_int32 chunk) { 2321 KMP_DEBUG_ASSERT(__kmp_init_serial); 2322 #if OMPT_SUPPORT && OMPT_OPTIONAL 2323 OMPT_STORE_RETURN_ADDRESS(gtid); 2324 #endif 2325 __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st); 2326 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2327 } 2328 2329 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2330 enum sched_type schedule, kmp_int32 *p_last, 2331 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, 2332 kmp_int32 chunk) { 2333 KMP_DEBUG_ASSERT(__kmp_init_serial); 2334 #if OMPT_SUPPORT && OMPT_OPTIONAL 2335 OMPT_STORE_RETURN_ADDRESS(gtid); 2336 #endif 2337 __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st); 2338 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2339 } 2340 2341 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2342 enum sched_type schedule, kmp_int32 *p_last, 2343 kmp_int64 lb, kmp_int64 ub, kmp_int64 st, 2344 kmp_int64 chunk) { 2345 KMP_DEBUG_ASSERT(__kmp_init_serial); 2346 #if OMPT_SUPPORT && OMPT_OPTIONAL 2347 OMPT_STORE_RETURN_ADDRESS(gtid); 2348 #endif 2349 __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st); 2350 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2351 } 2352 2353 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2354 enum sched_type schedule, kmp_int32 *p_last, 2355 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, 2356 kmp_int64 chunk) { 2357 KMP_DEBUG_ASSERT(__kmp_init_serial); 2358 #if OMPT_SUPPORT && OMPT_OPTIONAL 2359 OMPT_STORE_RETURN_ADDRESS(gtid); 2360 #endif 2361 __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st); 2362 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2363 } 2364 2365 /*! 2366 @param loc Source code location 2367 @param gtid Global thread id 2368 @param p_last Pointer to a flag set to one if this is the last chunk or zero 2369 otherwise 2370 @param p_lb Pointer to the lower bound for the next chunk of work 2371 @param p_ub Pointer to the upper bound for the next chunk of work 2372 @param p_st Pointer to the stride for the next chunk of work 2373 @return one if there is work to be done, zero otherwise 2374 2375 Get the next dynamically allocated chunk of work for this thread. 2376 If there is no more work, then the lb,ub and stride need not be modified. 2377 */ 2378 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2379 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) { 2380 #if OMPT_SUPPORT && OMPT_OPTIONAL 2381 OMPT_STORE_RETURN_ADDRESS(gtid); 2382 #endif 2383 return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st 2384 #if OMPT_SUPPORT && OMPT_OPTIONAL 2385 , 2386 OMPT_LOAD_RETURN_ADDRESS(gtid) 2387 #endif 2388 ); 2389 } 2390 2391 /*! 2392 See @ref __kmpc_dispatch_next_4 2393 */ 2394 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2395 kmp_uint32 *p_lb, kmp_uint32 *p_ub, 2396 kmp_int32 *p_st) { 2397 #if OMPT_SUPPORT && OMPT_OPTIONAL 2398 OMPT_STORE_RETURN_ADDRESS(gtid); 2399 #endif 2400 return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st 2401 #if OMPT_SUPPORT && OMPT_OPTIONAL 2402 , 2403 OMPT_LOAD_RETURN_ADDRESS(gtid) 2404 #endif 2405 ); 2406 } 2407 2408 /*! 
2409 See @ref __kmpc_dispatch_next_4 2410 */ 2411 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2412 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) { 2413 #if OMPT_SUPPORT && OMPT_OPTIONAL 2414 OMPT_STORE_RETURN_ADDRESS(gtid); 2415 #endif 2416 return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st 2417 #if OMPT_SUPPORT && OMPT_OPTIONAL 2418 , 2419 OMPT_LOAD_RETURN_ADDRESS(gtid) 2420 #endif 2421 ); 2422 } 2423 2424 /*! 2425 See @ref __kmpc_dispatch_next_4 2426 */ 2427 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2428 kmp_uint64 *p_lb, kmp_uint64 *p_ub, 2429 kmp_int64 *p_st) { 2430 #if OMPT_SUPPORT && OMPT_OPTIONAL 2431 OMPT_STORE_RETURN_ADDRESS(gtid); 2432 #endif 2433 return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st 2434 #if OMPT_SUPPORT && OMPT_OPTIONAL 2435 , 2436 OMPT_LOAD_RETURN_ADDRESS(gtid) 2437 #endif 2438 ); 2439 } 2440 2441 /*! 2442 @param loc Source code location 2443 @param gtid Global thread id 2444 2445 Mark the end of a dynamic loop. 2446 */ 2447 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) { 2448 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2449 } 2450 2451 /*! 2452 See @ref __kmpc_dispatch_fini_4 2453 */ 2454 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) { 2455 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2456 } 2457 2458 /*! 2459 See @ref __kmpc_dispatch_fini_4 2460 */ 2461 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) { 2462 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2463 } 2464 2465 /*! 2466 See @ref __kmpc_dispatch_fini_4 2467 */ 2468 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) { 2469 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2470 } 2471 /*! @} */ 2472 2473 //----------------------------------------------------------------------------- 2474 // Non-template routines from kmp_dispatch.cpp used in other sources 2475 2476 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) { 2477 return value == checker; 2478 } 2479 2480 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) { 2481 return value != checker; 2482 } 2483 2484 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) { 2485 return value < checker; 2486 } 2487 2488 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) { 2489 return value >= checker; 2490 } 2491 2492 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) { 2493 return value <= checker; 2494 } 2495 2496 kmp_uint32 2497 __kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker, 2498 kmp_uint32 (*pred)(kmp_uint32, kmp_uint32), 2499 void *obj // Higher-level synchronization object, or NULL. 2500 ) { 2501 // note: we may not belong to a team at this point 2502 volatile kmp_uint32 *spin = spinner; 2503 kmp_uint32 check = checker; 2504 kmp_uint32 spins; 2505 kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred; 2506 kmp_uint32 r; 2507 2508 KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin)); 2509 KMP_INIT_YIELD(spins); 2510 // main wait spin loop 2511 while (!f(r = TCR_4(*spin), check)) { 2512 KMP_FSYNC_SPIN_PREPARE(obj); 2513 /* GEH - remove this since it was accidentally introduced when kmp_wait was 2514 split. 
It causes problems with infinite recursion because of exit lock */ 2515 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 2516 __kmp_abort_thread(); */ 2517 2518 /* if we have waited a bit, or are oversubscribed, yield */ 2519 /* pause is in the following code */ 2520 KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc); 2521 KMP_YIELD_SPIN(spins); 2522 } 2523 KMP_FSYNC_SPIN_ACQUIRED(obj); 2524 return r; 2525 } 2526 2527 void __kmp_wait_yield_4_ptr( 2528 void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32), 2529 void *obj // Higher-level synchronization object, or NULL. 2530 ) { 2531 // note: we may not belong to a team at this point 2532 void *spin = spinner; 2533 kmp_uint32 check = checker; 2534 kmp_uint32 spins; 2535 kmp_uint32 (*f)(void *, kmp_uint32) = pred; 2536 2537 KMP_FSYNC_SPIN_INIT(obj, spin); 2538 KMP_INIT_YIELD(spins); 2539 // main wait spin loop 2540 while (!f(spin, check)) { 2541 KMP_FSYNC_SPIN_PREPARE(obj); 2542 /* if we have waited a bit, or are oversubscribed, yield */ 2543 /* pause is in the following code */ 2544 KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc); 2545 KMP_YIELD_SPIN(spins); 2546 } 2547 KMP_FSYNC_SPIN_ACQUIRED(obj); 2548 } 2549 2550 } // extern "C" 2551 2552 #ifdef KMP_GOMP_COMPAT 2553 2554 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2555 enum sched_type schedule, kmp_int32 lb, 2556 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk, 2557 int push_ws) { 2558 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, 2559 push_ws); 2560 } 2561 2562 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2563 enum sched_type schedule, kmp_uint32 lb, 2564 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk, 2565 int push_ws) { 2566 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, 2567 push_ws); 2568 } 2569 2570 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2571 enum sched_type schedule, kmp_int64 lb, 2572 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk, 2573 int push_ws) { 2574 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, 2575 push_ws); 2576 } 2577 2578 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2579 enum sched_type schedule, kmp_uint64 lb, 2580 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk, 2581 int push_ws) { 2582 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, 2583 push_ws); 2584 } 2585 2586 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) { 2587 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc); 2588 } 2589 2590 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) { 2591 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc); 2592 } 2593 2594 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) { 2595 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc); 2596 } 2597 2598 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) { 2599 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc); 2600 } 2601 2602 #endif /* KMP_GOMP_COMPAT */ 2603 2604 /* ------------------------------------------------------------------------ */ 2605
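/* Illustrative sketch (assumed usage, not taken from this file): the
   __kmp_wait_yield_4 helper defined above spins on a 32-bit location with a
   caller-supplied predicate, yielding when the process is oversubscribed.
   For example, to block until a shared flag becomes 1:

     volatile kmp_uint32 flag = 0; // set to 1 by another thread
     kmp_uint32 seen = __kmp_wait_yield_4(&flag, 1, __kmp_eq_4, NULL);
     // returns the observed value once __kmp_eq_4(flag, 1) holds

   Passing __kmp_ge_4 instead would wait for flag >= 1, and so on; the obj
   argument is passed only to the KMP_FSYNC_SPIN_* annotations. */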