/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

/* Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
 *       it may change values between parallel regions. __kmp_max_nth
 *       is the largest value __kmp_nth may take, 1 is the smallest.
 */

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_USE_X87CONTROL
#include <float.h>
#endif
#include "kmp_lock.h"
#include "kmp_dispatch.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(gtid_ref);

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_root->r.r_active &&
        (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }
}

void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
      __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
    }
  }
}

// Initialize a dispatch_private_info_template<T> buffer for a particular
// type of schedule,chunk. The loop description is found in lb (lower bound),
// ub (upper bound), and st (stride). nproc is the number of threads relevant
// to the scheduling (often the number of threads in a team, but not always if
// hierarchical scheduling is used). tid is the id of the thread calling
// the function within the group of nproc threads. It will have a value
// between 0 and nproc - 1. This is often just the thread id within a team, but
// is not necessarily the case when using hierarchical scheduling.
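// As an illustrative (hypothetical) example of the common non-hierarchical
// case: a loop "for (i = 0; i <= 99; ++i)" compiled with schedule(dynamic, 4)
// in an 8-thread team would typically arrive here as lb=0, ub=99, st=1,
// chunk=4, nproc=8 and tid in [0, 7]; the code below then derives the trip
// count tc = 100 and records the chunk size in pr->u.p.parm1.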
79 // loc is the source file location of the corresponding loop 80 // gtid is the global thread id 81 template <typename T> 82 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, 83 dispatch_private_info_template<T> *pr, 84 enum sched_type schedule, T lb, T ub, 85 typename traits_t<T>::signed_t st, 86 #if USE_ITT_BUILD 87 kmp_uint64 *cur_chunk, 88 #endif 89 typename traits_t<T>::signed_t chunk, 90 T nproc, T tid) { 91 typedef typename traits_t<T>::unsigned_t UT; 92 typedef typename traits_t<T>::floating_t DBL; 93 94 int active; 95 T tc; 96 kmp_info_t *th; 97 kmp_team_t *team; 98 99 #ifdef KMP_DEBUG 100 typedef typename traits_t<T>::signed_t ST; 101 { 102 char *buff; 103 // create format specifiers before the debug output 104 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called " 105 "pr:%%p lb:%%%s ub:%%%s st:%%%s " 106 "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n", 107 traits_t<T>::spec, traits_t<T>::spec, 108 traits_t<ST>::spec, traits_t<ST>::spec, 109 traits_t<T>::spec, traits_t<T>::spec); 110 KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid)); 111 __kmp_str_free(&buff); 112 } 113 #endif 114 /* setup data */ 115 th = __kmp_threads[gtid]; 116 team = th->th.th_team; 117 active = !team->t.t_serialized; 118 119 #if USE_ITT_BUILD 120 int itt_need_metadata_reporting = __itt_metadata_add_ptr && 121 __kmp_forkjoin_frames_mode == 3 && 122 KMP_MASTER_GTID(gtid) && 123 #if OMP_40_ENABLED 124 th->th.th_teams_microtask == NULL && 125 #endif 126 team->t.t_active_level == 1; 127 #endif 128 #if (KMP_STATIC_STEAL_ENABLED) 129 if (SCHEDULE_HAS_NONMONOTONIC(schedule)) 130 // AC: we now have only one implementation of stealing, so use it 131 schedule = kmp_sch_static_steal; 132 else 133 #endif 134 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 135 136 /* Pick up the nomerge/ordered bits from the scheduling type */ 137 if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) { 138 pr->flags.nomerge = TRUE; 139 schedule = 140 (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); 141 } else { 142 pr->flags.nomerge = FALSE; 143 } 144 pr->type_size = traits_t<T>::type_size; // remember the size of variables 145 if (kmp_ord_lower & schedule) { 146 pr->flags.ordered = TRUE; 147 schedule = 148 (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); 149 } else { 150 pr->flags.ordered = FALSE; 151 } 152 153 if (schedule == kmp_sch_static) { 154 schedule = __kmp_static; 155 } else { 156 if (schedule == kmp_sch_runtime) { 157 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if 158 // not specified) 159 schedule = team->t.t_sched.r_sched_type; 160 // Detail the schedule if needed (global controls are differentiated 161 // appropriately) 162 if (schedule == kmp_sch_guided_chunked) { 163 schedule = __kmp_guided; 164 } else if (schedule == kmp_sch_static) { 165 schedule = __kmp_static; 166 } 167 // Use the chunk size specified by OMP_SCHEDULE (or default if not 168 // specified) 169 chunk = team->t.t_sched.chunk; 170 #if USE_ITT_BUILD 171 if (cur_chunk) 172 *cur_chunk = chunk; 173 #endif 174 #ifdef KMP_DEBUG 175 { 176 char *buff; 177 // create format specifiers before the debug output 178 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: " 179 "schedule:%%d chunk:%%%s\n", 180 traits_t<ST>::spec); 181 KD_TRACE(10, (buff, gtid, schedule, chunk)); 182 __kmp_str_free(&buff); 183 } 184 #endif 185 } else { 186 if (schedule == kmp_sch_guided_chunked) { 187 schedule = __kmp_guided; 188 } 189 if (chunk <= 0) { 190 
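// A non-positive chunk value is unusable here, so fall back to the
// library default (KMP_DEFAULT_CHUNK):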
chunk = KMP_DEFAULT_CHUNK; 191 } 192 } 193 194 if (schedule == kmp_sch_auto) { 195 // mapping and differentiation: in the __kmp_do_serial_initialize() 196 schedule = __kmp_auto; 197 #ifdef KMP_DEBUG 198 { 199 char *buff; 200 // create format specifiers before the debug output 201 buff = __kmp_str_format( 202 "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: " 203 "schedule:%%d chunk:%%%s\n", 204 traits_t<ST>::spec); 205 KD_TRACE(10, (buff, gtid, schedule, chunk)); 206 __kmp_str_free(&buff); 207 } 208 #endif 209 } 210 211 /* guided analytical not safe for too many threads */ 212 if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) { 213 schedule = kmp_sch_guided_iterative_chunked; 214 KMP_WARNING(DispatchManyThreads); 215 } 216 #if OMP_45_ENABLED 217 if (schedule == kmp_sch_runtime_simd) { 218 // compiler provides simd_width in the chunk parameter 219 schedule = team->t.t_sched.r_sched_type; 220 // Detail the schedule if needed (global controls are differentiated 221 // appropriately) 222 if (schedule == kmp_sch_static || schedule == kmp_sch_auto || 223 schedule == __kmp_static) { 224 schedule = kmp_sch_static_balanced_chunked; 225 } else { 226 if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) { 227 schedule = kmp_sch_guided_simd; 228 } 229 chunk = team->t.t_sched.chunk * chunk; 230 } 231 #if USE_ITT_BUILD 232 if (cur_chunk) 233 *cur_chunk = chunk; 234 #endif 235 #ifdef KMP_DEBUG 236 { 237 char *buff; 238 // create format specifiers before the debug output 239 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d" 240 " chunk:%%%s\n", 241 traits_t<ST>::spec); 242 KD_TRACE(10, (buff, gtid, schedule, chunk)); 243 __kmp_str_free(&buff); 244 } 245 #endif 246 } 247 #endif // OMP_45_ENABLED 248 pr->u.p.parm1 = chunk; 249 } 250 KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper), 251 "unknown scheduling type"); 252 253 pr->u.p.count = 0; 254 255 if (__kmp_env_consistency_check) { 256 if (st == 0) { 257 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, 258 (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc); 259 } 260 } 261 // compute trip count 262 if (st == 1) { // most common case 263 if (ub >= lb) { 264 tc = ub - lb + 1; 265 } else { // ub < lb 266 tc = 0; // zero-trip 267 } 268 } else if (st < 0) { 269 if (lb >= ub) { 270 // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B), 271 // where the division needs to be unsigned regardless of the result type 272 tc = (UT)(lb - ub) / (-st) + 1; 273 } else { // lb < ub 274 tc = 0; // zero-trip 275 } 276 } else { // st > 0 277 if (ub >= lb) { 278 // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B), 279 // where the division needs to be unsigned regardless of the result type 280 tc = (UT)(ub - lb) / st + 1; 281 } else { // ub < lb 282 tc = 0; // zero-trip 283 } 284 } 285 286 pr->u.p.lb = lb; 287 pr->u.p.ub = ub; 288 pr->u.p.st = st; 289 pr->u.p.tc = tc; 290 291 #if KMP_OS_WINDOWS 292 pr->u.p.last_upper = ub + st; 293 #endif /* KMP_OS_WINDOWS */ 294 295 /* NOTE: only the active parallel region(s) has active ordered sections */ 296 297 if (active) { 298 if (pr->flags.ordered) { 299 pr->ordered_bumped = 0; 300 pr->u.p.ordered_lower = 1; 301 pr->u.p.ordered_upper = 0; 302 } 303 } 304 305 switch (schedule) { 306 #if (KMP_STATIC_STEAL_ENABLED) 307 case kmp_sch_static_steal: { 308 T ntc, init; 309 310 KD_TRACE(100, 311 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n", 312 gtid)); 313 314 ntc = (tc % chunk ? 
1 : 0) + tc / chunk; 315 if (nproc > 1 && ntc >= nproc) { 316 KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL); 317 T id = tid; 318 T small_chunk, extras; 319 320 small_chunk = ntc / nproc; 321 extras = ntc % nproc; 322 323 init = id * small_chunk + (id < extras ? id : extras); 324 pr->u.p.count = init; 325 pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0); 326 327 pr->u.p.parm2 = lb; 328 // pr->pfields.parm3 = 0; // it's not used in static_steal 329 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid 330 pr->u.p.st = st; 331 if (traits_t<T>::type_size > 4) { 332 // AC: TODO: check if 16-byte CAS available and use it to 333 // improve performance (probably wait for explicit request 334 // before spending time on this). 335 // For now use dynamically allocated per-thread lock, 336 // free memory in __kmp_dispatch_next when status==0. 337 KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL); 338 th->th.th_dispatch->th_steal_lock = 339 (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t)); 340 __kmp_init_lock(th->th.th_dispatch->th_steal_lock); 341 } 342 break; 343 } else { 344 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 345 "kmp_sch_static_balanced\n", 346 gtid)); 347 schedule = kmp_sch_static_balanced; 348 /* too few iterations: fall-through to kmp_sch_static_balanced */ 349 } // if 350 /* FALL-THROUGH to static balanced */ 351 } // case 352 #endif 353 case kmp_sch_static_balanced: { 354 T init, limit; 355 356 KD_TRACE( 357 100, 358 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n", 359 gtid)); 360 361 if (nproc > 1) { 362 T id = tid; 363 364 if (tc < nproc) { 365 if (id < tc) { 366 init = id; 367 limit = id; 368 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ 369 } else { 370 pr->u.p.count = 1; /* means no more chunks to execute */ 371 pr->u.p.parm1 = FALSE; 372 break; 373 } 374 } else { 375 T small_chunk = tc / nproc; 376 T extras = tc % nproc; 377 init = id * small_chunk + (id < extras ? id : extras); 378 limit = init + small_chunk - (id < extras ? 0 : 1); 379 pr->u.p.parm1 = (id == nproc - 1); 380 } 381 } else { 382 if (tc > 0) { 383 init = 0; 384 limit = tc - 1; 385 pr->u.p.parm1 = TRUE; 386 } else { 387 // zero trip count 388 pr->u.p.count = 1; /* means no more chunks to execute */ 389 pr->u.p.parm1 = FALSE; 390 break; 391 } 392 } 393 #if USE_ITT_BUILD 394 // Calculate chunk for metadata report 395 if (itt_need_metadata_reporting) 396 if (cur_chunk) 397 *cur_chunk = limit - init + 1; 398 #endif 399 if (st == 1) { 400 pr->u.p.lb = lb + init; 401 pr->u.p.ub = lb + limit; 402 } else { 403 // calculated upper bound, "ub" is user-defined upper bound 404 T ub_tmp = lb + limit * st; 405 pr->u.p.lb = lb + init * st; 406 // adjust upper bound to "ub" if needed, so that MS lastprivate will match 407 // it exactly 408 if (st > 0) { 409 pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp); 410 } else { 411 pr->u.p.ub = (ub_tmp + st < ub ? 
ub : ub_tmp); 412 } 413 } 414 if (pr->flags.ordered) { 415 pr->u.p.ordered_lower = init; 416 pr->u.p.ordered_upper = limit; 417 } 418 break; 419 } // case 420 #if OMP_45_ENABLED 421 case kmp_sch_static_balanced_chunked: { 422 // similar to balanced, but chunk adjusted to multiple of simd width 423 T nth = nproc; 424 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)" 425 " -> falling-through to static_greedy\n", 426 gtid)); 427 schedule = kmp_sch_static_greedy; 428 if (nth > 1) 429 pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1); 430 else 431 pr->u.p.parm1 = tc; 432 break; 433 } // case 434 case kmp_sch_guided_simd: 435 #endif // OMP_45_ENABLED 436 case kmp_sch_guided_iterative_chunked: { 437 KD_TRACE( 438 100, 439 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked" 440 " case\n", 441 gtid)); 442 443 if (nproc > 1) { 444 if ((2L * chunk + 1) * nproc >= tc) { 445 /* chunk size too large, switch to dynamic */ 446 schedule = kmp_sch_dynamic_chunked; 447 } else { 448 // when remaining iters become less than parm2 - switch to dynamic 449 pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1); 450 *(double *)&pr->u.p.parm3 = 451 guided_flt_param / nproc; // may occupy parm3 and parm4 452 } 453 } else { 454 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 455 "kmp_sch_static_greedy\n", 456 gtid)); 457 schedule = kmp_sch_static_greedy; 458 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 459 KD_TRACE( 460 100, 461 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n", 462 gtid)); 463 pr->u.p.parm1 = tc; 464 } // if 465 } // case 466 break; 467 case kmp_sch_guided_analytical_chunked: { 468 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d " 469 "kmp_sch_guided_analytical_chunked case\n", 470 gtid)); 471 472 if (nproc > 1) { 473 if ((2L * chunk + 1) * nproc >= tc) { 474 /* chunk size too large, switch to dynamic */ 475 schedule = kmp_sch_dynamic_chunked; 476 } else { 477 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 478 DBL x; 479 480 #if KMP_USE_X87CONTROL 481 /* Linux* OS already has 64-bit computation by default for long double, 482 and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On 483 Windows* OS on IA-32 architecture, we need to set precision to 64-bit 484 instead of the default 53-bit. Even though long double doesn't work 485 on Windows* OS on Intel(R) 64, the resulting lack of precision is not 486 expected to impact the correctness of the algorithm, but this has not 487 been mathematically proven. 
*/ 488 // save original FPCW and set precision to 64-bit, as 489 // Windows* OS on IA-32 architecture defaults to 53-bit 490 unsigned int oldFpcw = _control87(0, 0); 491 _control87(_PC_64, _MCW_PC); // 0,0x30000 492 #endif 493 /* value used for comparison in solver for cross-over point */ 494 long double target = ((long double)chunk * 2 + 1) * nproc / tc; 495 496 /* crossover point--chunk indexes equal to or greater than 497 this point switch to dynamic-style scheduling */ 498 UT cross; 499 500 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 501 x = (long double)1.0 - (long double)0.5 / nproc; 502 503 #ifdef KMP_DEBUG 504 { // test natural alignment 505 struct _test_a { 506 char a; 507 union { 508 char b; 509 DBL d; 510 }; 511 } t; 512 ptrdiff_t natural_alignment = 513 (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; 514 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long 515 // long)natural_alignment ); 516 KMP_DEBUG_ASSERT( 517 (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0); 518 } 519 #endif // KMP_DEBUG 520 521 /* save the term in thread private dispatch structure */ 522 *(DBL *)&pr->u.p.parm3 = x; 523 524 /* solve for the crossover point to the nearest integer i for which C_i 525 <= chunk */ 526 { 527 UT left, right, mid; 528 long double p; 529 530 /* estimate initial upper and lower bound */ 531 532 /* doesn't matter what value right is as long as it is positive, but 533 it affects performance of the solver */ 534 right = 229; 535 p = __kmp_pow<UT>(x, right); 536 if (p > target) { 537 do { 538 p *= p; 539 right <<= 1; 540 } while (p > target && right < (1 << 27)); 541 /* lower bound is previous (failed) estimate of upper bound */ 542 left = right >> 1; 543 } else { 544 left = 0; 545 } 546 547 /* bisection root-finding method */ 548 while (left + 1 < right) { 549 mid = (left + right) / 2; 550 if (__kmp_pow<UT>(x, mid) > target) { 551 left = mid; 552 } else { 553 right = mid; 554 } 555 } // while 556 cross = right; 557 } 558 /* assert sanity of computed crossover point */ 559 KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target && 560 __kmp_pow<UT>(x, cross) <= target); 561 562 /* save the crossover point in thread private dispatch structure */ 563 pr->u.p.parm2 = cross; 564 565 // C75803 566 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8)) 567 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3) 568 #else 569 #define GUIDED_ANALYTICAL_WORKAROUND (x) 570 #endif 571 /* dynamic-style scheduling offset */ 572 pr->u.p.count = tc - __kmp_dispatch_guided_remaining( 573 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - 574 cross * chunk; 575 #if KMP_USE_X87CONTROL 576 // restore FPCW 577 _control87(oldFpcw, _MCW_PC); 578 #endif 579 } // if 580 } else { 581 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 582 "kmp_sch_static_greedy\n", 583 gtid)); 584 schedule = kmp_sch_static_greedy; 585 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 586 pr->u.p.parm1 = tc; 587 } // if 588 } // case 589 break; 590 case kmp_sch_static_greedy: 591 KD_TRACE( 592 100, 593 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n", 594 gtid)); 595 pr->u.p.parm1 = (nproc > 1) ? 
(tc + nproc - 1) / nproc : tc; 596 break; 597 case kmp_sch_static_chunked: 598 case kmp_sch_dynamic_chunked: 599 if (pr->u.p.parm1 <= 0) { 600 pr->u.p.parm1 = KMP_DEFAULT_CHUNK; 601 } 602 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d " 603 "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", 604 gtid)); 605 break; 606 case kmp_sch_trapezoidal: { 607 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ 608 609 T parm1, parm2, parm3, parm4; 610 KD_TRACE(100, 611 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n", 612 gtid)); 613 614 parm1 = chunk; 615 616 /* F : size of the first cycle */ 617 parm2 = (tc / (2 * nproc)); 618 619 if (parm2 < 1) { 620 parm2 = 1; 621 } 622 623 /* L : size of the last cycle. Make sure the last cycle is not larger 624 than the first cycle. */ 625 if (parm1 < 1) { 626 parm1 = 1; 627 } else if (parm1 > parm2) { 628 parm1 = parm2; 629 } 630 631 /* N : number of cycles */ 632 parm3 = (parm2 + parm1); 633 parm3 = (2 * tc + parm3 - 1) / parm3; 634 635 if (parm3 < 2) { 636 parm3 = 2; 637 } 638 639 /* sigma : decreasing incr of the trapezoid */ 640 parm4 = (parm3 - 1); 641 parm4 = (parm2 - parm1) / parm4; 642 643 // pointless check, because parm4 >= 0 always 644 // if ( parm4 < 0 ) { 645 // parm4 = 0; 646 //} 647 648 pr->u.p.parm1 = parm1; 649 pr->u.p.parm2 = parm2; 650 pr->u.p.parm3 = parm3; 651 pr->u.p.parm4 = parm4; 652 } // case 653 break; 654 655 default: { 656 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 657 KMP_HNT(GetNewerLibrary), // Hint 658 __kmp_msg_null // Variadic argument list terminator 659 ); 660 } break; 661 } // switch 662 pr->schedule = schedule; 663 } 664 665 #if KMP_USE_HIER_SCHED 666 template <typename T> 667 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub, 668 typename traits_t<T>::signed_t st); 669 template <> 670 inline void 671 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb, 672 kmp_int32 ub, kmp_int32 st) { 673 __kmp_dispatch_init_hierarchy<kmp_int32>( 674 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 675 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st); 676 } 677 template <> 678 inline void 679 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb, 680 kmp_uint32 ub, kmp_int32 st) { 681 __kmp_dispatch_init_hierarchy<kmp_uint32>( 682 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 683 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st); 684 } 685 template <> 686 inline void 687 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb, 688 kmp_int64 ub, kmp_int64 st) { 689 __kmp_dispatch_init_hierarchy<kmp_int64>( 690 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 691 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st); 692 } 693 template <> 694 inline void 695 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb, 696 kmp_uint64 ub, kmp_int64 st) { 697 __kmp_dispatch_init_hierarchy<kmp_uint64>( 698 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 699 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st); 700 } 701 702 // free all the hierarchy scheduling memory associated with the team 703 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) { 704 int num_disp_buff = team->t.t_max_nproc > 1 ? 
__kmp_dispatch_num_buffers : 2; 705 for (int i = 0; i < num_disp_buff; ++i) { 706 // type does not matter here so use kmp_int32 707 auto sh = 708 reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>( 709 &team->t.t_disp_buffer[i]); 710 if (sh->hier) { 711 sh->hier->deallocate(); 712 __kmp_free(sh->hier); 713 } 714 } 715 } 716 #endif 717 718 // UT - unsigned flavor of T, ST - signed flavor of T, 719 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8 720 template <typename T> 721 static void 722 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb, 723 T ub, typename traits_t<T>::signed_t st, 724 typename traits_t<T>::signed_t chunk, int push_ws) { 725 typedef typename traits_t<T>::unsigned_t UT; 726 727 int active; 728 kmp_info_t *th; 729 kmp_team_t *team; 730 kmp_uint32 my_buffer_index; 731 dispatch_private_info_template<T> *pr; 732 dispatch_shared_info_template<T> volatile *sh; 733 734 KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) == 735 sizeof(dispatch_private_info)); 736 KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) == 737 sizeof(dispatch_shared_info)); 738 739 if (!TCR_4(__kmp_init_parallel)) 740 __kmp_parallel_initialize(); 741 742 #if OMP_50_ENABLED 743 __kmp_resume_if_soft_paused(); 744 #endif 745 746 #if INCLUDE_SSC_MARKS 747 SSC_MARK_DISPATCH_INIT(); 748 #endif 749 #ifdef KMP_DEBUG 750 typedef typename traits_t<T>::signed_t ST; 751 { 752 char *buff; 753 // create format specifiers before the debug output 754 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d " 755 "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n", 756 traits_t<ST>::spec, traits_t<T>::spec, 757 traits_t<T>::spec, traits_t<ST>::spec); 758 KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st)); 759 __kmp_str_free(&buff); 760 } 761 #endif 762 /* setup data */ 763 th = __kmp_threads[gtid]; 764 team = th->th.th_team; 765 active = !team->t.t_serialized; 766 th->th.th_ident = loc; 767 768 // Any half-decent optimizer will remove this test when the blocks are empty 769 // since the macros expand to nothing 770 // when statistics are disabled. 771 if (schedule == __kmp_static) { 772 KMP_COUNT_BLOCK(OMP_LOOP_STATIC); 773 } else { 774 KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC); 775 } 776 777 #if KMP_USE_HIER_SCHED 778 // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable 779 // Hierarchical scheduling does not work with ordered, so if ordered is 780 // detected, then revert back to threaded scheduling. 781 bool ordered; 782 enum sched_type my_sched = schedule; 783 my_buffer_index = th->th.th_dispatch->th_disp_index; 784 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 785 &th->th.th_dispatch 786 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 787 my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched); 788 if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper)) 789 my_sched = 790 (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower)); 791 ordered = (kmp_ord_lower & my_sched); 792 if (pr->flags.use_hier) { 793 if (ordered) { 794 KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. 
" 795 "Disabling hierarchical scheduling.\n", 796 gtid)); 797 pr->flags.use_hier = FALSE; 798 } 799 } 800 if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) { 801 // Don't use hierarchical for ordered parallel loops and don't 802 // use the runtime hierarchy if one was specified in the program 803 if (!ordered && !pr->flags.use_hier) 804 __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st); 805 } 806 #endif // KMP_USE_HIER_SCHED 807 808 #if USE_ITT_BUILD 809 kmp_uint64 cur_chunk = chunk; 810 int itt_need_metadata_reporting = __itt_metadata_add_ptr && 811 __kmp_forkjoin_frames_mode == 3 && 812 KMP_MASTER_GTID(gtid) && 813 #if OMP_40_ENABLED 814 th->th.th_teams_microtask == NULL && 815 #endif 816 team->t.t_active_level == 1; 817 #endif 818 if (!active) { 819 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 820 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 821 } else { 822 KMP_DEBUG_ASSERT(th->th.th_dispatch == 823 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 824 825 my_buffer_index = th->th.th_dispatch->th_disp_index++; 826 827 /* What happens when number of threads changes, need to resize buffer? */ 828 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 829 &th->th.th_dispatch 830 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 831 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 832 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 833 KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid, 834 my_buffer_index)); 835 } 836 837 __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st, 838 #if USE_ITT_BUILD 839 &cur_chunk, 840 #endif 841 chunk, (T)th->th.th_team_nproc, 842 (T)th->th.th_info.ds.ds_tid); 843 if (active) { 844 if (pr->flags.ordered == 0) { 845 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error; 846 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error; 847 } else { 848 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>; 849 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>; 850 } 851 } 852 853 if (active) { 854 /* The name of this buffer should be my_buffer_index when it's free to use 855 * it */ 856 857 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d " 858 "sh->buffer_index:%d\n", 859 gtid, my_buffer_index, sh->buffer_index)); 860 __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index, 861 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL)); 862 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and 863 // my_buffer_index are *always* 32-bit integers. 864 KMP_MB(); /* is this necessary? 
*/ 865 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d " 866 "sh->buffer_index:%d\n", 867 gtid, my_buffer_index, sh->buffer_index)); 868 869 th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr; 870 th->th.th_dispatch->th_dispatch_sh_current = 871 CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh); 872 #if USE_ITT_BUILD 873 if (pr->flags.ordered) { 874 __kmp_itt_ordered_init(gtid); 875 } 876 // Report loop metadata 877 if (itt_need_metadata_reporting) { 878 // Only report metadata by master of active team at level 1 879 kmp_uint64 schedtype = 0; 880 switch (schedule) { 881 case kmp_sch_static_chunked: 882 case kmp_sch_static_balanced: // Chunk is calculated in the switch above 883 break; 884 case kmp_sch_static_greedy: 885 cur_chunk = pr->u.p.parm1; 886 break; 887 case kmp_sch_dynamic_chunked: 888 schedtype = 1; 889 break; 890 case kmp_sch_guided_iterative_chunked: 891 case kmp_sch_guided_analytical_chunked: 892 #if OMP_45_ENABLED 893 case kmp_sch_guided_simd: 894 #endif 895 schedtype = 2; 896 break; 897 default: 898 // Should we put this case under "static"? 899 // case kmp_sch_static_steal: 900 schedtype = 3; 901 break; 902 } 903 __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk); 904 } 905 #if KMP_USE_HIER_SCHED 906 if (pr->flags.use_hier) { 907 pr->u.p.count = 0; 908 pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0; 909 } 910 #endif // KMP_USER_HIER_SCHED 911 #endif /* USE_ITT_BUILD */ 912 } 913 914 #ifdef KMP_DEBUG 915 { 916 char *buff; 917 // create format specifiers before the debug output 918 buff = __kmp_str_format( 919 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s " 920 "lb:%%%s ub:%%%s" 921 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" 922 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n", 923 traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec, 924 traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec, 925 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec, 926 traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec); 927 KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb, 928 pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count, 929 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1, 930 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4)); 931 __kmp_str_free(&buff); 932 } 933 #endif 934 #if (KMP_STATIC_STEAL_ENABLED) 935 // It cannot be guaranteed that after execution of a loop with some other 936 // schedule kind all the parm3 variables will contain the same value. Even if 937 // all parm3 will be the same, it still exists a bad case like using 0 and 1 938 // rather than program life-time increment. So the dedicated variable is 939 // required. The 'static_steal_counter' is used. 940 if (schedule == kmp_sch_static_steal) { 941 // Other threads will inspect this variable when searching for a victim. 942 // This is a flag showing that other threads may steal from this thread 943 // since then. 
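// (A potential victim is only stolen from in __kmp_dispatch_next_algorithm
// when its static_steal_counter matches the thief's, i.e. when both threads
// have reached the same static_steal loop instance.)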
944 volatile T *p = &pr->u.p.static_steal_counter; 945 *p = *p + 1; 946 } 947 #endif // ( KMP_STATIC_STEAL_ENABLED ) 948 949 #if OMPT_SUPPORT && OMPT_OPTIONAL 950 if (ompt_enabled.ompt_callback_work) { 951 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 952 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 953 ompt_callbacks.ompt_callback(ompt_callback_work)( 954 ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data), 955 &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid)); 956 } 957 #endif 958 KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic); 959 } 960 961 /* For ordered loops, either __kmp_dispatch_finish() should be called after 962 * every iteration, or __kmp_dispatch_finish_chunk() should be called after 963 * every chunk of iterations. If the ordered section(s) were not executed 964 * for this iteration (or every iteration in this chunk), we need to set the 965 * ordered iteration counters so that the next thread can proceed. */ 966 template <typename UT> 967 static void __kmp_dispatch_finish(int gtid, ident_t *loc) { 968 typedef typename traits_t<UT>::signed_t ST; 969 kmp_info_t *th = __kmp_threads[gtid]; 970 971 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid)); 972 if (!th->th.th_team->t.t_serialized) { 973 974 dispatch_private_info_template<UT> *pr = 975 reinterpret_cast<dispatch_private_info_template<UT> *>( 976 th->th.th_dispatch->th_dispatch_pr_current); 977 dispatch_shared_info_template<UT> volatile *sh = 978 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 979 th->th.th_dispatch->th_dispatch_sh_current); 980 KMP_DEBUG_ASSERT(pr); 981 KMP_DEBUG_ASSERT(sh); 982 KMP_DEBUG_ASSERT(th->th.th_dispatch == 983 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 984 985 if (pr->ordered_bumped) { 986 KD_TRACE( 987 1000, 988 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 989 gtid)); 990 pr->ordered_bumped = 0; 991 } else { 992 UT lower = pr->u.p.ordered_lower; 993 994 #ifdef KMP_DEBUG 995 { 996 char *buff; 997 // create format specifiers before the debug output 998 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: " 999 "ordered_iteration:%%%s lower:%%%s\n", 1000 traits_t<UT>::spec, traits_t<UT>::spec); 1001 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 1002 __kmp_str_free(&buff); 1003 } 1004 #endif 1005 1006 __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower, 1007 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 1008 KMP_MB(); /* is this necessary? 
*/ 1009 #ifdef KMP_DEBUG 1010 { 1011 char *buff; 1012 // create format specifiers before the debug output 1013 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: " 1014 "ordered_iteration:%%%s lower:%%%s\n", 1015 traits_t<UT>::spec, traits_t<UT>::spec); 1016 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 1017 __kmp_str_free(&buff); 1018 } 1019 #endif 1020 1021 test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration); 1022 } // if 1023 } // if 1024 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid)); 1025 } 1026 1027 #ifdef KMP_GOMP_COMPAT 1028 1029 template <typename UT> 1030 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) { 1031 typedef typename traits_t<UT>::signed_t ST; 1032 kmp_info_t *th = __kmp_threads[gtid]; 1033 1034 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid)); 1035 if (!th->th.th_team->t.t_serialized) { 1036 // int cid; 1037 dispatch_private_info_template<UT> *pr = 1038 reinterpret_cast<dispatch_private_info_template<UT> *>( 1039 th->th.th_dispatch->th_dispatch_pr_current); 1040 dispatch_shared_info_template<UT> volatile *sh = 1041 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 1042 th->th.th_dispatch->th_dispatch_sh_current); 1043 KMP_DEBUG_ASSERT(pr); 1044 KMP_DEBUG_ASSERT(sh); 1045 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1046 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1047 1048 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) { 1049 UT lower = pr->u.p.ordered_lower; 1050 UT upper = pr->u.p.ordered_upper; 1051 UT inc = upper - lower + 1; 1052 1053 if (pr->ordered_bumped == inc) { 1054 KD_TRACE( 1055 1000, 1056 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1057 gtid)); 1058 pr->ordered_bumped = 0; 1059 } else { 1060 inc -= pr->ordered_bumped; 1061 1062 #ifdef KMP_DEBUG 1063 { 1064 char *buff; 1065 // create format specifiers before the debug output 1066 buff = __kmp_str_format( 1067 "__kmp_dispatch_finish_chunk: T#%%d before wait: " 1068 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n", 1069 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec); 1070 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper)); 1071 __kmp_str_free(&buff); 1072 } 1073 #endif 1074 1075 __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower, 1076 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 1077 1078 KMP_MB(); /* is this necessary? */ 1079 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting " 1080 "ordered_bumped to zero\n", 1081 gtid)); 1082 pr->ordered_bumped = 0; 1083 //!!!!! TODO check if the inc should be unsigned, or signed??? 
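// The shared ordered_iteration counter is advanced by the remaining "inc"
// below, so threads waiting on later ordered iterations can proceed even
// though this thread skipped (some of) its ordered sections.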
1084 #ifdef KMP_DEBUG 1085 { 1086 char *buff; 1087 // create format specifiers before the debug output 1088 buff = __kmp_str_format( 1089 "__kmp_dispatch_finish_chunk: T#%%d after wait: " 1090 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n", 1091 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec, 1092 traits_t<UT>::spec); 1093 KD_TRACE(1000, 1094 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper)); 1095 __kmp_str_free(&buff); 1096 } 1097 #endif 1098 1099 test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc); 1100 } 1101 // } 1102 } 1103 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid)); 1104 } 1105 1106 #endif /* KMP_GOMP_COMPAT */ 1107 1108 template <typename T> 1109 int __kmp_dispatch_next_algorithm(int gtid, 1110 dispatch_private_info_template<T> *pr, 1111 dispatch_shared_info_template<T> volatile *sh, 1112 kmp_int32 *p_last, T *p_lb, T *p_ub, 1113 typename traits_t<T>::signed_t *p_st, T nproc, 1114 T tid) { 1115 typedef typename traits_t<T>::unsigned_t UT; 1116 typedef typename traits_t<T>::signed_t ST; 1117 typedef typename traits_t<T>::floating_t DBL; 1118 int status = 0; 1119 kmp_int32 last = 0; 1120 T start; 1121 ST incr; 1122 UT limit, trip, init; 1123 kmp_info_t *th = __kmp_threads[gtid]; 1124 kmp_team_t *team = th->th.th_team; 1125 1126 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1127 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1128 KMP_DEBUG_ASSERT(pr); 1129 KMP_DEBUG_ASSERT(sh); 1130 KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc); 1131 #ifdef KMP_DEBUG 1132 { 1133 char *buff; 1134 // create format specifiers before the debug output 1135 buff = 1136 __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p " 1137 "sh:%%p nproc:%%%s tid:%%%s\n", 1138 traits_t<T>::spec, traits_t<T>::spec); 1139 KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid)); 1140 __kmp_str_free(&buff); 1141 } 1142 #endif 1143 1144 // zero trip count 1145 if (pr->u.p.tc == 0) { 1146 KD_TRACE(10, 1147 ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is " 1148 "zero status:%d\n", 1149 gtid, status)); 1150 return 0; 1151 } 1152 1153 switch (pr->schedule) { 1154 #if (KMP_STATIC_STEAL_ENABLED) 1155 case kmp_sch_static_steal: { 1156 T chunk = pr->u.p.parm1; 1157 1158 KD_TRACE(100, 1159 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n", 1160 gtid)); 1161 1162 trip = pr->u.p.tc - 1; 1163 1164 if (traits_t<T>::type_size > 4) { 1165 // use lock for 8-byte and CAS for 4-byte induction 1166 // variable. TODO (optional): check and use 16-byte CAS 1167 kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock; 1168 KMP_DEBUG_ASSERT(lck != NULL); 1169 if (pr->u.p.count < (UT)pr->u.p.ub) { 1170 __kmp_acquire_lock(lck, gtid); 1171 // try to get own chunk of iterations 1172 init = (pr->u.p.count)++; 1173 status = (init < (UT)pr->u.p.ub); 1174 __kmp_release_lock(lck, gtid); 1175 } else { 1176 status = 0; // no own chunks 1177 } 1178 if (!status) { // try to steal 1179 kmp_info_t **other_threads = team->t.t_threads; 1180 int while_limit = nproc; // nproc attempts to find a victim 1181 int while_index = 0; 1182 // TODO: algorithm of searching for a victim 1183 // should be cleaned up and measured 1184 while ((!status) && (while_limit != ++while_index)) { 1185 T remaining; 1186 T victimIdx = pr->u.p.parm4; 1187 T oldVictimIdx = victimIdx ? 
victimIdx - 1 : nproc - 1; 1188 dispatch_private_info_template<T> *victim = 1189 reinterpret_cast<dispatch_private_info_template<T> *>( 1190 other_threads[victimIdx] 1191 ->th.th_dispatch->th_dispatch_pr_current); 1192 while ((victim == NULL || victim == pr || 1193 (*(volatile T *)&victim->u.p.static_steal_counter != 1194 *(volatile T *)&pr->u.p.static_steal_counter)) && 1195 oldVictimIdx != victimIdx) { 1196 victimIdx = (victimIdx + 1) % nproc; 1197 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1198 other_threads[victimIdx] 1199 ->th.th_dispatch->th_dispatch_pr_current); 1200 } 1201 if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter != 1202 *(volatile T *)&pr->u.p.static_steal_counter)) { 1203 continue; // try once more (nproc attempts in total) 1204 // no victim is ready yet to participate in stealing 1205 // because all victims are still in kmp_init_dispatch 1206 } 1207 if (victim->u.p.count + 2 > (UT)victim->u.p.ub) { 1208 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid 1209 continue; // not enough chunks to steal, goto next victim 1210 } 1211 1212 lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock; 1213 KMP_ASSERT(lck != NULL); 1214 __kmp_acquire_lock(lck, gtid); 1215 limit = victim->u.p.ub; // keep initial ub 1216 if (victim->u.p.count >= limit || 1217 (remaining = limit - victim->u.p.count) < 2) { 1218 __kmp_release_lock(lck, gtid); 1219 pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim 1220 continue; // not enough chunks to steal 1221 } 1222 // stealing succeded, reduce victim's ub by 1/4 of undone chunks or 1223 // by 1 1224 if (remaining > 3) { 1225 // steal 1/4 of remaining 1226 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2); 1227 init = (victim->u.p.ub -= (remaining >> 2)); 1228 } else { 1229 // steal 1 chunk of 2 or 3 remaining 1230 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1); 1231 init = (victim->u.p.ub -= 1); 1232 } 1233 __kmp_release_lock(lck, gtid); 1234 1235 KMP_DEBUG_ASSERT(init + 1 <= limit); 1236 pr->u.p.parm4 = victimIdx; // remember victim to steal from 1237 status = 1; 1238 while_index = 0; 1239 // now update own count and ub with stolen range but init chunk 1240 __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid); 1241 pr->u.p.count = init + 1; 1242 pr->u.p.ub = limit; 1243 __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid); 1244 } // while (search for victim) 1245 } // if (try to find victim and steal) 1246 } else { 1247 // 4-byte induction variable, use 8-byte CAS for pair (count, ub) 1248 typedef union { 1249 struct { 1250 UT count; 1251 T ub; 1252 } p; 1253 kmp_int64 b; 1254 } union_i4; 1255 // All operations on 'count' or 'ub' must be combined atomically 1256 // together. 
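// Since sizeof(UT) == sizeof(T) == 4 here, the (count, ub) pair packs into a
// single 8-byte word (union_i4.b); one 64-bit CAS can therefore claim the
// next chunk (count++) or shrink the upper bound during a steal, and readers
// of the full 8 bytes always observe a consistent pair.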
1257 { 1258 union_i4 vold, vnew; 1259 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 1260 vnew = vold; 1261 vnew.p.count++; 1262 while (!KMP_COMPARE_AND_STORE_ACQ64( 1263 (volatile kmp_int64 *)&pr->u.p.count, 1264 *VOLATILE_CAST(kmp_int64 *) & vold.b, 1265 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 1266 KMP_CPU_PAUSE(); 1267 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 1268 vnew = vold; 1269 vnew.p.count++; 1270 } 1271 vnew = vold; 1272 init = vnew.p.count; 1273 status = (init < (UT)vnew.p.ub); 1274 } 1275 1276 if (!status) { 1277 kmp_info_t **other_threads = team->t.t_threads; 1278 int while_limit = nproc; // nproc attempts to find a victim 1279 int while_index = 0; 1280 1281 // TODO: algorithm of searching for a victim 1282 // should be cleaned up and measured 1283 while ((!status) && (while_limit != ++while_index)) { 1284 union_i4 vold, vnew; 1285 kmp_int32 remaining; 1286 T victimIdx = pr->u.p.parm4; 1287 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1; 1288 dispatch_private_info_template<T> *victim = 1289 reinterpret_cast<dispatch_private_info_template<T> *>( 1290 other_threads[victimIdx] 1291 ->th.th_dispatch->th_dispatch_pr_current); 1292 while ((victim == NULL || victim == pr || 1293 (*(volatile T *)&victim->u.p.static_steal_counter != 1294 *(volatile T *)&pr->u.p.static_steal_counter)) && 1295 oldVictimIdx != victimIdx) { 1296 victimIdx = (victimIdx + 1) % nproc; 1297 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1298 other_threads[victimIdx] 1299 ->th.th_dispatch->th_dispatch_pr_current); 1300 } 1301 if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter != 1302 *(volatile T *)&pr->u.p.static_steal_counter)) { 1303 continue; // try once more (nproc attempts in total) 1304 // no victim is ready yet to participate in stealing 1305 // because all victims are still in kmp_init_dispatch 1306 } 1307 pr->u.p.parm4 = victimIdx; // new victim found 1308 while (1) { // CAS loop if victim has enough chunks to steal 1309 vold.b = *(volatile kmp_int64 *)(&victim->u.p.count); 1310 vnew = vold; 1311 1312 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1313 if (vnew.p.count >= (UT)vnew.p.ub || 1314 (remaining = vnew.p.ub - vnew.p.count) < 2) { 1315 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id 1316 break; // not enough chunks to steal, goto next victim 1317 } 1318 if (remaining > 3) { 1319 vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining 1320 } else { 1321 vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining 1322 } 1323 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1324 // TODO: Should this be acquire or release? 
1325 if (KMP_COMPARE_AND_STORE_ACQ64( 1326 (volatile kmp_int64 *)&victim->u.p.count, 1327 *VOLATILE_CAST(kmp_int64 *) & vold.b, 1328 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 1329 // stealing succedded 1330 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1331 vold.p.ub - vnew.p.ub); 1332 status = 1; 1333 while_index = 0; 1334 // now update own count and ub 1335 init = vnew.p.ub; 1336 vold.p.count = init + 1; 1337 #if KMP_ARCH_X86 1338 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b); 1339 #else 1340 *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b; 1341 #endif 1342 break; 1343 } // if (check CAS result) 1344 KMP_CPU_PAUSE(); // CAS failed, repeate attempt 1345 } // while (try to steal from particular victim) 1346 } // while (search for victim) 1347 } // if (try to find victim and steal) 1348 } // if (4-byte induction variable) 1349 if (!status) { 1350 *p_lb = 0; 1351 *p_ub = 0; 1352 if (p_st != NULL) 1353 *p_st = 0; 1354 } else { 1355 start = pr->u.p.parm2; 1356 init *= chunk; 1357 limit = chunk + init - 1; 1358 incr = pr->u.p.st; 1359 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1); 1360 1361 KMP_DEBUG_ASSERT(init <= trip); 1362 if ((last = (limit >= trip)) != 0) 1363 limit = trip; 1364 if (p_st != NULL) 1365 *p_st = incr; 1366 1367 if (incr == 1) { 1368 *p_lb = start + init; 1369 *p_ub = start + limit; 1370 } else { 1371 *p_lb = start + init * incr; 1372 *p_ub = start + limit * incr; 1373 } 1374 1375 if (pr->flags.ordered) { 1376 pr->u.p.ordered_lower = init; 1377 pr->u.p.ordered_upper = limit; 1378 } // if 1379 } // if 1380 break; 1381 } // case 1382 #endif // ( KMP_STATIC_STEAL_ENABLED ) 1383 case kmp_sch_static_balanced: { 1384 KD_TRACE( 1385 10, 1386 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n", 1387 gtid)); 1388 /* check if thread has any iteration to do */ 1389 if ((status = !pr->u.p.count) != 0) { 1390 pr->u.p.count = 1; 1391 *p_lb = pr->u.p.lb; 1392 *p_ub = pr->u.p.ub; 1393 last = pr->u.p.parm1; 1394 if (p_st != NULL) 1395 *p_st = pr->u.p.st; 1396 } else { /* no iterations to do */ 1397 pr->u.p.lb = pr->u.p.ub + pr->u.p.st; 1398 } 1399 } // case 1400 break; 1401 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was 1402 merged here */ 1403 case kmp_sch_static_chunked: { 1404 T parm1; 1405 1406 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d " 1407 "kmp_sch_static_[affinity|chunked] case\n", 1408 gtid)); 1409 parm1 = pr->u.p.parm1; 1410 1411 trip = pr->u.p.tc - 1; 1412 init = parm1 * (pr->u.p.count + tid); 1413 1414 if ((status = (init <= trip)) != 0) { 1415 start = pr->u.p.lb; 1416 incr = pr->u.p.st; 1417 limit = parm1 + init - 1; 1418 1419 if ((last = (limit >= trip)) != 0) 1420 limit = trip; 1421 1422 if (p_st != NULL) 1423 *p_st = incr; 1424 1425 pr->u.p.count += nproc; 1426 1427 if (incr == 1) { 1428 *p_lb = start + init; 1429 *p_ub = start + limit; 1430 } else { 1431 *p_lb = start + init * incr; 1432 *p_ub = start + limit * incr; 1433 } 1434 1435 if (pr->flags.ordered) { 1436 pr->u.p.ordered_lower = init; 1437 pr->u.p.ordered_upper = limit; 1438 } // if 1439 } // if 1440 } // case 1441 break; 1442 1443 case kmp_sch_dynamic_chunked: { 1444 T chunk = pr->u.p.parm1; 1445 1446 KD_TRACE( 1447 100, 1448 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n", 1449 gtid)); 1450 1451 init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 1452 trip = pr->u.p.tc - 1; 1453 1454 if ((status = (init <= trip)) == 0) { 1455 *p_lb = 0; 1456 *p_ub = 0; 1457 if (p_st != NULL) 1458 *p_st = 
0; 1459 } else { 1460 start = pr->u.p.lb; 1461 limit = chunk + init - 1; 1462 incr = pr->u.p.st; 1463 1464 if ((last = (limit >= trip)) != 0) 1465 limit = trip; 1466 1467 if (p_st != NULL) 1468 *p_st = incr; 1469 1470 if (incr == 1) { 1471 *p_lb = start + init; 1472 *p_ub = start + limit; 1473 } else { 1474 *p_lb = start + init * incr; 1475 *p_ub = start + limit * incr; 1476 } 1477 1478 if (pr->flags.ordered) { 1479 pr->u.p.ordered_lower = init; 1480 pr->u.p.ordered_upper = limit; 1481 } // if 1482 } // if 1483 } // case 1484 break; 1485 1486 case kmp_sch_guided_iterative_chunked: { 1487 T chunkspec = pr->u.p.parm1; 1488 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked " 1489 "iterative case\n", 1490 gtid)); 1491 trip = pr->u.p.tc; 1492 // Start atomic part of calculations 1493 while (1) { 1494 ST remaining; // signed, because can be < 0 1495 init = sh->u.s.iteration; // shared value 1496 remaining = trip - init; 1497 if (remaining <= 0) { // AC: need to compare with 0 first 1498 // nothing to do, don't try atomic op 1499 status = 0; 1500 break; 1501 } 1502 if ((T)remaining < 1503 pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default 1504 // use dynamic-style shcedule 1505 // atomically inrement iterations, get old value 1506 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1507 (ST)chunkspec); 1508 remaining = trip - init; 1509 if (remaining <= 0) { 1510 status = 0; // all iterations got by other threads 1511 } else { 1512 // got some iterations to work on 1513 status = 1; 1514 if ((T)remaining > chunkspec) { 1515 limit = init + chunkspec - 1; 1516 } else { 1517 last = 1; // the last chunk 1518 limit = init + remaining - 1; 1519 } // if 1520 } // if 1521 break; 1522 } // if 1523 limit = init + 1524 (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc 1525 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1526 (ST)init, (ST)limit)) { 1527 // CAS was successful, chunk obtained 1528 status = 1; 1529 --limit; 1530 break; 1531 } // if 1532 } // while 1533 if (status != 0) { 1534 start = pr->u.p.lb; 1535 incr = pr->u.p.st; 1536 if (p_st != NULL) 1537 *p_st = incr; 1538 *p_lb = start + init * incr; 1539 *p_ub = start + limit * incr; 1540 if (pr->flags.ordered) { 1541 pr->u.p.ordered_lower = init; 1542 pr->u.p.ordered_upper = limit; 1543 } // if 1544 } else { 1545 *p_lb = 0; 1546 *p_ub = 0; 1547 if (p_st != NULL) 1548 *p_st = 0; 1549 } // if 1550 } // case 1551 break; 1552 1553 #if OMP_45_ENABLED 1554 case kmp_sch_guided_simd: { 1555 // same as iterative but curr-chunk adjusted to be multiple of given 1556 // chunk 1557 T chunk = pr->u.p.parm1; 1558 KD_TRACE(100, 1559 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n", 1560 gtid)); 1561 trip = pr->u.p.tc; 1562 // Start atomic part of calculations 1563 while (1) { 1564 ST remaining; // signed, because can be < 0 1565 init = sh->u.s.iteration; // shared value 1566 remaining = trip - init; 1567 if (remaining <= 0) { // AC: need to compare with 0 first 1568 status = 0; // nothing to do, don't try atomic op 1569 break; 1570 } 1571 KMP_DEBUG_ASSERT(init % chunk == 0); 1572 // compare with K*nproc*(chunk+1), K=2 by default 1573 if ((T)remaining < pr->u.p.parm2) { 1574 // use dynamic-style shcedule 1575 // atomically inrement iterations, get old value 1576 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1577 (ST)chunk); 1578 remaining = trip - init; 1579 if (remaining <= 0) { 1580 status = 0; // all iterations got by other threads 1581 
} else { 1582 // got some iterations to work on 1583 status = 1; 1584 if ((T)remaining > chunk) { 1585 limit = init + chunk - 1; 1586 } else { 1587 last = 1; // the last chunk 1588 limit = init + remaining - 1; 1589 } // if 1590 } // if 1591 break; 1592 } // if 1593 // divide by K*nproc 1594 UT span = remaining * (*(double *)&pr->u.p.parm3); 1595 UT rem = span % chunk; 1596 if (rem) // adjust so that span%chunk == 0 1597 span += chunk - rem; 1598 limit = init + span; 1599 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1600 (ST)init, (ST)limit)) { 1601 // CAS was successful, chunk obtained 1602 status = 1; 1603 --limit; 1604 break; 1605 } // if 1606 } // while 1607 if (status != 0) { 1608 start = pr->u.p.lb; 1609 incr = pr->u.p.st; 1610 if (p_st != NULL) 1611 *p_st = incr; 1612 *p_lb = start + init * incr; 1613 *p_ub = start + limit * incr; 1614 if (pr->flags.ordered) { 1615 pr->u.p.ordered_lower = init; 1616 pr->u.p.ordered_upper = limit; 1617 } // if 1618 } else { 1619 *p_lb = 0; 1620 *p_ub = 0; 1621 if (p_st != NULL) 1622 *p_st = 0; 1623 } // if 1624 } // case 1625 break; 1626 #endif // OMP_45_ENABLED 1627 1628 case kmp_sch_guided_analytical_chunked: { 1629 T chunkspec = pr->u.p.parm1; 1630 UT chunkIdx; 1631 #if KMP_USE_X87CONTROL 1632 /* for storing original FPCW value for Windows* OS on 1633 IA-32 architecture 8-byte version */ 1634 unsigned int oldFpcw; 1635 unsigned int fpcwSet = 0; 1636 #endif 1637 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d " 1638 "kmp_sch_guided_analytical_chunked case\n", 1639 gtid)); 1640 1641 trip = pr->u.p.tc; 1642 1643 KMP_DEBUG_ASSERT(nproc > 1); 1644 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip); 1645 1646 while (1) { /* this while loop is a safeguard against unexpected zero 1647 chunk sizes */ 1648 chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 1649 if (chunkIdx >= (UT)pr->u.p.parm2) { 1650 --trip; 1651 /* use dynamic-style scheduling */ 1652 init = chunkIdx * chunkspec + pr->u.p.count; 1653 /* need to verify init > 0 in case of overflow in the above 1654 * calculation */ 1655 if ((status = (init > 0 && init <= trip)) != 0) { 1656 limit = init + chunkspec - 1; 1657 1658 if ((last = (limit >= trip)) != 0) 1659 limit = trip; 1660 } 1661 break; 1662 } else { 1663 /* use exponential-style scheduling */ 1664 /* The following check is to workaround the lack of long double precision on 1665 Windows* OS. 1666 This check works around the possible effect that init != 0 for chunkIdx == 0. 
1667 */ 1668 #if KMP_USE_X87CONTROL 1669 /* If we haven't already done so, save original 1670 FPCW and set precision to 64-bit, as Windows* OS 1671 on IA-32 architecture defaults to 53-bit */ 1672 if (!fpcwSet) { 1673 oldFpcw = _control87(0, 0); 1674 _control87(_PC_64, _MCW_PC); 1675 fpcwSet = 0x30000; 1676 } 1677 #endif 1678 if (chunkIdx) { 1679 init = __kmp_dispatch_guided_remaining<T>( 1680 trip, *(DBL *)&pr->u.p.parm3, chunkIdx); 1681 KMP_DEBUG_ASSERT(init); 1682 init = trip - init; 1683 } else 1684 init = 0; 1685 limit = trip - __kmp_dispatch_guided_remaining<T>( 1686 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1); 1687 KMP_ASSERT(init <= limit); 1688 if (init < limit) { 1689 KMP_DEBUG_ASSERT(limit <= trip); 1690 --limit; 1691 status = 1; 1692 break; 1693 } // if 1694 } // if 1695 } // while (1) 1696 #if KMP_USE_X87CONTROL 1697 /* restore FPCW if necessary 1698 AC: check fpcwSet flag first because oldFpcw can be uninitialized here 1699 */ 1700 if (fpcwSet && (oldFpcw & fpcwSet)) 1701 _control87(oldFpcw, _MCW_PC); 1702 #endif 1703 if (status != 0) { 1704 start = pr->u.p.lb; 1705 incr = pr->u.p.st; 1706 if (p_st != NULL) 1707 *p_st = incr; 1708 *p_lb = start + init * incr; 1709 *p_ub = start + limit * incr; 1710 if (pr->flags.ordered) { 1711 pr->u.p.ordered_lower = init; 1712 pr->u.p.ordered_upper = limit; 1713 } 1714 } else { 1715 *p_lb = 0; 1716 *p_ub = 0; 1717 if (p_st != NULL) 1718 *p_st = 0; 1719 } 1720 } // case 1721 break; 1722 1723 case kmp_sch_trapezoidal: { 1724 UT index; 1725 T parm2 = pr->u.p.parm2; 1726 T parm3 = pr->u.p.parm3; 1727 T parm4 = pr->u.p.parm4; 1728 KD_TRACE(100, 1729 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n", 1730 gtid)); 1731 1732 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration); 1733 1734 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2; 1735 trip = pr->u.p.tc - 1; 1736 1737 if ((status = ((T)index < parm3 && init <= trip)) == 0) { 1738 *p_lb = 0; 1739 *p_ub = 0; 1740 if (p_st != NULL) 1741 *p_st = 0; 1742 } else { 1743 start = pr->u.p.lb; 1744 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1; 1745 incr = pr->u.p.st; 1746 1747 if ((last = (limit >= trip)) != 0) 1748 limit = trip; 1749 1750 if (p_st != NULL) 1751 *p_st = incr; 1752 1753 if (incr == 1) { 1754 *p_lb = start + init; 1755 *p_ub = start + limit; 1756 } else { 1757 *p_lb = start + init * incr; 1758 *p_ub = start + limit * incr; 1759 } 1760 1761 if (pr->flags.ordered) { 1762 pr->u.p.ordered_lower = init; 1763 pr->u.p.ordered_upper = limit; 1764 } // if 1765 } // if 1766 } // case 1767 break; 1768 default: { 1769 status = 0; // to avoid complaints on uninitialized variable use 1770 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 1771 KMP_HNT(GetNewerLibrary), // Hint 1772 __kmp_msg_null // Variadic argument list terminator 1773 ); 1774 } break; 1775 } // switch 1776 if (p_last) 1777 *p_last = last; 1778 #ifdef KMP_DEBUG 1779 if (pr->flags.ordered) { 1780 char *buff; 1781 // create format specifiers before the debug output 1782 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d " 1783 "ordered_lower:%%%s ordered_upper:%%%s\n", 1784 traits_t<UT>::spec, traits_t<UT>::spec); 1785 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper)); 1786 __kmp_str_free(&buff); 1787 } 1788 { 1789 char *buff; 1790 // create format specifiers before the debug output 1791 buff = __kmp_str_format( 1792 "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d " 1793 "p_lb:%%%s p_ub:%%%s p_st:%%%s\n", 1794 
traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 1795 KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st)); 1796 __kmp_str_free(&buff); 1797 } 1798 #endif 1799 return status; 1800 } 1801 1802 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more 1803 work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini() 1804 is not called. */ 1805 #if OMPT_SUPPORT && OMPT_OPTIONAL 1806 #define OMPT_LOOP_END \ 1807 if (status == 0) { \ 1808 if (ompt_enabled.ompt_callback_work) { \ 1809 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \ 1810 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \ 1811 ompt_callbacks.ompt_callback(ompt_callback_work)( \ 1812 ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \ 1813 &(task_info->task_data), 0, codeptr); \ 1814 } \ 1815 } 1816 // TODO: implement count 1817 #else 1818 #define OMPT_LOOP_END // no-op 1819 #endif 1820 1821 #if KMP_STATS_ENABLED 1822 #define KMP_STATS_LOOP_END \ 1823 { \ 1824 kmp_int64 u, l, t, i; \ 1825 l = (kmp_int64)(*p_lb); \ 1826 u = (kmp_int64)(*p_ub); \ 1827 i = (kmp_int64)(pr->u.p.st); \ 1828 if (status == 0) { \ 1829 t = 0; \ 1830 KMP_POP_PARTITIONED_TIMER(); \ 1831 } else if (i == 1) { \ 1832 if (u >= l) \ 1833 t = u - l + 1; \ 1834 else \ 1835 t = 0; \ 1836 } else if (i < 0) { \ 1837 if (l >= u) \ 1838 t = (l - u) / (-i) + 1; \ 1839 else \ 1840 t = 0; \ 1841 } else { \ 1842 if (u >= l) \ 1843 t = (u - l) / i + 1; \ 1844 else \ 1845 t = 0; \ 1846 } \ 1847 KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \ 1848 } 1849 #else 1850 #define KMP_STATS_LOOP_END /* Nothing */ 1851 #endif 1852 1853 template <typename T> 1854 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, 1855 T *p_lb, T *p_ub, 1856 typename traits_t<T>::signed_t *p_st 1857 #if OMPT_SUPPORT && OMPT_OPTIONAL 1858 , 1859 void *codeptr 1860 #endif 1861 ) { 1862 1863 typedef typename traits_t<T>::unsigned_t UT; 1864 typedef typename traits_t<T>::signed_t ST; 1865 // This is potentially slightly misleading: schedule(runtime) will appear here 1866 // even if the actual runtime schedule is static. (This points out a 1867 // disadvantage of schedule(runtime): even when static scheduling is used, it 1868 // costs more than a compile-time choice of static scheduling would.)
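// __kmp_dispatch_next() is the engine behind the __kmpc_dispatch_next_*
// entry points below: it returns nonzero and fills *p_lb / *p_ub (inclusive
// bounds) and *p_st with this thread's next chunk, or zero once the loop is
// exhausted. Two paths follow: a serialized team reuses the thread-local
// th_disp_buffer directly, while an active team goes through the shared
// dispatch buffer via __kmp_dispatch_next_algorithm() (or the hierarchical
// scheduler when KMP_USE_HIER_SCHED is enabled).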
1869 KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling); 1870 1871 int status; 1872 dispatch_private_info_template<T> *pr; 1873 kmp_info_t *th = __kmp_threads[gtid]; 1874 kmp_team_t *team = th->th.th_team; 1875 1876 KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL 1877 KD_TRACE( 1878 1000, 1879 ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n", 1880 gtid, p_lb, p_ub, p_st, p_last)); 1881 1882 if (team->t.t_serialized) { 1883 /* NOTE: serialize this dispatch becase we are not at the active level */ 1884 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 1885 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 1886 KMP_DEBUG_ASSERT(pr); 1887 1888 if ((status = (pr->u.p.tc != 0)) == 0) { 1889 *p_lb = 0; 1890 *p_ub = 0; 1891 // if ( p_last != NULL ) 1892 // *p_last = 0; 1893 if (p_st != NULL) 1894 *p_st = 0; 1895 if (__kmp_env_consistency_check) { 1896 if (pr->pushed_ws != ct_none) { 1897 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 1898 } 1899 } 1900 } else if (pr->flags.nomerge) { 1901 kmp_int32 last; 1902 T start; 1903 UT limit, trip, init; 1904 ST incr; 1905 T chunk = pr->u.p.parm1; 1906 1907 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", 1908 gtid)); 1909 1910 init = chunk * pr->u.p.count++; 1911 trip = pr->u.p.tc - 1; 1912 1913 if ((status = (init <= trip)) == 0) { 1914 *p_lb = 0; 1915 *p_ub = 0; 1916 // if ( p_last != NULL ) 1917 // *p_last = 0; 1918 if (p_st != NULL) 1919 *p_st = 0; 1920 if (__kmp_env_consistency_check) { 1921 if (pr->pushed_ws != ct_none) { 1922 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 1923 } 1924 } 1925 } else { 1926 start = pr->u.p.lb; 1927 limit = chunk + init - 1; 1928 incr = pr->u.p.st; 1929 1930 if ((last = (limit >= trip)) != 0) { 1931 limit = trip; 1932 #if KMP_OS_WINDOWS 1933 pr->u.p.last_upper = pr->u.p.ub; 1934 #endif /* KMP_OS_WINDOWS */ 1935 } 1936 if (p_last != NULL) 1937 *p_last = last; 1938 if (p_st != NULL) 1939 *p_st = incr; 1940 if (incr == 1) { 1941 *p_lb = start + init; 1942 *p_ub = start + limit; 1943 } else { 1944 *p_lb = start + init * incr; 1945 *p_ub = start + limit * incr; 1946 } 1947 1948 if (pr->flags.ordered) { 1949 pr->u.p.ordered_lower = init; 1950 pr->u.p.ordered_upper = limit; 1951 #ifdef KMP_DEBUG 1952 { 1953 char *buff; 1954 // create format specifiers before the debug output 1955 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 1956 "ordered_lower:%%%s ordered_upper:%%%s\n", 1957 traits_t<UT>::spec, traits_t<UT>::spec); 1958 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 1959 pr->u.p.ordered_upper)); 1960 __kmp_str_free(&buff); 1961 } 1962 #endif 1963 } // if 1964 } // if 1965 } else { 1966 pr->u.p.tc = 0; 1967 *p_lb = pr->u.p.lb; 1968 *p_ub = pr->u.p.ub; 1969 #if KMP_OS_WINDOWS 1970 pr->u.p.last_upper = *p_ub; 1971 #endif /* KMP_OS_WINDOWS */ 1972 if (p_last != NULL) 1973 *p_last = TRUE; 1974 if (p_st != NULL) 1975 *p_st = pr->u.p.st; 1976 } // if 1977 #ifdef KMP_DEBUG 1978 { 1979 char *buff; 1980 // create format specifiers before the debug output 1981 buff = __kmp_str_format( 1982 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " 1983 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", 1984 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 1985 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status)); 1986 __kmp_str_free(&buff); 1987 } 1988 #endif 1989 #if INCLUDE_SSC_MARKS 1990 SSC_MARK_DISPATCH_NEXT(); 1991 #endif 1992 OMPT_LOOP_END; 1993 
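// Note: KMP_STATS_LOOP_END (defined above) recovers the chunk's iteration
// count from *p_lb, *p_ub and the stride pr->u.p.st: u - l + 1 for unit
// stride, otherwise (u - l) / i + 1 (or (l - u) / (-i) + 1 for negative
// strides). For example, *p_lb == 0, *p_ub == 9 with a stride of 2 gives
// (9 - 0) / 2 + 1 == 5 iterations.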
KMP_STATS_LOOP_END; 1994 return status; 1995 } else { 1996 kmp_int32 last = 0; 1997 dispatch_shared_info_template<T> volatile *sh; 1998 1999 KMP_DEBUG_ASSERT(th->th.th_dispatch == 2000 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 2001 2002 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 2003 th->th.th_dispatch->th_dispatch_pr_current); 2004 KMP_DEBUG_ASSERT(pr); 2005 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 2006 th->th.th_dispatch->th_dispatch_sh_current); 2007 KMP_DEBUG_ASSERT(sh); 2008 2009 #if KMP_USE_HIER_SCHED 2010 if (pr->flags.use_hier) 2011 status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st); 2012 else 2013 #endif // KMP_USE_HIER_SCHED 2014 status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub, 2015 p_st, th->th.th_team_nproc, 2016 th->th.th_info.ds.ds_tid); 2017 // status == 0: no more iterations to execute 2018 if (status == 0) { 2019 UT num_done; 2020 2021 num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done); 2022 #ifdef KMP_DEBUG 2023 { 2024 char *buff; 2025 // create format specifiers before the debug output 2026 buff = __kmp_str_format( 2027 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", 2028 traits_t<UT>::spec); 2029 KD_TRACE(10, (buff, gtid, sh->u.s.num_done)); 2030 __kmp_str_free(&buff); 2031 } 2032 #endif 2033 2034 #if KMP_USE_HIER_SCHED 2035 pr->flags.use_hier = FALSE; 2036 #endif 2037 if ((ST)num_done == th->th.th_team_nproc - 1) { 2038 #if (KMP_STATIC_STEAL_ENABLED) 2039 if (pr->schedule == kmp_sch_static_steal && 2040 traits_t<T>::type_size > 4) { 2041 int i; 2042 kmp_info_t **other_threads = team->t.t_threads; 2043 // loop complete, safe to destroy locks used for stealing 2044 for (i = 0; i < th->th.th_team_nproc; ++i) { 2045 kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock; 2046 KMP_ASSERT(lck != NULL); 2047 __kmp_destroy_lock(lck); 2048 __kmp_free(lck); 2049 other_threads[i]->th.th_dispatch->th_steal_lock = NULL; 2050 } 2051 } 2052 #endif 2053 /* NOTE: release this buffer to be reused */ 2054 2055 KMP_MB(); /* Flush all pending memory write invalidates. */ 2056 2057 sh->u.s.num_done = 0; 2058 sh->u.s.iteration = 0; 2059 2060 /* TODO replace with general release procedure? */ 2061 if (pr->flags.ordered) { 2062 sh->u.s.ordered_iteration = 0; 2063 } 2064 2065 KMP_MB(); /* Flush all pending memory write invalidates. */ 2066 2067 sh->buffer_index += __kmp_dispatch_num_buffers; 2068 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", 2069 gtid, sh->buffer_index)); 2070 2071 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 2072 2073 } // if 2074 if (__kmp_env_consistency_check) { 2075 if (pr->pushed_ws != ct_none) { 2076 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 2077 } 2078 } 2079 2080 th->th.th_dispatch->th_deo_fcn = NULL; 2081 th->th.th_dispatch->th_dxo_fcn = NULL; 2082 th->th.th_dispatch->th_dispatch_sh_current = NULL; 2083 th->th.th_dispatch->th_dispatch_pr_current = NULL; 2084 } // if (status == 0) 2085 #if KMP_OS_WINDOWS 2086 else if (last) { 2087 pr->u.p.last_upper = pr->u.p.ub; 2088 } 2089 #endif /* KMP_OS_WINDOWS */ 2090 if (p_last != NULL && status != 0) 2091 *p_last = last; 2092 } // if 2093 2094 #ifdef KMP_DEBUG 2095 { 2096 char *buff; 2097 // create format specifiers before the debug output 2098 buff = __kmp_str_format( 2099 "__kmp_dispatch_next: T#%%d normal case: " 2100 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n", 2101 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 2102 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, 2103 (p_last ? *p_last : 0), status)); 2104 __kmp_str_free(&buff); 2105 } 2106 #endif 2107 #if INCLUDE_SSC_MARKS 2108 SSC_MARK_DISPATCH_NEXT(); 2109 #endif 2110 OMPT_LOOP_END; 2111 KMP_STATS_LOOP_END; 2112 return status; 2113 } 2114 2115 template <typename T> 2116 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid, 2117 kmp_int32 *plastiter, T *plower, T *pupper, 2118 typename traits_t<T>::signed_t incr) { 2119 typedef typename traits_t<T>::unsigned_t UT; 2120 kmp_uint32 team_id; 2121 kmp_uint32 nteams; 2122 UT trip_count; 2123 kmp_team_t *team; 2124 kmp_info_t *th; 2125 2126 KMP_DEBUG_ASSERT(plastiter && plower && pupper); 2127 KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid)); 2128 #ifdef KMP_DEBUG 2129 typedef typename traits_t<T>::signed_t ST; 2130 { 2131 char *buff; 2132 // create format specifiers before the debug output 2133 buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d " 2134 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n", 2135 traits_t<T>::spec, traits_t<T>::spec, 2136 traits_t<ST>::spec, traits_t<T>::spec); 2137 KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr)); 2138 __kmp_str_free(&buff); 2139 } 2140 #endif 2141 2142 if (__kmp_env_consistency_check) { 2143 if (incr == 0) { 2144 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, 2145 loc); 2146 } 2147 if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) { 2148 // The loop is illegal. 
2149 // Some zero-trip loops maintained by compiler, e.g.: 2150 // for(i=10;i<0;++i) // lower >= upper - run-time check 2151 // for(i=0;i>10;--i) // lower <= upper - run-time check 2152 // for(i=0;i>10;++i) // incr > 0 - compile-time check 2153 // for(i=10;i<0;--i) // incr < 0 - compile-time check 2154 // Compiler does not check the following illegal loops: 2155 // for(i=0;i<10;i+=incr) // where incr<0 2156 // for(i=10;i>0;i-=incr) // where incr<0 2157 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc); 2158 } 2159 } 2160 th = __kmp_threads[gtid]; 2161 team = th->th.th_team; 2162 #if OMP_40_ENABLED 2163 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct 2164 nteams = th->th.th_teams_size.nteams; 2165 #endif 2166 team_id = team->t.t_master_tid; 2167 KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc); 2168 2169 // compute global trip count 2170 if (incr == 1) { 2171 trip_count = *pupper - *plower + 1; 2172 } else if (incr == -1) { 2173 trip_count = *plower - *pupper + 1; 2174 } else if (incr > 0) { 2175 // upper-lower can exceed the limit of signed type 2176 trip_count = (UT)(*pupper - *plower) / incr + 1; 2177 } else { 2178 trip_count = (UT)(*plower - *pupper) / (-incr) + 1; 2179 } 2180 2181 if (trip_count <= nteams) { 2182 KMP_DEBUG_ASSERT( 2183 __kmp_static == kmp_sch_static_greedy || 2184 __kmp_static == 2185 kmp_sch_static_balanced); // Unknown static scheduling type. 2186 // only some teams get single iteration, others get nothing 2187 if (team_id < trip_count) { 2188 *pupper = *plower = *plower + team_id * incr; 2189 } else { 2190 *plower = *pupper + incr; // zero-trip loop 2191 } 2192 if (plastiter != NULL) 2193 *plastiter = (team_id == trip_count - 1); 2194 } else { 2195 if (__kmp_static == kmp_sch_static_balanced) { 2196 UT chunk = trip_count / nteams; 2197 UT extras = trip_count % nteams; 2198 *plower += 2199 incr * (team_id * chunk + (team_id < extras ? team_id : extras)); 2200 *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr); 2201 if (plastiter != NULL) 2202 *plastiter = (team_id == nteams - 1); 2203 } else { 2204 T chunk_inc_count = 2205 (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr; 2206 T upper = *pupper; 2207 KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy); 2208 // Unknown static scheduling type. 2209 *plower += team_id * chunk_inc_count; 2210 *pupper = *plower + chunk_inc_count - incr; 2211 // Check/correct bounds if needed 2212 if (incr > 0) { 2213 if (*pupper < *plower) 2214 *pupper = traits_t<T>::max_value; 2215 if (plastiter != NULL) 2216 *plastiter = *plower <= upper && *pupper > upper - incr; 2217 if (*pupper > upper) 2218 *pupper = upper; // tracker C73258 2219 } else { 2220 if (*pupper > *plower) 2221 *pupper = traits_t<T>::min_value; 2222 if (plastiter != NULL) 2223 *plastiter = *plower >= upper && *pupper < upper - incr; 2224 if (*pupper < upper) 2225 *pupper = upper; // tracker C73258 2226 } 2227 } 2228 } 2229 } 2230 2231 //----------------------------------------------------------------------------- 2232 // Dispatch routines 2233 // Transfer call to template< type T > 2234 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule, 2235 // T lb, T ub, ST st, ST chunk ) 2236 extern "C" { 2237 2238 /*! 
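Illustrative call sequence: a sketch of how compiler-generated code might
drive these entry points for a loop such as
#pragma omp for schedule(dynamic, 4) over i = 0..99. Here loc and gtid are
the usual source-location and global-thread-id arguments, body(i) is a
placeholder, and the exact code a compiler emits varies:

@code
kmp_int32 lb, ub, st, last;
__kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, 0, 99, 1, 4);
while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
  for (kmp_int32 i = lb; i <= ub; i += st)
    body(i); // placeholder loop body
}
@endcode
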
2239 @ingroup WORK_SHARING 2240 @{ 2241 @param loc Source location 2242 @param gtid Global thread id 2243 @param schedule Schedule type 2244 @param lb Lower bound 2245 @param ub Upper bound 2246 @param st Step (or increment if you prefer) 2247 @param chunk The chunk size to block with 2248 2249 This function prepares the runtime to start a dynamically scheduled for loop, 2250 saving the loop arguments. 2251 These functions are all identical apart from the types of the arguments. 2252 */ 2253 2254 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2255 enum sched_type schedule, kmp_int32 lb, 2256 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) { 2257 KMP_DEBUG_ASSERT(__kmp_init_serial); 2258 #if OMPT_SUPPORT && OMPT_OPTIONAL 2259 OMPT_STORE_RETURN_ADDRESS(gtid); 2260 #endif 2261 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2262 } 2263 /*! 2264 See @ref __kmpc_dispatch_init_4 2265 */ 2266 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2267 enum sched_type schedule, kmp_uint32 lb, 2268 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) { 2269 KMP_DEBUG_ASSERT(__kmp_init_serial); 2270 #if OMPT_SUPPORT && OMPT_OPTIONAL 2271 OMPT_STORE_RETURN_ADDRESS(gtid); 2272 #endif 2273 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2274 } 2275 2276 /*! 2277 See @ref __kmpc_dispatch_init_4 2278 */ 2279 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2280 enum sched_type schedule, kmp_int64 lb, 2281 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) { 2282 KMP_DEBUG_ASSERT(__kmp_init_serial); 2283 #if OMPT_SUPPORT && OMPT_OPTIONAL 2284 OMPT_STORE_RETURN_ADDRESS(gtid); 2285 #endif 2286 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2287 } 2288 2289 /*! 2290 See @ref __kmpc_dispatch_init_4 2291 */ 2292 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2293 enum sched_type schedule, kmp_uint64 lb, 2294 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) { 2295 KMP_DEBUG_ASSERT(__kmp_init_serial); 2296 #if OMPT_SUPPORT && OMPT_OPTIONAL 2297 OMPT_STORE_RETURN_ADDRESS(gtid); 2298 #endif 2299 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2300 } 2301 2302 /*! 2303 See @ref __kmpc_dispatch_init_4 2304 2305 These functions differ from the __kmpc_dispatch_init set in that they are 2306 called for the composite distribute parallel for construct. Thus, before the 2307 regular iterations are dispatched, the per-team iteration space must be computed. 2308 2309 These functions are all identical apart from the types of the arguments.
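For illustration, with the balanced static split (__kmp_static ==
kmp_sch_static_balanced), lb = 0, ub = 99, st = 1 and 4 teams,
__kmp_dist_get_bounds assigns team 0 the sub-range [0, 24], team 1 [25, 49],
team 2 [50, 74] and team 3 [75, 99]; the requested schedule is then applied by
__kmp_dispatch_init within each team's sub-range.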
2310 */ 2311 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2312 enum sched_type schedule, kmp_int32 *p_last, 2313 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, 2314 kmp_int32 chunk) { 2315 KMP_DEBUG_ASSERT(__kmp_init_serial); 2316 #if OMPT_SUPPORT && OMPT_OPTIONAL 2317 OMPT_STORE_RETURN_ADDRESS(gtid); 2318 #endif 2319 __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st); 2320 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2321 } 2322 2323 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2324 enum sched_type schedule, kmp_int32 *p_last, 2325 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, 2326 kmp_int32 chunk) { 2327 KMP_DEBUG_ASSERT(__kmp_init_serial); 2328 #if OMPT_SUPPORT && OMPT_OPTIONAL 2329 OMPT_STORE_RETURN_ADDRESS(gtid); 2330 #endif 2331 __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st); 2332 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2333 } 2334 2335 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2336 enum sched_type schedule, kmp_int32 *p_last, 2337 kmp_int64 lb, kmp_int64 ub, kmp_int64 st, 2338 kmp_int64 chunk) { 2339 KMP_DEBUG_ASSERT(__kmp_init_serial); 2340 #if OMPT_SUPPORT && OMPT_OPTIONAL 2341 OMPT_STORE_RETURN_ADDRESS(gtid); 2342 #endif 2343 __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st); 2344 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2345 } 2346 2347 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2348 enum sched_type schedule, kmp_int32 *p_last, 2349 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, 2350 kmp_int64 chunk) { 2351 KMP_DEBUG_ASSERT(__kmp_init_serial); 2352 #if OMPT_SUPPORT && OMPT_OPTIONAL 2353 OMPT_STORE_RETURN_ADDRESS(gtid); 2354 #endif 2355 __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st); 2356 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2357 } 2358 2359 /*! 2360 @param loc Source code location 2361 @param gtid Global thread id 2362 @param p_last Pointer to a flag set to one if this is the last chunk or zero 2363 otherwise 2364 @param p_lb Pointer to the lower bound for the next chunk of work 2365 @param p_ub Pointer to the upper bound for the next chunk of work 2366 @param p_st Pointer to the stride for the next chunk of work 2367 @return one if there is work to be done, zero otherwise 2368 2369 Get the next dynamically allocated chunk of work for this thread. 2370 If there is no more work, then the lb,ub and stride need not be modified. 2371 */ 2372 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2373 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) { 2374 #if OMPT_SUPPORT && OMPT_OPTIONAL 2375 OMPT_STORE_RETURN_ADDRESS(gtid); 2376 #endif 2377 return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st 2378 #if OMPT_SUPPORT && OMPT_OPTIONAL 2379 , 2380 OMPT_LOAD_RETURN_ADDRESS(gtid) 2381 #endif 2382 ); 2383 } 2384 2385 /*! 2386 See @ref __kmpc_dispatch_next_4 2387 */ 2388 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2389 kmp_uint32 *p_lb, kmp_uint32 *p_ub, 2390 kmp_int32 *p_st) { 2391 #if OMPT_SUPPORT && OMPT_OPTIONAL 2392 OMPT_STORE_RETURN_ADDRESS(gtid); 2393 #endif 2394 return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st 2395 #if OMPT_SUPPORT && OMPT_OPTIONAL 2396 , 2397 OMPT_LOAD_RETURN_ADDRESS(gtid) 2398 #endif 2399 ); 2400 } 2401 2402 /*! 
2403 See @ref __kmpc_dispatch_next_4 2404 */ 2405 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2406 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) { 2407 #if OMPT_SUPPORT && OMPT_OPTIONAL 2408 OMPT_STORE_RETURN_ADDRESS(gtid); 2409 #endif 2410 return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st 2411 #if OMPT_SUPPORT && OMPT_OPTIONAL 2412 , 2413 OMPT_LOAD_RETURN_ADDRESS(gtid) 2414 #endif 2415 ); 2416 } 2417 2418 /*! 2419 See @ref __kmpc_dispatch_next_4 2420 */ 2421 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2422 kmp_uint64 *p_lb, kmp_uint64 *p_ub, 2423 kmp_int64 *p_st) { 2424 #if OMPT_SUPPORT && OMPT_OPTIONAL 2425 OMPT_STORE_RETURN_ADDRESS(gtid); 2426 #endif 2427 return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st 2428 #if OMPT_SUPPORT && OMPT_OPTIONAL 2429 , 2430 OMPT_LOAD_RETURN_ADDRESS(gtid) 2431 #endif 2432 ); 2433 } 2434 2435 /*! 2436 @param loc Source code location 2437 @param gtid Global thread id 2438 2439 Mark the end of a dynamic loop. 2440 */ 2441 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) { 2442 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2443 } 2444 2445 /*! 2446 See @ref __kmpc_dispatch_fini_4 2447 */ 2448 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) { 2449 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2450 } 2451 2452 /*! 2453 See @ref __kmpc_dispatch_fini_4 2454 */ 2455 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) { 2456 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2457 } 2458 2459 /*! 2460 See @ref __kmpc_dispatch_fini_4 2461 */ 2462 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) { 2463 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2464 } 2465 /*! @} */ 2466 2467 //----------------------------------------------------------------------------- 2468 // Non-template routines from kmp_dispatch.cpp used in other sources 2469 2470 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) { 2471 return value == checker; 2472 } 2473 2474 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) { 2475 return value != checker; 2476 } 2477 2478 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) { 2479 return value < checker; 2480 } 2481 2482 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) { 2483 return value >= checker; 2484 } 2485 2486 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) { 2487 return value <= checker; 2488 } 2489 2490 kmp_uint32 2491 __kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker, 2492 kmp_uint32 (*pred)(kmp_uint32, kmp_uint32), 2493 void *obj // Higher-level synchronization object, or NULL. 2494 ) { 2495 // note: we may not belong to a team at this point 2496 volatile kmp_uint32 *spin = spinner; 2497 kmp_uint32 check = checker; 2498 kmp_uint32 spins; 2499 kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred; 2500 kmp_uint32 r; 2501 2502 KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin)); 2503 KMP_INIT_YIELD(spins); 2504 // main wait spin loop 2505 while (!f(r = TCR_4(*spin), check)) { 2506 KMP_FSYNC_SPIN_PREPARE(obj); 2507 /* GEH - remove this since it was accidentally introduced when kmp_wait was 2508 split. 
It causes problems with infinite recursion because of exit lock */ 2509 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 2510 __kmp_abort_thread(); */ 2511 2512 /* if we have waited a bit, or are oversubscribed, yield */ 2513 /* pause is in the following code */ 2514 KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc); 2515 KMP_YIELD_SPIN(spins); 2516 } 2517 KMP_FSYNC_SPIN_ACQUIRED(obj); 2518 return r; 2519 } 2520 2521 void __kmp_wait_yield_4_ptr( 2522 void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32), 2523 void *obj // Higher-level synchronization object, or NULL. 2524 ) { 2525 // note: we may not belong to a team at this point 2526 void *spin = spinner; 2527 kmp_uint32 check = checker; 2528 kmp_uint32 spins; 2529 kmp_uint32 (*f)(void *, kmp_uint32) = pred; 2530 2531 KMP_FSYNC_SPIN_INIT(obj, spin); 2532 KMP_INIT_YIELD(spins); 2533 // main wait spin loop 2534 while (!f(spin, check)) { 2535 KMP_FSYNC_SPIN_PREPARE(obj); 2536 /* if we have waited a bit, or are oversubscribed, yield */ 2537 /* pause is in the following code */ 2538 KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc); 2539 KMP_YIELD_SPIN(spins); 2540 } 2541 KMP_FSYNC_SPIN_ACQUIRED(obj); 2542 } 2543 2544 } // extern "C" 2545 2546 #ifdef KMP_GOMP_COMPAT 2547 2548 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2549 enum sched_type schedule, kmp_int32 lb, 2550 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk, 2551 int push_ws) { 2552 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, 2553 push_ws); 2554 } 2555 2556 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2557 enum sched_type schedule, kmp_uint32 lb, 2558 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk, 2559 int push_ws) { 2560 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, 2561 push_ws); 2562 } 2563 2564 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2565 enum sched_type schedule, kmp_int64 lb, 2566 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk, 2567 int push_ws) { 2568 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, 2569 push_ws); 2570 } 2571 2572 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2573 enum sched_type schedule, kmp_uint64 lb, 2574 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk, 2575 int push_ws) { 2576 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, 2577 push_ws); 2578 } 2579 2580 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) { 2581 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc); 2582 } 2583 2584 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) { 2585 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc); 2586 } 2587 2588 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) { 2589 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc); 2590 } 2591 2592 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) { 2593 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc); 2594 } 2595 2596 #endif /* KMP_GOMP_COMPAT */ 2597 2598 /* ------------------------------------------------------------------------ */ 2599
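/* Usage sketch (illustrative only): the spin-wait helpers above are designed
   to be paired with one of the 4-byte predicates, e.g. to block until a
   shared flag reaches an expected value while yielding under
   oversubscription:

     volatile kmp_uint32 flag = 0; // hypothetical flag set by another thread
     kmp_uint32 seen = __kmp_wait_yield_4(&flag, 1, __kmp_eq_4, NULL);
     // returns once __kmp_eq_4(flag, 1) holds; 'seen' is the observed value
*/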