/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

/* Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
 *       it may change values between parallel regions.  __kmp_max_nth
 *       is the largest value __kmp_nth may take, 1 is the smallest.
 */

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_USE_X87CONTROL
#include <float.h>
#endif
#include "kmp_lock.h"
#include "kmp_dispatch.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(gtid_ref);

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_root->r.r_active &&
        (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }
}

void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
      __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
    }
  }
}

// Initialize a dispatch_private_info_template<T> buffer for a particular
// type of schedule, chunk. The loop description is found in lb (lower bound),
// ub (upper bound), and st (stride). nproc is the number of threads relevant
// to the scheduling (often the number of threads in a team, but not always if
// hierarchical scheduling is used). tid is the id of the thread calling
// the function within the group of nproc threads. It will have a value
// between 0 and nproc - 1. This is often just the thread id within a team, but
// is not necessarily the case when using hierarchical scheduling.
// loc is the source file location of the corresponding loop
// gtid is the global thread id
template <typename T>
void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                   dispatch_private_info_template<T> *pr,
                                   enum sched_type schedule, T lb, T ub,
                                   typename traits_t<T>::signed_t st,
#if USE_ITT_BUILD
                                   kmp_uint64 *cur_chunk,
#endif
                                   typename traits_t<T>::signed_t chunk,
                                   T nproc, T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::floating_t DBL;

  int active;
  T tc;
  kmp_info_t *th;
  kmp_team_t *team;

#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
                            "pr:%%p lb:%%%s ub:%%%s st:%%%s "
                            "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<ST>::spec,
                            traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;

#if USE_ITT_BUILD
  int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
                                    __kmp_forkjoin_frames_mode == 3 &&
                                    KMP_MASTER_GTID(gtid) &&
#if OMP_40_ENABLED
                                    th->th.th_teams_microtask == NULL &&
#endif
                                    team->t.t_active_level == 1;
#endif
#if (KMP_STATIC_STEAL_ENABLED)
  if (SCHEDULE_HAS_NONMONOTONIC(schedule))
    // AC: we now have only one implementation of stealing, so use it
    schedule = kmp_sch_static_steal;
  else
#endif
    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

  /* Pick up the nomerge/ordered bits from the scheduling type */
  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
    pr->flags.nomerge = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
  } else {
    pr->flags.nomerge = FALSE;
  }
  pr->type_size = traits_t<T>::type_size; // remember the size of variables
  if (kmp_ord_lower & schedule) {
    pr->flags.ordered = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
  } else {
    pr->flags.ordered = FALSE;
  }

  if (schedule == kmp_sch_static) {
    schedule = __kmp_static;
  } else {
    if (schedule == kmp_sch_runtime) {
      // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
      // not specified)
      schedule = team->t.t_sched.r_sched_type;
      // Detail the schedule if needed (global controls are differentiated
      // appropriately)
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      } else if (schedule == kmp_sch_static) {
        schedule = __kmp_static;
      }
      // Use the chunk size specified by OMP_SCHEDULE (or default if not
      // specified)
      chunk = team->t.t_sched.chunk;
#if USE_ITT_BUILD
      if (cur_chunk)
        *cur_chunk = chunk;
#endif
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
                                "schedule:%%d chunk:%%%s\n",
                                traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    } else {
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      }
      if (chunk <= 0) {
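        // Note: a non-positive chunk reaching this point is treated as "no
        // usable chunk_size was supplied", so the library default is used.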
        chunk = KMP_DEFAULT_CHUNK;
      }
    }

    if (schedule == kmp_sch_auto) {
      // mapping and differentiation: in the __kmp_do_serial_initialize()
      schedule = __kmp_auto;
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
            "schedule:%%d chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }

    /* guided analytical not safe for too many threads */
    if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
      schedule = kmp_sch_guided_iterative_chunked;
      KMP_WARNING(DispatchManyThreads);
    }
#if OMP_45_ENABLED
    if (schedule == kmp_sch_runtime_simd) {
      // compiler provides simd_width in the chunk parameter
      schedule = team->t.t_sched.r_sched_type;
      // Detail the schedule if needed (global controls are differentiated
      // appropriately)
      if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
          schedule == __kmp_static) {
        schedule = kmp_sch_static_balanced_chunked;
      } else {
        if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
          schedule = kmp_sch_guided_simd;
        }
        chunk = team->t.t_sched.chunk * chunk;
      }
#if USE_ITT_BUILD
      if (cur_chunk)
        *cur_chunk = chunk;
#endif
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d"
                                " chunk:%%%s\n",
                                traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }
#endif // OMP_45_ENABLED
    pr->u.p.parm1 = chunk;
  }
  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
              "unknown scheduling type");

  pr->u.p.count = 0;

  if (__kmp_env_consistency_check) {
    if (st == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
                            (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
    }
  }
  // compute trip count
  if (st == 1) { // most common case
    if (ub >= lb) {
      tc = ub - lb + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  } else if (st < 0) {
    if (lb >= ub) {
      // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(lb - ub) / (-st) + 1;
    } else { // lb < ub
      tc = 0; // zero-trip
    }
  } else { // st > 0
    if (ub >= lb) {
      // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(ub - lb) / st + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  }

  pr->u.p.lb = lb;
  pr->u.p.ub = ub;
  pr->u.p.st = st;
  pr->u.p.tc = tc;

#if KMP_OS_WINDOWS
  pr->u.p.last_upper = ub + st;
#endif /* KMP_OS_WINDOWS */

  /* NOTE: only the active parallel region(s) has active ordered sections */

  if (active) {
    if (pr->flags.ordered) {
      pr->ordered_bumped = 0;
      pr->u.p.ordered_lower = 1;
      pr->u.p.ordered_upper = 0;
    }
  }

  switch (schedule) {
#if (KMP_STATIC_STEAL_ENABLED)
  case kmp_sch_static_steal: {
    T ntc, init;

    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    ntc = (tc % chunk ? 1 : 0) + tc / chunk;
    if (nproc > 1 && ntc >= nproc) {
      KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
      T id = tid;
      T small_chunk, extras;

      small_chunk = ntc / nproc;
      extras = ntc % nproc;

      init = id * small_chunk + (id < extras ? id : extras);
      pr->u.p.count = init;
      pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);

      pr->u.p.parm2 = lb;
      // pr->pfields.parm3 = 0; // it's not used in static_steal
      pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
      pr->u.p.st = st;
      if (traits_t<T>::type_size > 4) {
        // AC: TODO: check if 16-byte CAS available and use it to
        // improve performance (probably wait for explicit request
        // before spending time on this).
        // For now use dynamically allocated per-thread lock,
        // free memory in __kmp_dispatch_next when status==0.
        KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
        th->th.th_dispatch->th_steal_lock =
            (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
        __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
      }
      break;
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_balanced\n",
                     gtid));
      schedule = kmp_sch_static_balanced;
      /* too few iterations: fall-through to kmp_sch_static_balanced */
    } // if
    /* FALL-THROUGH to static balanced */
    KMP_FALLTHROUGH();
  } // case
#endif
  case kmp_sch_static_balanced: {
    T init, limit;

    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));

    if (nproc > 1) {
      T id = tid;

      if (tc < nproc) {
        if (id < tc) {
          init = id;
          limit = id;
          pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
        } else {
          pr->u.p.count = 1; /* means no more chunks to execute */
          pr->u.p.parm1 = FALSE;
          break;
        }
      } else {
        T small_chunk = tc / nproc;
        T extras = tc % nproc;
        init = id * small_chunk + (id < extras ? id : extras);
        limit = init + small_chunk - (id < extras ? 0 : 1);
        pr->u.p.parm1 = (id == nproc - 1);
      }
    } else {
      if (tc > 0) {
        init = 0;
        limit = tc - 1;
        pr->u.p.parm1 = TRUE;
      } else {
        // zero trip count
        pr->u.p.count = 1; /* means no more chunks to execute */
        pr->u.p.parm1 = FALSE;
        break;
      }
    }
#if USE_ITT_BUILD
    // Calculate chunk for metadata report
    if (itt_need_metadata_reporting)
      if (cur_chunk)
        *cur_chunk = limit - init + 1;
#endif
    if (st == 1) {
      pr->u.p.lb = lb + init;
      pr->u.p.ub = lb + limit;
    } else {
      // calculated upper bound, "ub" is user-defined upper bound
      T ub_tmp = lb + limit * st;
      pr->u.p.lb = lb + init * st;
      // adjust upper bound to "ub" if needed, so that MS lastprivate will match
      // it exactly
      if (st > 0) {
        pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
      } else {
        pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
      }
    }
    if (pr->flags.ordered) {
      pr->u.p.ordered_lower = init;
      pr->u.p.ordered_upper = limit;
    }
    break;
  } // case
#if OMP_45_ENABLED
  case kmp_sch_static_balanced_chunked: {
    // similar to balanced, but chunk adjusted to multiple of simd width
    T nth = nproc;
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
                   " -> falling-through to static_greedy\n",
                   gtid));
    schedule = kmp_sch_static_greedy;
    if (nth > 1)
      pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
    else
      pr->u.p.parm1 = tc;
    break;
  } // case
  case kmp_sch_guided_simd:
#endif // OMP_45_ENABLED
  case kmp_sch_guided_iterative_chunked: {
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
         " case\n",
         gtid));

    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
      } else {
        // when remaining iters become less than parm2 - switch to dynamic
        pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
        *(double *)&pr->u.p.parm3 =
            guided_flt_param / nproc; // may occupy parm3 and parm4
      }
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
      KD_TRACE(
          100,
          ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
           gtid));
      pr->u.p.parm1 = tc;
    } // if
  } // case
  break;
  case kmp_sch_guided_analytical_chunked: {
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
                   gtid));

    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
      } else {
        /* commonly used term: (2 nproc - 1)/(2 nproc) */
        DBL x;

#if KMP_USE_X87CONTROL
        /* Linux* OS already has 64-bit computation by default for long double,
           and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
           Windows* OS on IA-32 architecture, we need to set precision to 64-bit
           instead of the default 53-bit. Even though long double doesn't work
           on Windows* OS on Intel(R) 64, the resulting lack of precision is not
           expected to impact the correctness of the algorithm, but this has not
           been mathematically proven.
        */
        // save original FPCW and set precision to 64-bit, as
        // Windows* OS on IA-32 architecture defaults to 53-bit
        unsigned int oldFpcw = _control87(0, 0);
        _control87(_PC_64, _MCW_PC); // 0,0x30000
#endif
        /* value used for comparison in solver for cross-over point */
        long double target = ((long double)chunk * 2 + 1) * nproc / tc;

        /* crossover point--chunk indexes equal to or greater than
           this point switch to dynamic-style scheduling */
        UT cross;

        /* commonly used term: (2 nproc - 1)/(2 nproc) */
        x = (long double)1.0 - (long double)0.5 / nproc;

#ifdef KMP_DEBUG
        { // test natural alignment
          struct _test_a {
            char a;
            union {
              char b;
              DBL d;
            };
          } t;
          ptrdiff_t natural_alignment =
              (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
          //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
          // long)natural_alignment );
          KMP_DEBUG_ASSERT(
              (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
        }
#endif // KMP_DEBUG

        /* save the term in thread private dispatch structure */
        *(DBL *)&pr->u.p.parm3 = x;

        /* solve for the crossover point to the nearest integer i for which C_i
           <= chunk */
        {
          UT left, right, mid;
          long double p;

          /* estimate initial upper and lower bound */

          /* doesn't matter what value right is as long as it is positive, but
             it affects performance of the solver */
          right = 229;
          p = __kmp_pow<UT>(x, right);
          if (p > target) {
            do {
              p *= p;
              right <<= 1;
            } while (p > target && right < (1 << 27));
            /* lower bound is previous (failed) estimate of upper bound */
            left = right >> 1;
          } else {
            left = 0;
          }

          /* bisection root-finding method */
          while (left + 1 < right) {
            mid = (left + right) / 2;
            if (__kmp_pow<UT>(x, mid) > target) {
              left = mid;
            } else {
              right = mid;
            }
          } // while
          cross = right;
        }
        /* assert sanity of computed crossover point */
        KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
                   __kmp_pow<UT>(x, cross) <= target);

        /* save the crossover point in thread private dispatch structure */
        pr->u.p.parm2 = cross;

// C75803
#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
#else
#define GUIDED_ANALYTICAL_WORKAROUND (x)
#endif
        /* dynamic-style scheduling offset */
        pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
                                 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
                        cross * chunk;
#if KMP_USE_X87CONTROL
        // restore FPCW
        _control87(oldFpcw, _MCW_PC);
#endif
      } // if
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
      pr->u.p.parm1 = tc;
    } // if
  } // case
  break;
  case kmp_sch_static_greedy:
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
         gtid));
    pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
    break;
  case kmp_sch_static_chunked:
  case kmp_sch_dynamic_chunked:
    if (pr->u.p.parm1 <= 0) {
      pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
    }
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
                   gtid));
    break;
  case kmp_sch_trapezoidal: {
    /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */

    T parm1, parm2, parm3, parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
              gtid));

    parm1 = chunk;

    /* F : size of the first cycle */
    parm2 = (tc / (2 * nproc));

    if (parm2 < 1) {
      parm2 = 1;
    }

    /* L : size of the last cycle.  Make sure the last cycle is not larger
       than the first cycle. */
    if (parm1 < 1) {
      parm1 = 1;
    } else if (parm1 > parm2) {
      parm1 = parm2;
    }

    /* N : number of cycles */
    parm3 = (parm2 + parm1);
    parm3 = (2 * tc + parm3 - 1) / parm3;

    if (parm3 < 2) {
      parm3 = 2;
    }

    /* sigma : decreasing incr of the trapezoid */
    parm4 = (parm3 - 1);
    parm4 = (parm2 - parm1) / parm4;

    // pointless check, because parm4 >= 0 always
    // if ( parm4 < 0 ) {
    //   parm4 = 0;
    //}

    pr->u.p.parm1 = parm1;
    pr->u.p.parm2 = parm2;
    pr->u.p.parm3 = parm3;
    pr->u.p.parm4 = parm4;
  } // case
  break;

  default: {
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                KMP_HNT(GetNewerLibrary), // Hint
                __kmp_msg_null // Variadic argument list terminator
    );
  } break;
  } // switch
  pr->schedule = schedule;
}

#if KMP_USE_HIER_SCHED
template <typename T>
inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
                                             typename traits_t<T>::signed_t st);
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
                                            kmp_int32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_int32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
                                             kmp_uint32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
                                            kmp_int64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_int64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
                                             kmp_uint64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}

// free all the hierarchy scheduling memory associated with the team
void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
  for (int i = 0; i < num_disp_buff; ++i) {
    // type does not matter here so use kmp_int32
    auto sh =
        reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
            &team->t.t_disp_buffer[i]);
    if (sh->hier) {
      sh->hier->deallocate();
      __kmp_free(sh->hier);
    }
  }
}
#endif

// UT - unsigned flavor of T, ST - signed flavor of T,
// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
template <typename T>
static void
__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
                    T ub, typename traits_t<T>::signed_t st,
                    typename traits_t<T>::signed_t chunk, int push_ws) {
  typedef typename traits_t<T>::unsigned_t UT;

  int active;
  kmp_info_t *th;
  kmp_team_t *team;
  kmp_uint32 my_buffer_index;
  dispatch_private_info_template<T> *pr;
  dispatch_shared_info_template<T> volatile *sh;

  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
                   sizeof(dispatch_private_info));
  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
                   sizeof(dispatch_shared_info));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

#if OMP_50_ENABLED
  __kmp_resume_if_soft_paused();
#endif

#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_INIT();
#endif
#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
                            "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
                            traits_t<ST>::spec, traits_t<T>::spec,
                            traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;
  th->th.th_ident = loc;

  // Any half-decent optimizer will remove this test when the blocks are empty
  // since the macros expand to nothing
  // when statistics are disabled.
  if (schedule == __kmp_static) {
    KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
  } else {
    KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
  }

#if KMP_USE_HIER_SCHED
  // Initialize the scheduling hierarchy if requested in the OMP_SCHEDULE
  // environment variable. Hierarchical scheduling does not work with ordered,
  // so if ordered is detected, then revert back to threaded scheduling.
  bool ordered;
  enum sched_type my_sched = schedule;
  my_buffer_index = th->th.th_dispatch->th_disp_index;
  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
      &th->th.th_dispatch
           ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
  if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
    my_sched =
        (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
  ordered = (kmp_ord_lower & my_sched);
  if (pr->flags.use_hier) {
    if (ordered) {
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
                     "Disabling hierarchical scheduling.\n",
                     gtid));
      pr->flags.use_hier = FALSE;
    }
  }
  if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
    // Don't use hierarchical for ordered parallel loops and don't
    // use the runtime hierarchy if one was specified in the program
    if (!ordered && !pr->flags.use_hier)
      __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
  }
#endif // KMP_USE_HIER_SCHED

#if USE_ITT_BUILD
  kmp_uint64 cur_chunk = chunk;
  int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
                                    __kmp_forkjoin_frames_mode == 3 &&
                                    KMP_MASTER_GTID(gtid) &&
#if OMP_40_ENABLED
                                    th->th.th_teams_microtask == NULL &&
#endif
                                    team->t.t_active_level == 1;
#endif
  if (!active) {
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
  } else {
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    my_buffer_index = th->th.th_dispatch->th_disp_index++;

    /* What happens when number of threads changes, need to resize buffer? */
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        &th->th.th_dispatch
             ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
                  my_buffer_index));
  }

  __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
#if USE_ITT_BUILD
                                &cur_chunk,
#endif
                                chunk, (T)th->th.th_team_nproc,
                                (T)th->th.th_info.ds.ds_tid);
  if (active) {
    if (pr->flags.ordered == 0) {
      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
    } else {
      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
    }
  }

  if (active) {
    /* The name of this buffer should be my_buffer_index when it's free to use
     * it */

    KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));
    __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
                           __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    // Note: KMP_WAIT() cannot be used there: buffer index and
    // my_buffer_index are *always* 32-bit integers.
    KMP_MB(); /* is this necessary?
              */
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));

    th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
    th->th.th_dispatch->th_dispatch_sh_current =
        CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
#if USE_ITT_BUILD
    if (pr->flags.ordered) {
      __kmp_itt_ordered_init(gtid);
    }
    // Report loop metadata
    if (itt_need_metadata_reporting) {
      // Only report metadata by master of active team at level 1
      kmp_uint64 schedtype = 0;
      switch (schedule) {
      case kmp_sch_static_chunked:
      case kmp_sch_static_balanced: // Chunk is calculated in the switch above
        break;
      case kmp_sch_static_greedy:
        cur_chunk = pr->u.p.parm1;
        break;
      case kmp_sch_dynamic_chunked:
        schedtype = 1;
        break;
      case kmp_sch_guided_iterative_chunked:
      case kmp_sch_guided_analytical_chunked:
#if OMP_45_ENABLED
      case kmp_sch_guided_simd:
#endif
        schedtype = 2;
        break;
      default:
        // Should we put this case under "static"?
        // case kmp_sch_static_steal:
        schedtype = 3;
        break;
      }
      __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
    }
#if KMP_USE_HIER_SCHED
    if (pr->flags.use_hier) {
      pr->u.p.count = 0;
      pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
    }
#endif // KMP_USE_HIER_SCHED
#endif /* USE_ITT_BUILD */
  }

#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
        "lb:%%%s ub:%%%s"
        " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
        " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
        traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
        traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
        traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
        traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
                  pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
                  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
                  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
    __kmp_str_free(&buff);
  }
#endif
#if (KMP_STATIC_STEAL_ENABLED)
  // It cannot be guaranteed that after execution of a loop with some other
  // schedule kind all the parm3 variables will contain the same value. Even if
  // all parm3 values were the same, there still exists a bad case like using
  // 0 and 1 rather than a program life-time increment. So a dedicated variable
  // is required. The 'static_steal_counter' is used.
  if (schedule == kmp_sch_static_steal) {
    // Other threads will inspect this variable when searching for a victim.
    // This is a flag showing that other threads may steal from this thread
    // since then.
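    // Each static_steal loop started by this thread bumps the counter below,
    // so a prospective thief can check (in __kmp_dispatch_next_algorithm)
    // that its own counter matches the victim's before stealing, i.e. that
    // both private buffers describe the same loop instance.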
    volatile T *p = &pr->u.p.static_steal_counter;
    *p = *p + 1;
  }
#endif // ( KMP_STATIC_STEAL_ENABLED )

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
        &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
  }
#endif
  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
}

/* For ordered loops, either __kmp_dispatch_finish() should be called after
 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
 * every chunk of iterations.  If the ordered section(s) were not executed
 * for this iteration (or every iteration in this chunk), we need to set the
 * ordered iteration counters so that the next thread can proceed. */
template <typename UT>
static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {

    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    if (pr->ordered_bumped) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      UT lower = pr->u.p.ordered_lower;

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
      KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
    } // if
  } // if
  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
}

#ifdef KMP_GOMP_COMPAT

template <typename UT>
static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {
    // int cid;
    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
    UT lower = pr->u.p.ordered_lower;
    UT upper = pr->u.p.ordered_upper;
    UT inc = upper - lower + 1;

    if (pr->ordered_bumped == inc) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      inc -= pr->ordered_bumped;

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d before wait: "
            "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));

      KMP_MB(); /* is this necessary? */
      KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
                      "ordered_bumped to zero\n",
                      gtid));
      pr->ordered_bumped = 0;
      //!!!!! TODO check if the inc should be unsigned, or signed???
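      // Bookkeeping note: a chunk covering ordered iterations [lower, upper]
      // owes upper - lower + 1 increments of sh->u.s.ordered_iteration;
      // ordered_bumped of them were already performed by ordered sections that
      // did execute, so only the remaining 'inc' is added below once
      // ordered_iteration has reached 'lower'.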
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d after wait: "
            "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
            traits_t<UT>::spec);
        KD_TRACE(1000,
                 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
    }
    // }
  }
  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
}

#endif /* KMP_GOMP_COMPAT */

template <typename T>
int __kmp_dispatch_next_algorithm(int gtid,
                                  dispatch_private_info_template<T> *pr,
                                  dispatch_shared_info_template<T> volatile *sh,
                                  kmp_int32 *p_last, T *p_lb, T *p_ub,
                                  typename traits_t<T>::signed_t *p_st, T nproc,
                                  T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::floating_t DBL;
  int status = 0;
  kmp_int32 last = 0;
  T start;
  ST incr;
  UT limit, trip, init;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
  KMP_DEBUG_ASSERT(pr);
  KMP_DEBUG_ASSERT(sh);
  KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff =
        __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
                         "sh:%%p nproc:%%%s tid:%%%s\n",
                         traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
    __kmp_str_free(&buff);
  }
#endif

  // zero trip count
  if (pr->u.p.tc == 0) {
    KD_TRACE(10,
             ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
              "zero status:%d\n",
              gtid, status));
    return 0;
  }

  switch (pr->schedule) {
#if (KMP_STATIC_STEAL_ENABLED)
  case kmp_sch_static_steal: {
    T chunk = pr->u.p.parm1;

    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    trip = pr->u.p.tc - 1;

    if (traits_t<T>::type_size > 4) {
      // use lock for 8-byte and CAS for 4-byte induction
      // variable. TODO (optional): check and use 16-byte CAS
      kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
      KMP_DEBUG_ASSERT(lck != NULL);
      if (pr->u.p.count < (UT)pr->u.p.ub) {
        __kmp_acquire_lock(lck, gtid);
        // try to get own chunk of iterations
        init = (pr->u.p.count)++;
        status = (init < (UT)pr->u.p.ub);
        __kmp_release_lock(lck, gtid);
      } else {
        status = 0; // no own chunks
      }
      if (!status) { // try to steal
        kmp_info_t **other_threads = team->t.t_threads;
        int while_limit = nproc; // nproc attempts to find a victim
        int while_index = 0;
        // TODO: algorithm of searching for a victim
        // should be cleaned up and measured
        while ((!status) && (while_limit != ++while_index)) {
          T remaining;
          T victimIdx = pr->u.p.parm4;
          T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
          dispatch_private_info_template<T> *victim =
              reinterpret_cast<dispatch_private_info_template<T> *>(
                  other_threads[victimIdx]
                      ->th.th_dispatch->th_dispatch_pr_current);
          while ((victim == NULL || victim == pr ||
                  (*(volatile T *)&victim->u.p.static_steal_counter !=
                   *(volatile T *)&pr->u.p.static_steal_counter)) &&
                 oldVictimIdx != victimIdx) {
            victimIdx = (victimIdx + 1) % nproc;
            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
                other_threads[victimIdx]
                    ->th.th_dispatch->th_dispatch_pr_current);
          }
          if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
                          *(volatile T *)&pr->u.p.static_steal_counter)) {
            continue; // try once more (nproc attempts in total)
            // no victim is ready yet to participate in stealing
            // because all victims are still in kmp_init_dispatch
          }
          if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
            pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
            continue; // not enough chunks to steal, goto next victim
          }

          lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
          KMP_ASSERT(lck != NULL);
          __kmp_acquire_lock(lck, gtid);
          limit = victim->u.p.ub; // keep initial ub
          if (victim->u.p.count >= limit ||
              (remaining = limit - victim->u.p.count) < 2) {
            __kmp_release_lock(lck, gtid);
            pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
            continue; // not enough chunks to steal
          }
          // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or
          // by 1
          if (remaining > 3) {
            // steal 1/4 of remaining
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
            init = (victim->u.p.ub -= (remaining >> 2));
          } else {
            // steal 1 chunk of 2 or 3 remaining
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
            init = (victim->u.p.ub -= 1);
          }
          __kmp_release_lock(lck, gtid);

          KMP_DEBUG_ASSERT(init + 1 <= limit);
          pr->u.p.parm4 = victimIdx; // remember victim to steal from
          status = 1;
          while_index = 0;
          // now update own count and ub with the stolen range, excluding the
          // init chunk
          __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
          pr->u.p.count = init + 1;
          pr->u.p.ub = limit;
          __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
        } // while (search for victim)
      } // if (try to find victim and steal)
    } else {
      // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
      typedef union {
        struct {
          UT count;
          T ub;
        } p;
        kmp_int64 b;
      } union_i4;
      // All operations on 'count' or 'ub' must be combined atomically
      // together.
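      // Packing example: with count == 3 and ub == 10 the pair occupies one
      // kmp_int64, so claiming the next chunk is a single 8-byte CAS of
      // {count:3, ub:10} -> {count:4, ub:10}; a thief shrinking ub races on
      // the same word, so either update wins atomically and the pair is never
      // seen half-updated.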
      {
        union_i4 vold, vnew;
        vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
        vnew = vold;
        vnew.p.count++;
        while (!KMP_COMPARE_AND_STORE_ACQ64(
            (volatile kmp_int64 *)&pr->u.p.count,
            *VOLATILE_CAST(kmp_int64 *) & vold.b,
            *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
          KMP_CPU_PAUSE();
          vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
          vnew = vold;
          vnew.p.count++;
        }
        vnew = vold;
        init = vnew.p.count;
        status = (init < (UT)vnew.p.ub);
      }

      if (!status) {
        kmp_info_t **other_threads = team->t.t_threads;
        int while_limit = nproc; // nproc attempts to find a victim
        int while_index = 0;

        // TODO: algorithm of searching for a victim
        // should be cleaned up and measured
        while ((!status) && (while_limit != ++while_index)) {
          union_i4 vold, vnew;
          kmp_int32 remaining;
          T victimIdx = pr->u.p.parm4;
          T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
          dispatch_private_info_template<T> *victim =
              reinterpret_cast<dispatch_private_info_template<T> *>(
                  other_threads[victimIdx]
                      ->th.th_dispatch->th_dispatch_pr_current);
          while ((victim == NULL || victim == pr ||
                  (*(volatile T *)&victim->u.p.static_steal_counter !=
                   *(volatile T *)&pr->u.p.static_steal_counter)) &&
                 oldVictimIdx != victimIdx) {
            victimIdx = (victimIdx + 1) % nproc;
            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
                other_threads[victimIdx]
                    ->th.th_dispatch->th_dispatch_pr_current);
          }
          if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
                          *(volatile T *)&pr->u.p.static_steal_counter)) {
            continue; // try once more (nproc attempts in total)
            // no victim is ready yet to participate in stealing
            // because all victims are still in kmp_init_dispatch
          }
          pr->u.p.parm4 = victimIdx; // new victim found
          while (1) { // CAS loop if victim has enough chunks to steal
            vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
            vnew = vold;

            KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
            if (vnew.p.count >= (UT)vnew.p.ub ||
                (remaining = vnew.p.ub - vnew.p.count) < 2) {
              pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
              break; // not enough chunks to steal, goto next victim
            }
            if (remaining > 3) {
              vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
            } else {
              vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
            }
            KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
            // TODO: Should this be acquire or release?
            if (KMP_COMPARE_AND_STORE_ACQ64(
                    (volatile kmp_int64 *)&victim->u.p.count,
                    *VOLATILE_CAST(kmp_int64 *) & vold.b,
                    *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
              // stealing succeeded
              KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
                                        vold.p.ub - vnew.p.ub);
              status = 1;
              while_index = 0;
              // now update own count and ub
              init = vnew.p.ub;
              vold.p.count = init + 1;
#if KMP_ARCH_X86
              KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
#else
              *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
#endif
              break;
            } // if (check CAS result)
            KMP_CPU_PAUSE(); // CAS failed, repeat the attempt
          } // while (try to steal from particular victim)
        } // while (search for victim)
      } // if (try to find victim and steal)
    } // if (4-byte induction variable)
    if (!status) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.parm2;
      init *= chunk;
      limit = chunk + init - 1;
      incr = pr->u.p.st;
      KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);

      KMP_DEBUG_ASSERT(init <= trip);
      if ((last = (limit >= trip)) != 0)
        limit = trip;
      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
    break;
  } // case
#endif // ( KMP_STATIC_STEAL_ENABLED )
  case kmp_sch_static_balanced: {
    KD_TRACE(
        10,
        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));
    /* check if thread has any iteration to do */
    if ((status = !pr->u.p.count) != 0) {
      pr->u.p.count = 1;
      *p_lb = pr->u.p.lb;
      *p_ub = pr->u.p.ub;
      last = pr->u.p.parm1;
      if (p_st != NULL)
        *p_st = pr->u.p.st;
    } else { /* no iterations to do */
      pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
    }
  } // case
  break;
  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
                                 merged here */
  case kmp_sch_static_chunked: {
    T parm1;

    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_static_[affinity|chunked] case\n",
                   gtid));
    parm1 = pr->u.p.parm1;

    trip = pr->u.p.tc - 1;
    init = parm1 * (pr->u.p.count + tid);

    if ((status = (init <= trip)) != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      limit = parm1 + init - 1;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      pr->u.p.count += nproc;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
  } // case
  break;

  case kmp_sch_dynamic_chunked: {
    T chunk = pr->u.p.parm1;

    KD_TRACE(
        100,
        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
         gtid));

    init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
    trip = pr->u.p.tc - 1;

    if ((status = (init <= trip)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.lb;
      limit = chunk + init - 1;
      incr = pr->u.p.st;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
  } // case
  break;

  case kmp_sch_guided_iterative_chunked: {
    T chunkspec = pr->u.p.parm1;
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
                   "iterative case\n",
                   gtid));
    trip = pr->u.p.tc;
    // Start atomic part of calculations
    while (1) {
      ST remaining; // signed, because can be < 0
      init = sh->u.s.iteration; // shared value
      remaining = trip - init;
      if (remaining <= 0) { // AC: need to compare with 0 first
        // nothing to do, don't try atomic op
        status = 0;
        break;
      }
      if ((T)remaining <
          pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
        // use dynamic-style schedule
        // atomically increment iterations, get old value
        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                 (ST)chunkspec);
        remaining = trip - init;
        if (remaining <= 0) {
          status = 0; // all iterations got by other threads
        } else {
          // got some iterations to work on
          status = 1;
          if ((T)remaining > chunkspec) {
            limit = init + chunkspec - 1;
          } else {
            last = 1; // the last chunk
            limit = init + remaining - 1;
          } // if
        } // if
        break;
      } // if
      limit = init +
              (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                               (ST)init, (ST)limit)) {
        // CAS was successful, chunk obtained
        status = 1;
        --limit;
        break;
      } // if
    } // while
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } // if
  } // case
  break;

#if OMP_45_ENABLED
  case kmp_sch_guided_simd: {
    // same as iterative but curr-chunk adjusted to be multiple of given
    // chunk
    T chunk = pr->u.p.parm1;
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
              gtid));
    trip = pr->u.p.tc;
    // Start atomic part of calculations
    while (1) {
      ST remaining; // signed, because can be < 0
      init = sh->u.s.iteration; // shared value
      remaining = trip - init;
      if (remaining <= 0) { // AC: need to compare with 0 first
        status = 0; // nothing to do, don't try atomic op
        break;
      }
      KMP_DEBUG_ASSERT(init % chunk == 0);
      // compare with K*nproc*(chunk+1), K=2 by default
      if ((T)remaining < pr->u.p.parm2) {
        // use dynamic-style schedule
        // atomically increment iterations, get old value
        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                 (ST)chunk);
        remaining = trip - init;
        if (remaining <= 0) {
          status = 0; // all iterations got by other threads
        } else {
          // got some iterations to work on
          status = 1;
          if ((T)remaining > chunk) {
            limit = init + chunk - 1;
          } else {
            last = 1; // the last chunk
            limit = init + remaining - 1;
          } // if
        } // if
        break;
      } // if
      // divide by K*nproc
      UT span = remaining * (*(double *)&pr->u.p.parm3);
      UT rem = span % chunk;
      if (rem) // adjust so that span%chunk == 0
        span += chunk - rem;
      limit = init + span;
      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                               (ST)init, (ST)limit)) {
        // CAS was successful, chunk obtained
        status = 1;
        --limit;
        break;
      } // if
    } // while
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } // if
  } // case
  break;
#endif // OMP_45_ENABLED

  case kmp_sch_guided_analytical_chunked: {
    T chunkspec = pr->u.p.parm1;
    UT chunkIdx;
#if KMP_USE_X87CONTROL
    /* for storing original FPCW value for Windows* OS on
       IA-32 architecture 8-byte version */
    unsigned int oldFpcw;
    unsigned int fpcwSet = 0;
#endif
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
                   gtid));

    trip = pr->u.p.tc;

    KMP_DEBUG_ASSERT(nproc > 1);
    KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);

    while (1) { /* this while loop is a safeguard against unexpected zero
                   chunk sizes */
      chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
      if (chunkIdx >= (UT)pr->u.p.parm2) {
        --trip;
        /* use dynamic-style scheduling */
        init = chunkIdx * chunkspec + pr->u.p.count;
        /* need to verify init > 0 in case of overflow in the above
         * calculation */
        if ((status = (init > 0 && init <= trip)) != 0) {
          limit = init + chunkspec - 1;

          if ((last = (limit >= trip)) != 0)
            limit = trip;
        }
        break;
      } else {
        /* use exponential-style scheduling */
        /* The following check is to work around the lack of long double
           precision on Windows* OS.
           This check works around the possible effect that init != 0 for
           chunkIdx == 0.
1668 */ 1669 #if KMP_USE_X87CONTROL 1670 /* If we haven't already done so, save original 1671 FPCW and set precision to 64-bit, as Windows* OS 1672 on IA-32 architecture defaults to 53-bit */ 1673 if (!fpcwSet) { 1674 oldFpcw = _control87(0, 0); 1675 _control87(_PC_64, _MCW_PC); 1676 fpcwSet = 0x30000; 1677 } 1678 #endif 1679 if (chunkIdx) { 1680 init = __kmp_dispatch_guided_remaining<T>( 1681 trip, *(DBL *)&pr->u.p.parm3, chunkIdx); 1682 KMP_DEBUG_ASSERT(init); 1683 init = trip - init; 1684 } else 1685 init = 0; 1686 limit = trip - __kmp_dispatch_guided_remaining<T>( 1687 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1); 1688 KMP_ASSERT(init <= limit); 1689 if (init < limit) { 1690 KMP_DEBUG_ASSERT(limit <= trip); 1691 --limit; 1692 status = 1; 1693 break; 1694 } // if 1695 } // if 1696 } // while (1) 1697 #if KMP_USE_X87CONTROL 1698 /* restore FPCW if necessary 1699 AC: check fpcwSet flag first because oldFpcw can be uninitialized here 1700 */ 1701 if (fpcwSet && (oldFpcw & fpcwSet)) 1702 _control87(oldFpcw, _MCW_PC); 1703 #endif 1704 if (status != 0) { 1705 start = pr->u.p.lb; 1706 incr = pr->u.p.st; 1707 if (p_st != NULL) 1708 *p_st = incr; 1709 *p_lb = start + init * incr; 1710 *p_ub = start + limit * incr; 1711 if (pr->flags.ordered) { 1712 pr->u.p.ordered_lower = init; 1713 pr->u.p.ordered_upper = limit; 1714 } 1715 } else { 1716 *p_lb = 0; 1717 *p_ub = 0; 1718 if (p_st != NULL) 1719 *p_st = 0; 1720 } 1721 } // case 1722 break; 1723 1724 case kmp_sch_trapezoidal: { 1725 UT index; 1726 T parm2 = pr->u.p.parm2; 1727 T parm3 = pr->u.p.parm3; 1728 T parm4 = pr->u.p.parm4; 1729 KD_TRACE(100, 1730 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n", 1731 gtid)); 1732 1733 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration); 1734 1735 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2; 1736 trip = pr->u.p.tc - 1; 1737 1738 if ((status = ((T)index < parm3 && init <= trip)) == 0) { 1739 *p_lb = 0; 1740 *p_ub = 0; 1741 if (p_st != NULL) 1742 *p_st = 0; 1743 } else { 1744 start = pr->u.p.lb; 1745 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1; 1746 incr = pr->u.p.st; 1747 1748 if ((last = (limit >= trip)) != 0) 1749 limit = trip; 1750 1751 if (p_st != NULL) 1752 *p_st = incr; 1753 1754 if (incr == 1) { 1755 *p_lb = start + init; 1756 *p_ub = start + limit; 1757 } else { 1758 *p_lb = start + init * incr; 1759 *p_ub = start + limit * incr; 1760 } 1761 1762 if (pr->flags.ordered) { 1763 pr->u.p.ordered_lower = init; 1764 pr->u.p.ordered_upper = limit; 1765 } // if 1766 } // if 1767 } // case 1768 break; 1769 default: { 1770 status = 0; // to avoid complaints on uninitialized variable use 1771 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 1772 KMP_HNT(GetNewerLibrary), // Hint 1773 __kmp_msg_null // Variadic argument list terminator 1774 ); 1775 } break; 1776 } // switch 1777 if (p_last) 1778 *p_last = last; 1779 #ifdef KMP_DEBUG 1780 if (pr->flags.ordered) { 1781 char *buff; 1782 // create format specifiers before the debug output 1783 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d " 1784 "ordered_lower:%%%s ordered_upper:%%%s\n", 1785 traits_t<UT>::spec, traits_t<UT>::spec); 1786 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper)); 1787 __kmp_str_free(&buff); 1788 } 1789 { 1790 char *buff; 1791 // create format specifiers before the debug output 1792 buff = __kmp_str_format( 1793 "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d " 1794 "p_lb:%%%s p_ub:%%%s p_st:%%%s\n", 1795 
        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
    __kmp_str_free(&buff);
  }
#endif
  return status;
}

/* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
   work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
   is not called. */
#if OMPT_SUPPORT && OMPT_OPTIONAL
#define OMPT_LOOP_END                                                          \
  if (status == 0) {                                                           \
    if (ompt_enabled.ompt_callback_work) {                                     \
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
      ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
          ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
          &(task_info->task_data), 0, codeptr);                                \
    }                                                                          \
  }
// TODO: implement count
#else
#define OMPT_LOOP_END // no-op
#endif

#if KMP_STATS_ENABLED
#define KMP_STATS_LOOP_END                                                     \
  {                                                                            \
    kmp_int64 u, l, t, i;                                                      \
    l = (kmp_int64)(*p_lb);                                                    \
    u = (kmp_int64)(*p_ub);                                                    \
    i = (kmp_int64)(pr->u.p.st);                                               \
    if (status == 0) {                                                         \
      t = 0;                                                                   \
      KMP_POP_PARTITIONED_TIMER();                                             \
    } else if (i == 1) {                                                       \
      if (u >= l)                                                              \
        t = u - l + 1;                                                         \
      else                                                                     \
        t = 0;                                                                 \
    } else if (i < 0) {                                                        \
      if (l >= u)                                                              \
        t = (l - u) / (-i) + 1;                                                \
      else                                                                     \
        t = 0;                                                                 \
    } else {                                                                   \
      if (u >= l)                                                              \
        t = (u - l) / i + 1;                                                   \
      else                                                                     \
        t = 0;                                                                 \
    }                                                                          \
    KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
  }
#else
#define KMP_STATS_LOOP_END /* Nothing */
#endif

template <typename T>
static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
                               T *p_lb, T *p_ub,
                               typename traits_t<T>::signed_t *p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                               ,
                               void *codeptr
#endif
                               ) {

  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  // This is potentially slightly misleading, schedule(runtime) will appear
  // here even if the actual runtime schedule is static. (Which points out a
  // disadvantage of schedule(runtime): even when static scheduling is used it
  // costs more than a compile time choice to use static scheduling would.)
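  // For example (illustrative only): with OMP_SCHEDULE="static" a loop coded
  // as schedule(runtime) still issues one __kmpc_dispatch_next_* call per
  // chunk through this routine, whereas schedule(static) resolved at compile
  // time is lowered to a single __kmpc_for_static_init_* call for the loop.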
  KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);

  int status;
  dispatch_private_info_template<T> *pr;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
  KD_TRACE(
      1000,
      ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
       gtid, p_lb, p_ub, p_st, p_last));

  if (team->t.t_serialized) {
    /* NOTE: serialize this dispatch because we are not at the active level */
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
    KMP_DEBUG_ASSERT(pr);

    if ((status = (pr->u.p.tc != 0)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      // if ( p_last != NULL )
      //   *p_last = 0;
      if (p_st != NULL)
        *p_st = 0;
      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        }
      }
    } else if (pr->flags.nomerge) {
      kmp_int32 last;
      T start;
      UT limit, trip, init;
      ST incr;
      T chunk = pr->u.p.parm1;

      KD_TRACE(100,
               ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
                gtid));

      init = chunk * pr->u.p.count++;
      trip = pr->u.p.tc - 1;

      if ((status = (init <= trip)) == 0) {
        *p_lb = 0;
        *p_ub = 0;
        // if ( p_last != NULL )
        //   *p_last = 0;
        if (p_st != NULL)
          *p_st = 0;
        if (__kmp_env_consistency_check) {
          if (pr->pushed_ws != ct_none) {
            pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
          }
        }
      } else {
        start = pr->u.p.lb;
        limit = chunk + init - 1;
        incr = pr->u.p.st;

        if ((last = (limit >= trip)) != 0) {
          limit = trip;
#if KMP_OS_WINDOWS
          pr->u.p.last_upper = pr->u.p.ub;
#endif /* KMP_OS_WINDOWS */
        }
        if (p_last != NULL)
          *p_last = last;
        if (p_st != NULL)
          *p_st = incr;
        if (incr == 1) {
          *p_lb = start + init;
          *p_ub = start + limit;
        } else {
          *p_lb = start + init * incr;
          *p_ub = start + limit * incr;
        }

        if (pr->flags.ordered) {
          pr->u.p.ordered_lower = init;
          pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
          {
            char *buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
                                    "ordered_lower:%%%s ordered_upper:%%%s\n",
                                    traits_t<UT>::spec, traits_t<UT>::spec);
            KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
                            pr->u.p.ordered_upper));
            __kmp_str_free(&buff);
          }
#endif
        } // if
      } // if
    } else {
      pr->u.p.tc = 0;
      *p_lb = pr->u.p.lb;
      *p_ub = pr->u.p.ub;
#if KMP_OS_WINDOWS
      pr->u.p.last_upper = *p_ub;
#endif /* KMP_OS_WINDOWS */
      if (p_last != NULL)
        *p_last = TRUE;
      if (p_st != NULL)
        *p_st = pr->u.p.st;
    } // if
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format(
          "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
          "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
          traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
      KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
      __kmp_str_free(&buff);
    }
#endif
#if INCLUDE_SSC_MARKS
    SSC_MARK_DISPATCH_NEXT();
#endif
    OMPT_LOOP_END;
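    // Worked example for the KMP_STATS_LOOP_END macro invoked below
    // (illustrative numbers only): if the chunk just handed out is
    // *p_lb == 12, *p_ub == 20 with stride pr->u.p.st == 4, the macro counts
    // t = (u - l) / i + 1 = (20 - 12) / 4 + 1 = 3 iterations and records them
    // via KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t). A negative stride
    // uses (l - u) / (-i) + 1 instead, and status == 0 (no more work) counts
    // as 0 iterations.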
KMP_STATS_LOOP_END; 1995 return status; 1996 } else { 1997 kmp_int32 last = 0; 1998 dispatch_shared_info_template<T> volatile *sh; 1999 2000 KMP_DEBUG_ASSERT(th->th.th_dispatch == 2001 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 2002 2003 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 2004 th->th.th_dispatch->th_dispatch_pr_current); 2005 KMP_DEBUG_ASSERT(pr); 2006 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 2007 th->th.th_dispatch->th_dispatch_sh_current); 2008 KMP_DEBUG_ASSERT(sh); 2009 2010 #if KMP_USE_HIER_SCHED 2011 if (pr->flags.use_hier) 2012 status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st); 2013 else 2014 #endif // KMP_USE_HIER_SCHED 2015 status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub, 2016 p_st, th->th.th_team_nproc, 2017 th->th.th_info.ds.ds_tid); 2018 // status == 0: no more iterations to execute 2019 if (status == 0) { 2020 UT num_done; 2021 2022 num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done); 2023 #ifdef KMP_DEBUG 2024 { 2025 char *buff; 2026 // create format specifiers before the debug output 2027 buff = __kmp_str_format( 2028 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", 2029 traits_t<UT>::spec); 2030 KD_TRACE(10, (buff, gtid, sh->u.s.num_done)); 2031 __kmp_str_free(&buff); 2032 } 2033 #endif 2034 2035 #if KMP_USE_HIER_SCHED 2036 pr->flags.use_hier = FALSE; 2037 #endif 2038 if ((ST)num_done == th->th.th_team_nproc - 1) { 2039 #if (KMP_STATIC_STEAL_ENABLED) 2040 if (pr->schedule == kmp_sch_static_steal && 2041 traits_t<T>::type_size > 4) { 2042 int i; 2043 kmp_info_t **other_threads = team->t.t_threads; 2044 // loop complete, safe to destroy locks used for stealing 2045 for (i = 0; i < th->th.th_team_nproc; ++i) { 2046 kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock; 2047 KMP_ASSERT(lck != NULL); 2048 __kmp_destroy_lock(lck); 2049 __kmp_free(lck); 2050 other_threads[i]->th.th_dispatch->th_steal_lock = NULL; 2051 } 2052 } 2053 #endif 2054 /* NOTE: release this buffer to be reused */ 2055 2056 KMP_MB(); /* Flush all pending memory write invalidates. */ 2057 2058 sh->u.s.num_done = 0; 2059 sh->u.s.iteration = 0; 2060 2061 /* TODO replace with general release procedure? */ 2062 if (pr->flags.ordered) { 2063 sh->u.s.ordered_iteration = 0; 2064 } 2065 2066 KMP_MB(); /* Flush all pending memory write invalidates. */ 2067 2068 sh->buffer_index += __kmp_dispatch_num_buffers; 2069 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", 2070 gtid, sh->buffer_index)); 2071 2072 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 2073 2074 } // if 2075 if (__kmp_env_consistency_check) { 2076 if (pr->pushed_ws != ct_none) { 2077 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 2078 } 2079 } 2080 2081 th->th.th_dispatch->th_deo_fcn = NULL; 2082 th->th.th_dispatch->th_dxo_fcn = NULL; 2083 th->th.th_dispatch->th_dispatch_sh_current = NULL; 2084 th->th.th_dispatch->th_dispatch_pr_current = NULL; 2085 } // if (status == 0) 2086 #if KMP_OS_WINDOWS 2087 else if (last) { 2088 pr->u.p.last_upper = pr->u.p.ub; 2089 } 2090 #endif /* KMP_OS_WINDOWS */ 2091 if (p_last != NULL && status != 0) 2092 *p_last = last; 2093 } // if 2094 2095 #ifdef KMP_DEBUG 2096 { 2097 char *buff; 2098 // create format specifiers before the debug output 2099 buff = __kmp_str_format( 2100 "__kmp_dispatch_next: T#%%d normal case: " 2101 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n", 2102 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 2103 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, 2104 (p_last ? *p_last : 0), status)); 2105 __kmp_str_free(&buff); 2106 } 2107 #endif 2108 #if INCLUDE_SSC_MARKS 2109 SSC_MARK_DISPATCH_NEXT(); 2110 #endif 2111 OMPT_LOOP_END; 2112 KMP_STATS_LOOP_END; 2113 return status; 2114 } 2115 2116 template <typename T> 2117 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid, 2118 kmp_int32 *plastiter, T *plower, T *pupper, 2119 typename traits_t<T>::signed_t incr) { 2120 typedef typename traits_t<T>::unsigned_t UT; 2121 kmp_uint32 team_id; 2122 kmp_uint32 nteams; 2123 UT trip_count; 2124 kmp_team_t *team; 2125 kmp_info_t *th; 2126 2127 KMP_DEBUG_ASSERT(plastiter && plower && pupper); 2128 KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid)); 2129 #ifdef KMP_DEBUG 2130 typedef typename traits_t<T>::signed_t ST; 2131 { 2132 char *buff; 2133 // create format specifiers before the debug output 2134 buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d " 2135 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n", 2136 traits_t<T>::spec, traits_t<T>::spec, 2137 traits_t<ST>::spec, traits_t<T>::spec); 2138 KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr)); 2139 __kmp_str_free(&buff); 2140 } 2141 #endif 2142 2143 if (__kmp_env_consistency_check) { 2144 if (incr == 0) { 2145 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, 2146 loc); 2147 } 2148 if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) { 2149 // The loop is illegal. 
2150 // Some zero-trip loops maintained by compiler, e.g.: 2151 // for(i=10;i<0;++i) // lower >= upper - run-time check 2152 // for(i=0;i>10;--i) // lower <= upper - run-time check 2153 // for(i=0;i>10;++i) // incr > 0 - compile-time check 2154 // for(i=10;i<0;--i) // incr < 0 - compile-time check 2155 // Compiler does not check the following illegal loops: 2156 // for(i=0;i<10;i+=incr) // where incr<0 2157 // for(i=10;i>0;i-=incr) // where incr<0 2158 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc); 2159 } 2160 } 2161 th = __kmp_threads[gtid]; 2162 team = th->th.th_team; 2163 #if OMP_40_ENABLED 2164 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct 2165 nteams = th->th.th_teams_size.nteams; 2166 #endif 2167 team_id = team->t.t_master_tid; 2168 KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc); 2169 2170 // compute global trip count 2171 if (incr == 1) { 2172 trip_count = *pupper - *plower + 1; 2173 } else if (incr == -1) { 2174 trip_count = *plower - *pupper + 1; 2175 } else if (incr > 0) { 2176 // upper-lower can exceed the limit of signed type 2177 trip_count = (UT)(*pupper - *plower) / incr + 1; 2178 } else { 2179 trip_count = (UT)(*plower - *pupper) / (-incr) + 1; 2180 } 2181 2182 if (trip_count <= nteams) { 2183 KMP_DEBUG_ASSERT( 2184 __kmp_static == kmp_sch_static_greedy || 2185 __kmp_static == 2186 kmp_sch_static_balanced); // Unknown static scheduling type. 2187 // only some teams get single iteration, others get nothing 2188 if (team_id < trip_count) { 2189 *pupper = *plower = *plower + team_id * incr; 2190 } else { 2191 *plower = *pupper + incr; // zero-trip loop 2192 } 2193 if (plastiter != NULL) 2194 *plastiter = (team_id == trip_count - 1); 2195 } else { 2196 if (__kmp_static == kmp_sch_static_balanced) { 2197 UT chunk = trip_count / nteams; 2198 UT extras = trip_count % nteams; 2199 *plower += 2200 incr * (team_id * chunk + (team_id < extras ? team_id : extras)); 2201 *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr); 2202 if (plastiter != NULL) 2203 *plastiter = (team_id == nteams - 1); 2204 } else { 2205 T chunk_inc_count = 2206 (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr; 2207 T upper = *pupper; 2208 KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy); 2209 // Unknown static scheduling type. 2210 *plower += team_id * chunk_inc_count; 2211 *pupper = *plower + chunk_inc_count - incr; 2212 // Check/correct bounds if needed 2213 if (incr > 0) { 2214 if (*pupper < *plower) 2215 *pupper = traits_t<T>::max_value; 2216 if (plastiter != NULL) 2217 *plastiter = *plower <= upper && *pupper > upper - incr; 2218 if (*pupper > upper) 2219 *pupper = upper; // tracker C73258 2220 } else { 2221 if (*pupper > *plower) 2222 *pupper = traits_t<T>::min_value; 2223 if (plastiter != NULL) 2224 *plastiter = *plower >= upper && *pupper < upper - incr; 2225 if (*pupper < upper) 2226 *pupper = upper; // tracker C73258 2227 } 2228 } 2229 } 2230 } 2231 2232 //----------------------------------------------------------------------------- 2233 // Dispatch routines 2234 // Transfer call to template< type T > 2235 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule, 2236 // T lb, T ub, ST st, ST chunk ) 2237 extern "C" { 2238 2239 /*! 
@ingroup WORK_SHARING
@{
@param loc Source location
@param gtid Global thread id
@param schedule Schedule type
@param lb Lower bound
@param ub Upper bound
@param st Step (or increment if you prefer)
@param chunk The chunk size to block with

This function prepares the runtime to start a dynamically scheduled for loop,
saving the loop arguments.
These functions are all identical apart from the types of the arguments.
*/

void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int32 lb,
                            kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint32 lb,
                             kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int64 lb,
                            kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint64 lb,
                             kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4

These functions differ from the __kmpc_dispatch_init set in that they are
called for the composite distribute parallel for construct. Thus, before
dispatching the regular iterations, the per-team iteration space must be
computed.

These functions are all identical apart from the types of the arguments.
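
As an illustration only (numbers chosen for this example): with the balanced
static split performed by __kmp_dist_get_bounds, a distribute parallel for
over 10 iterations executed by 4 teams yields chunk = 10 / 4 = 2 and
extras = 10 % 4 = 2, so teams 0 and 1 receive 3 iterations each ([0,2] and
[3,5]) while teams 2 and 3 receive 2 each ([6,7] and [8,9]); regular
dispatching is then initialized on the per-team range only.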
2311 */ 2312 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2313 enum sched_type schedule, kmp_int32 *p_last, 2314 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, 2315 kmp_int32 chunk) { 2316 KMP_DEBUG_ASSERT(__kmp_init_serial); 2317 #if OMPT_SUPPORT && OMPT_OPTIONAL 2318 OMPT_STORE_RETURN_ADDRESS(gtid); 2319 #endif 2320 __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st); 2321 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2322 } 2323 2324 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2325 enum sched_type schedule, kmp_int32 *p_last, 2326 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, 2327 kmp_int32 chunk) { 2328 KMP_DEBUG_ASSERT(__kmp_init_serial); 2329 #if OMPT_SUPPORT && OMPT_OPTIONAL 2330 OMPT_STORE_RETURN_ADDRESS(gtid); 2331 #endif 2332 __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st); 2333 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2334 } 2335 2336 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2337 enum sched_type schedule, kmp_int32 *p_last, 2338 kmp_int64 lb, kmp_int64 ub, kmp_int64 st, 2339 kmp_int64 chunk) { 2340 KMP_DEBUG_ASSERT(__kmp_init_serial); 2341 #if OMPT_SUPPORT && OMPT_OPTIONAL 2342 OMPT_STORE_RETURN_ADDRESS(gtid); 2343 #endif 2344 __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st); 2345 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2346 } 2347 2348 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2349 enum sched_type schedule, kmp_int32 *p_last, 2350 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, 2351 kmp_int64 chunk) { 2352 KMP_DEBUG_ASSERT(__kmp_init_serial); 2353 #if OMPT_SUPPORT && OMPT_OPTIONAL 2354 OMPT_STORE_RETURN_ADDRESS(gtid); 2355 #endif 2356 __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st); 2357 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2358 } 2359 2360 /*! 2361 @param loc Source code location 2362 @param gtid Global thread id 2363 @param p_last Pointer to a flag set to one if this is the last chunk or zero 2364 otherwise 2365 @param p_lb Pointer to the lower bound for the next chunk of work 2366 @param p_ub Pointer to the upper bound for the next chunk of work 2367 @param p_st Pointer to the stride for the next chunk of work 2368 @return one if there is work to be done, zero otherwise 2369 2370 Get the next dynamically allocated chunk of work for this thread. 2371 If there is no more work, then the lb,ub and stride need not be modified. 2372 */ 2373 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2374 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) { 2375 #if OMPT_SUPPORT && OMPT_OPTIONAL 2376 OMPT_STORE_RETURN_ADDRESS(gtid); 2377 #endif 2378 return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st 2379 #if OMPT_SUPPORT && OMPT_OPTIONAL 2380 , 2381 OMPT_LOAD_RETURN_ADDRESS(gtid) 2382 #endif 2383 ); 2384 } 2385 2386 /*! 2387 See @ref __kmpc_dispatch_next_4 2388 */ 2389 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2390 kmp_uint32 *p_lb, kmp_uint32 *p_ub, 2391 kmp_int32 *p_st) { 2392 #if OMPT_SUPPORT && OMPT_OPTIONAL 2393 OMPT_STORE_RETURN_ADDRESS(gtid); 2394 #endif 2395 return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st 2396 #if OMPT_SUPPORT && OMPT_OPTIONAL 2397 , 2398 OMPT_LOAD_RETURN_ADDRESS(gtid) 2399 #endif 2400 ); 2401 } 2402 2403 /*! 
2404 See @ref __kmpc_dispatch_next_4 2405 */ 2406 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2407 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) { 2408 #if OMPT_SUPPORT && OMPT_OPTIONAL 2409 OMPT_STORE_RETURN_ADDRESS(gtid); 2410 #endif 2411 return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st 2412 #if OMPT_SUPPORT && OMPT_OPTIONAL 2413 , 2414 OMPT_LOAD_RETURN_ADDRESS(gtid) 2415 #endif 2416 ); 2417 } 2418 2419 /*! 2420 See @ref __kmpc_dispatch_next_4 2421 */ 2422 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2423 kmp_uint64 *p_lb, kmp_uint64 *p_ub, 2424 kmp_int64 *p_st) { 2425 #if OMPT_SUPPORT && OMPT_OPTIONAL 2426 OMPT_STORE_RETURN_ADDRESS(gtid); 2427 #endif 2428 return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st 2429 #if OMPT_SUPPORT && OMPT_OPTIONAL 2430 , 2431 OMPT_LOAD_RETURN_ADDRESS(gtid) 2432 #endif 2433 ); 2434 } 2435 2436 /*! 2437 @param loc Source code location 2438 @param gtid Global thread id 2439 2440 Mark the end of a dynamic loop. 2441 */ 2442 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) { 2443 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2444 } 2445 2446 /*! 2447 See @ref __kmpc_dispatch_fini_4 2448 */ 2449 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) { 2450 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2451 } 2452 2453 /*! 2454 See @ref __kmpc_dispatch_fini_4 2455 */ 2456 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) { 2457 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2458 } 2459 2460 /*! 2461 See @ref __kmpc_dispatch_fini_4 2462 */ 2463 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) { 2464 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2465 } 2466 /*! @} */ 2467 2468 //----------------------------------------------------------------------------- 2469 // Non-template routines from kmp_dispatch.cpp used in other sources 2470 2471 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) { 2472 return value == checker; 2473 } 2474 2475 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) { 2476 return value != checker; 2477 } 2478 2479 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) { 2480 return value < checker; 2481 } 2482 2483 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) { 2484 return value >= checker; 2485 } 2486 2487 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) { 2488 return value <= checker; 2489 } 2490 2491 kmp_uint32 2492 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker, 2493 kmp_uint32 (*pred)(kmp_uint32, kmp_uint32), 2494 void *obj // Higher-level synchronization object, or NULL. 2495 ) { 2496 // note: we may not belong to a team at this point 2497 volatile kmp_uint32 *spin = spinner; 2498 kmp_uint32 check = checker; 2499 kmp_uint32 spins; 2500 kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred; 2501 kmp_uint32 r; 2502 2503 KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin)); 2504 KMP_INIT_YIELD(spins); 2505 // main wait spin loop 2506 while (!f(r = TCR_4(*spin), check)) { 2507 KMP_FSYNC_SPIN_PREPARE(obj); 2508 /* GEH - remove this since it was accidentally introduced when kmp_wait was 2509 split. 
       It causes problems with infinite recursion because of exit lock */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
        __kmp_abort_thread(); */
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}

void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
                      kmp_uint32 (*pred)(void *, kmp_uint32),
                      void *obj // Higher-level synchronization object, or NULL.
                      ) {
  // note: we may not belong to a team at this point
  void *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(void *, kmp_uint32) = pred;

  KMP_FSYNC_SPIN_INIT(obj, spin);
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* if we have waited a bit, or are oversubscribed, yield */
    /* pause is in the following code */
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
}

} // extern "C"

#ifdef KMP_GOMP_COMPAT

void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int32 lb,
                               kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint32 lb,
                                kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int64 lb,
                               kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint64 lb,
                                kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

#endif /* KMP_GOMP_COMPAT */

/* ------------------------------------------------------------------------ */
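
// ---------------------------------------------------------------------------
// Usage sketch (kept under "#if 0", i.e. not compiled): how a compiler-generated
// caller might drive the __kmpc_dispatch_* entry points above for a loop such as
//   #pragma omp for schedule(dynamic, 4)
//   for (kmp_int32 i = 0; i < n; ++i) body(i);
// executed by each thread of an active parallel region. The body() callback and
// the loop extent n are placeholders introduced only for this illustration; real
// callers also pass compiler-generated ident_t location information.
#if 0
static void example_dynamic_loop(ident_t *loc, kmp_int32 n) {
  kmp_int32 gtid = __kmpc_global_thread_num(loc); // this thread's global id
  // Register the loop: inclusive bounds [0, n-1], stride 1, chunk size 4.
  __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, 0, n - 1, 1, 4);
  kmp_int32 lb, ub, st, last;
  // Keep asking for chunks until the runtime reports that no work remains.
  while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
    for (kmp_int32 i = lb; i <= ub; i += st)
      body(i); // placeholder for the loop body
  }
  // For ordered loops the compiler additionally emits __kmpc_dispatch_fini_4
  // calls (not shown here).
}
#endif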
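
// ---------------------------------------------------------------------------
// Usage sketch (also under "#if 0") for the spin-wait helpers above: block until
// a shared counter reaches a threshold, using one of the predicate functions
// exported by this file (__kmp_eq_4, __kmp_ge_4, ...). The example_flag variable
// is a placeholder introduced only for this illustration.
#if 0
static volatile kmp_uint32 example_flag = 0;

static void example_wait_for_flag(void) {
  // Spins (yielding when oversubscribed) until __kmp_ge_4(example_flag, 1)
  // returns nonzero, i.e. until example_flag >= 1; the value observed when the
  // predicate first succeeded is returned.
  kmp_uint32 seen = __kmp_wait_4(&example_flag, 1, __kmp_ge_4, NULL);
  (void)seen; // silence unused-variable warnings in this sketch
}
#endif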