1 /* 2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch. 3 */ 4 5 //===----------------------------------------------------------------------===// 6 // 7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 8 // See https://llvm.org/LICENSE.txt for license information. 9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 10 // 11 //===----------------------------------------------------------------------===// 12 13 /* Dynamic scheduling initialization and dispatch. 14 * 15 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however 16 * it may change values between parallel regions. __kmp_max_nth 17 * is the largest value __kmp_nth may take, 1 is the smallest. 18 */ 19 20 #include "kmp.h" 21 #include "kmp_error.h" 22 #include "kmp_i18n.h" 23 #include "kmp_itt.h" 24 #include "kmp_stats.h" 25 #include "kmp_str.h" 26 #if KMP_USE_X87CONTROL 27 #include <float.h> 28 #endif 29 #include "kmp_lock.h" 30 #include "kmp_dispatch.h" 31 #if KMP_USE_HIER_SCHED 32 #include "kmp_dispatch_hier.h" 33 #endif 34 35 #if OMPT_SUPPORT 36 #include "ompt-specific.h" 37 #endif 38 39 /* ------------------------------------------------------------------------ */ 40 /* ------------------------------------------------------------------------ */ 41 42 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 43 kmp_info_t *th; 44 45 KMP_DEBUG_ASSERT(gtid_ref); 46 47 if (__kmp_env_consistency_check) { 48 th = __kmp_threads[*gtid_ref]; 49 if (th->th.th_root->r.r_active && 50 (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) { 51 #if KMP_USE_DYNAMIC_LOCK 52 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0); 53 #else 54 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL); 55 #endif 56 } 57 } 58 } 59 60 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 61 kmp_info_t *th; 62 63 if (__kmp_env_consistency_check) { 64 th = __kmp_threads[*gtid_ref]; 65 if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) { 66 __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref); 67 } 68 } 69 } 70 71 // Initialize a dispatch_private_info_template<T> buffer for a particular 72 // type of schedule,chunk. The loop description is found in lb (lower bound), 73 // ub (upper bound), and st (stride). nproc is the number of threads relevant 74 // to the scheduling (often the number of threads in a team, but not always if 75 // hierarchical scheduling is used). tid is the id of the thread calling 76 // the function within the group of nproc threads. It will have a value 77 // between 0 and nproc - 1. This is often just the thread id within a team, but 78 // is not necessarily the case when using hierarchical scheduling. 
79 // loc is the source file location of the corresponding loop 80 // gtid is the global thread id 81 template <typename T> 82 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, 83 dispatch_private_info_template<T> *pr, 84 enum sched_type schedule, T lb, T ub, 85 typename traits_t<T>::signed_t st, 86 #if USE_ITT_BUILD 87 kmp_uint64 *cur_chunk, 88 #endif 89 typename traits_t<T>::signed_t chunk, 90 T nproc, T tid) { 91 typedef typename traits_t<T>::unsigned_t UT; 92 typedef typename traits_t<T>::floating_t DBL; 93 94 int active; 95 T tc; 96 kmp_info_t *th; 97 kmp_team_t *team; 98 99 #ifdef KMP_DEBUG 100 typedef typename traits_t<T>::signed_t ST; 101 { 102 char *buff; 103 // create format specifiers before the debug output 104 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called " 105 "pr:%%p lb:%%%s ub:%%%s st:%%%s " 106 "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n", 107 traits_t<T>::spec, traits_t<T>::spec, 108 traits_t<ST>::spec, traits_t<ST>::spec, 109 traits_t<T>::spec, traits_t<T>::spec); 110 KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid)); 111 __kmp_str_free(&buff); 112 } 113 #endif 114 /* setup data */ 115 th = __kmp_threads[gtid]; 116 team = th->th.th_team; 117 active = !team->t.t_serialized; 118 119 #if USE_ITT_BUILD 120 int itt_need_metadata_reporting = __itt_metadata_add_ptr && 121 __kmp_forkjoin_frames_mode == 3 && 122 KMP_MASTER_GTID(gtid) && 123 #if OMP_40_ENABLED 124 th->th.th_teams_microtask == NULL && 125 #endif 126 team->t.t_active_level == 1; 127 #endif 128 #if (KMP_STATIC_STEAL_ENABLED) 129 if (SCHEDULE_HAS_NONMONOTONIC(schedule)) 130 // AC: we now have only one implementation of stealing, so use it 131 schedule = kmp_sch_static_steal; 132 else 133 #endif 134 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 135 136 /* Pick up the nomerge/ordered bits from the scheduling type */ 137 if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) { 138 pr->flags.nomerge = TRUE; 139 schedule = 140 (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); 141 } else { 142 pr->flags.nomerge = FALSE; 143 } 144 pr->type_size = traits_t<T>::type_size; // remember the size of variables 145 if (kmp_ord_lower & schedule) { 146 pr->flags.ordered = TRUE; 147 schedule = 148 (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); 149 } else { 150 pr->flags.ordered = FALSE; 151 } 152 153 if (schedule == kmp_sch_static) { 154 schedule = __kmp_static; 155 } else { 156 if (schedule == kmp_sch_runtime) { 157 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if 158 // not specified) 159 schedule = team->t.t_sched.r_sched_type; 160 // Detail the schedule if needed (global controls are differentiated 161 // appropriately) 162 if (schedule == kmp_sch_guided_chunked) { 163 schedule = __kmp_guided; 164 } else if (schedule == kmp_sch_static) { 165 schedule = __kmp_static; 166 } 167 // Use the chunk size specified by OMP_SCHEDULE (or default if not 168 // specified) 169 chunk = team->t.t_sched.chunk; 170 #if USE_ITT_BUILD 171 if (cur_chunk) 172 *cur_chunk = chunk; 173 #endif 174 #ifdef KMP_DEBUG 175 { 176 char *buff; 177 // create format specifiers before the debug output 178 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: " 179 "schedule:%%d chunk:%%%s\n", 180 traits_t<ST>::spec); 181 KD_TRACE(10, (buff, gtid, schedule, chunk)); 182 __kmp_str_free(&buff); 183 } 184 #endif 185 } else { 186 if (schedule == kmp_sch_guided_chunked) { 187 schedule = __kmp_guided; 188 } 189 if (chunk <= 0) { 190 
chunk = KMP_DEFAULT_CHUNK; 191 } 192 } 193 194 if (schedule == kmp_sch_auto) { 195 // mapping and differentiation: in the __kmp_do_serial_initialize() 196 schedule = __kmp_auto; 197 #ifdef KMP_DEBUG 198 { 199 char *buff; 200 // create format specifiers before the debug output 201 buff = __kmp_str_format( 202 "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: " 203 "schedule:%%d chunk:%%%s\n", 204 traits_t<ST>::spec); 205 KD_TRACE(10, (buff, gtid, schedule, chunk)); 206 __kmp_str_free(&buff); 207 } 208 #endif 209 } 210 211 /* guided analytical not safe for too many threads */ 212 if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) { 213 schedule = kmp_sch_guided_iterative_chunked; 214 KMP_WARNING(DispatchManyThreads); 215 } 216 #if OMP_45_ENABLED 217 if (schedule == kmp_sch_runtime_simd) { 218 // compiler provides simd_width in the chunk parameter 219 schedule = team->t.t_sched.r_sched_type; 220 // Detail the schedule if needed (global controls are differentiated 221 // appropriately) 222 if (schedule == kmp_sch_static || schedule == kmp_sch_auto || 223 schedule == __kmp_static) { 224 schedule = kmp_sch_static_balanced_chunked; 225 } else { 226 if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) { 227 schedule = kmp_sch_guided_simd; 228 } 229 chunk = team->t.t_sched.chunk * chunk; 230 } 231 #if USE_ITT_BUILD 232 if (cur_chunk) 233 *cur_chunk = chunk; 234 #endif 235 #ifdef KMP_DEBUG 236 { 237 char *buff; 238 // create format specifiers before the debug output 239 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d" 240 " chunk:%%%s\n", 241 traits_t<ST>::spec); 242 KD_TRACE(10, (buff, gtid, schedule, chunk)); 243 __kmp_str_free(&buff); 244 } 245 #endif 246 } 247 #endif // OMP_45_ENABLED 248 pr->u.p.parm1 = chunk; 249 } 250 KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper), 251 "unknown scheduling type"); 252 253 pr->u.p.count = 0; 254 255 if (__kmp_env_consistency_check) { 256 if (st == 0) { 257 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, 258 (pr->flags.ordered ? 
ct_pdo_ordered : ct_pdo), loc); 259 } 260 } 261 // compute trip count 262 if (st == 1) { // most common case 263 if (ub >= lb) { 264 tc = ub - lb + 1; 265 } else { // ub < lb 266 tc = 0; // zero-trip 267 } 268 } else if (st < 0) { 269 if (lb >= ub) { 270 // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B), 271 // where the division needs to be unsigned regardless of the result type 272 tc = (UT)(lb - ub) / (-st) + 1; 273 } else { // lb < ub 274 tc = 0; // zero-trip 275 } 276 } else { // st > 0 277 if (ub >= lb) { 278 // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B), 279 // where the division needs to be unsigned regardless of the result type 280 tc = (UT)(ub - lb) / st + 1; 281 } else { // ub < lb 282 tc = 0; // zero-trip 283 } 284 } 285 286 #if KMP_STATS_ENABLED 287 if (KMP_MASTER_GTID(gtid)) { 288 KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc); 289 } 290 #endif 291 292 pr->u.p.lb = lb; 293 pr->u.p.ub = ub; 294 pr->u.p.st = st; 295 pr->u.p.tc = tc; 296 297 #if KMP_OS_WINDOWS 298 pr->u.p.last_upper = ub + st; 299 #endif /* KMP_OS_WINDOWS */ 300 301 /* NOTE: only the active parallel region(s) has active ordered sections */ 302 303 if (active) { 304 if (pr->flags.ordered) { 305 pr->ordered_bumped = 0; 306 pr->u.p.ordered_lower = 1; 307 pr->u.p.ordered_upper = 0; 308 } 309 } 310 311 switch (schedule) { 312 #if (KMP_STATIC_STEAL_ENABLED) 313 case kmp_sch_static_steal: { 314 T ntc, init; 315 316 KD_TRACE(100, 317 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n", 318 gtid)); 319 320 ntc = (tc % chunk ? 1 : 0) + tc / chunk; 321 if (nproc > 1 && ntc >= nproc) { 322 KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL); 323 T id = tid; 324 T small_chunk, extras; 325 326 small_chunk = ntc / nproc; 327 extras = ntc % nproc; 328 329 init = id * small_chunk + (id < extras ? id : extras); 330 pr->u.p.count = init; 331 pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0); 332 333 pr->u.p.parm2 = lb; 334 // pr->pfields.parm3 = 0; // it's not used in static_steal 335 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid 336 pr->u.p.st = st; 337 if (traits_t<T>::type_size > 4) { 338 // AC: TODO: check if 16-byte CAS available and use it to 339 // improve performance (probably wait for explicit request 340 // before spending time on this). 341 // For now use dynamically allocated per-thread lock, 342 // free memory in __kmp_dispatch_next when status==0. 
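// [Editor's illustrative sketch, not part of the runtime build] The trip count
// (tc) computed earlier in this function handles st == 1, st < 0 and st > 0
// separately and performs the subtraction in the unsigned type so that loops
// spanning almost the whole signed range still divide correctly. A minimal
// standalone model of that formula, using a hypothetical helper and plain
// fixed-width types instead of the traits_t machinery:
#if 0 // standalone illustration only, never compiled into the library
#include <cassert>
#include <cstdint>

// Mirrors the three-way trip-count computation above; all subtractions are
// done in uint32_t so that e.g. lb = -2e9, ub = 2e9 does not overflow.
static uint32_t trip_count(int32_t lb, int32_t ub, int32_t st) {
  if (st == 1)
    return ub >= lb ? (uint32_t)ub - (uint32_t)lb + 1 : 0;
  if (st < 0)
    return lb >= ub ? ((uint32_t)lb - (uint32_t)ub) / (uint32_t)(-st) + 1 : 0;
  return ub >= lb ? ((uint32_t)ub - (uint32_t)lb) / (uint32_t)st + 1 : 0;
}

int main() {
  assert(trip_count(0, 9, 1) == 10);  // simple forward loop
  assert(trip_count(10, 1, -3) == 4); // 10, 7, 4, 1
  assert(trip_count(5, 4, 1) == 0);   // zero-trip
  // near the int32_t limits the unsigned arithmetic keeps the count exact
  assert(trip_count(-2000000000, 2000000000, 1000000000) == 5);
  return 0;
}
#endif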
343 KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL); 344 th->th.th_dispatch->th_steal_lock = 345 (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t)); 346 __kmp_init_lock(th->th.th_dispatch->th_steal_lock); 347 } 348 break; 349 } else { 350 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 351 "kmp_sch_static_balanced\n", 352 gtid)); 353 schedule = kmp_sch_static_balanced; 354 /* too few iterations: fall-through to kmp_sch_static_balanced */ 355 } // if 356 /* FALL-THROUGH to static balanced */ 357 KMP_FALLTHROUGH(); 358 } // case 359 #endif 360 case kmp_sch_static_balanced: { 361 T init, limit; 362 363 KD_TRACE( 364 100, 365 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n", 366 gtid)); 367 368 if (nproc > 1) { 369 T id = tid; 370 371 if (tc < nproc) { 372 if (id < tc) { 373 init = id; 374 limit = id; 375 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ 376 } else { 377 pr->u.p.count = 1; /* means no more chunks to execute */ 378 pr->u.p.parm1 = FALSE; 379 break; 380 } 381 } else { 382 T small_chunk = tc / nproc; 383 T extras = tc % nproc; 384 init = id * small_chunk + (id < extras ? id : extras); 385 limit = init + small_chunk - (id < extras ? 0 : 1); 386 pr->u.p.parm1 = (id == nproc - 1); 387 } 388 } else { 389 if (tc > 0) { 390 init = 0; 391 limit = tc - 1; 392 pr->u.p.parm1 = TRUE; 393 } else { 394 // zero trip count 395 pr->u.p.count = 1; /* means no more chunks to execute */ 396 pr->u.p.parm1 = FALSE; 397 break; 398 } 399 } 400 #if USE_ITT_BUILD 401 // Calculate chunk for metadata report 402 if (itt_need_metadata_reporting) 403 if (cur_chunk) 404 *cur_chunk = limit - init + 1; 405 #endif 406 if (st == 1) { 407 pr->u.p.lb = lb + init; 408 pr->u.p.ub = lb + limit; 409 } else { 410 // calculated upper bound, "ub" is user-defined upper bound 411 T ub_tmp = lb + limit * st; 412 pr->u.p.lb = lb + init * st; 413 // adjust upper bound to "ub" if needed, so that MS lastprivate will match 414 // it exactly 415 if (st > 0) { 416 pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp); 417 } else { 418 pr->u.p.ub = (ub_tmp + st < ub ? 
ub : ub_tmp); 419 } 420 } 421 if (pr->flags.ordered) { 422 pr->u.p.ordered_lower = init; 423 pr->u.p.ordered_upper = limit; 424 } 425 break; 426 } // case 427 #if OMP_45_ENABLED 428 case kmp_sch_static_balanced_chunked: { 429 // similar to balanced, but chunk adjusted to multiple of simd width 430 T nth = nproc; 431 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)" 432 " -> falling-through to static_greedy\n", 433 gtid)); 434 schedule = kmp_sch_static_greedy; 435 if (nth > 1) 436 pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1); 437 else 438 pr->u.p.parm1 = tc; 439 break; 440 } // case 441 case kmp_sch_guided_simd: 442 #endif // OMP_45_ENABLED 443 case kmp_sch_guided_iterative_chunked: { 444 KD_TRACE( 445 100, 446 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked" 447 " case\n", 448 gtid)); 449 450 if (nproc > 1) { 451 if ((2L * chunk + 1) * nproc >= tc) { 452 /* chunk size too large, switch to dynamic */ 453 schedule = kmp_sch_dynamic_chunked; 454 } else { 455 // when remaining iters become less than parm2 - switch to dynamic 456 pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1); 457 *(double *)&pr->u.p.parm3 = 458 guided_flt_param / nproc; // may occupy parm3 and parm4 459 } 460 } else { 461 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 462 "kmp_sch_static_greedy\n", 463 gtid)); 464 schedule = kmp_sch_static_greedy; 465 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 466 KD_TRACE( 467 100, 468 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n", 469 gtid)); 470 pr->u.p.parm1 = tc; 471 } // if 472 } // case 473 break; 474 case kmp_sch_guided_analytical_chunked: { 475 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d " 476 "kmp_sch_guided_analytical_chunked case\n", 477 gtid)); 478 479 if (nproc > 1) { 480 if ((2L * chunk + 1) * nproc >= tc) { 481 /* chunk size too large, switch to dynamic */ 482 schedule = kmp_sch_dynamic_chunked; 483 } else { 484 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 485 DBL x; 486 487 #if KMP_USE_X87CONTROL 488 /* Linux* OS already has 64-bit computation by default for long double, 489 and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On 490 Windows* OS on IA-32 architecture, we need to set precision to 64-bit 491 instead of the default 53-bit. Even though long double doesn't work 492 on Windows* OS on Intel(R) 64, the resulting lack of precision is not 493 expected to impact the correctness of the algorithm, but this has not 494 been mathematically proven. 
*/ 495 // save original FPCW and set precision to 64-bit, as 496 // Windows* OS on IA-32 architecture defaults to 53-bit 497 unsigned int oldFpcw = _control87(0, 0); 498 _control87(_PC_64, _MCW_PC); // 0,0x30000 499 #endif 500 /* value used for comparison in solver for cross-over point */ 501 long double target = ((long double)chunk * 2 + 1) * nproc / tc; 502 503 /* crossover point--chunk indexes equal to or greater than 504 this point switch to dynamic-style scheduling */ 505 UT cross; 506 507 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 508 x = (long double)1.0 - (long double)0.5 / nproc; 509 510 #ifdef KMP_DEBUG 511 { // test natural alignment 512 struct _test_a { 513 char a; 514 union { 515 char b; 516 DBL d; 517 }; 518 } t; 519 ptrdiff_t natural_alignment = 520 (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; 521 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long 522 // long)natural_alignment ); 523 KMP_DEBUG_ASSERT( 524 (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0); 525 } 526 #endif // KMP_DEBUG 527 528 /* save the term in thread private dispatch structure */ 529 *(DBL *)&pr->u.p.parm3 = x; 530 531 /* solve for the crossover point to the nearest integer i for which C_i 532 <= chunk */ 533 { 534 UT left, right, mid; 535 long double p; 536 537 /* estimate initial upper and lower bound */ 538 539 /* doesn't matter what value right is as long as it is positive, but 540 it affects performance of the solver */ 541 right = 229; 542 p = __kmp_pow<UT>(x, right); 543 if (p > target) { 544 do { 545 p *= p; 546 right <<= 1; 547 } while (p > target && right < (1 << 27)); 548 /* lower bound is previous (failed) estimate of upper bound */ 549 left = right >> 1; 550 } else { 551 left = 0; 552 } 553 554 /* bisection root-finding method */ 555 while (left + 1 < right) { 556 mid = (left + right) / 2; 557 if (__kmp_pow<UT>(x, mid) > target) { 558 left = mid; 559 } else { 560 right = mid; 561 } 562 } // while 563 cross = right; 564 } 565 /* assert sanity of computed crossover point */ 566 KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target && 567 __kmp_pow<UT>(x, cross) <= target); 568 569 /* save the crossover point in thread private dispatch structure */ 570 pr->u.p.parm2 = cross; 571 572 // C75803 573 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8)) 574 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3) 575 #else 576 #define GUIDED_ANALYTICAL_WORKAROUND (x) 577 #endif 578 /* dynamic-style scheduling offset */ 579 pr->u.p.count = tc - __kmp_dispatch_guided_remaining( 580 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - 581 cross * chunk; 582 #if KMP_USE_X87CONTROL 583 // restore FPCW 584 _control87(oldFpcw, _MCW_PC); 585 #endif 586 } // if 587 } else { 588 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 589 "kmp_sch_static_greedy\n", 590 gtid)); 591 schedule = kmp_sch_static_greedy; 592 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 593 pr->u.p.parm1 = tc; 594 } // if 595 } // case 596 break; 597 case kmp_sch_static_greedy: 598 KD_TRACE( 599 100, 600 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n", 601 gtid)); 602 pr->u.p.parm1 = (nproc > 1) ? 
(tc + nproc - 1) / nproc : tc; 603 break; 604 case kmp_sch_static_chunked: 605 case kmp_sch_dynamic_chunked: 606 if (pr->u.p.parm1 <= 0) { 607 pr->u.p.parm1 = KMP_DEFAULT_CHUNK; 608 } 609 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d " 610 "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", 611 gtid)); 612 break; 613 case kmp_sch_trapezoidal: { 614 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ 615 616 T parm1, parm2, parm3, parm4; 617 KD_TRACE(100, 618 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n", 619 gtid)); 620 621 parm1 = chunk; 622 623 /* F : size of the first cycle */ 624 parm2 = (tc / (2 * nproc)); 625 626 if (parm2 < 1) { 627 parm2 = 1; 628 } 629 630 /* L : size of the last cycle. Make sure the last cycle is not larger 631 than the first cycle. */ 632 if (parm1 < 1) { 633 parm1 = 1; 634 } else if (parm1 > parm2) { 635 parm1 = parm2; 636 } 637 638 /* N : number of cycles */ 639 parm3 = (parm2 + parm1); 640 parm3 = (2 * tc + parm3 - 1) / parm3; 641 642 if (parm3 < 2) { 643 parm3 = 2; 644 } 645 646 /* sigma : decreasing incr of the trapezoid */ 647 parm4 = (parm3 - 1); 648 parm4 = (parm2 - parm1) / parm4; 649 650 // pointless check, because parm4 >= 0 always 651 // if ( parm4 < 0 ) { 652 // parm4 = 0; 653 //} 654 655 pr->u.p.parm1 = parm1; 656 pr->u.p.parm2 = parm2; 657 pr->u.p.parm3 = parm3; 658 pr->u.p.parm4 = parm4; 659 } // case 660 break; 661 662 default: { 663 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 664 KMP_HNT(GetNewerLibrary), // Hint 665 __kmp_msg_null // Variadic argument list terminator 666 ); 667 } break; 668 } // switch 669 pr->schedule = schedule; 670 } 671 672 #if KMP_USE_HIER_SCHED 673 template <typename T> 674 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub, 675 typename traits_t<T>::signed_t st); 676 template <> 677 inline void 678 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb, 679 kmp_int32 ub, kmp_int32 st) { 680 __kmp_dispatch_init_hierarchy<kmp_int32>( 681 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 682 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st); 683 } 684 template <> 685 inline void 686 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb, 687 kmp_uint32 ub, kmp_int32 st) { 688 __kmp_dispatch_init_hierarchy<kmp_uint32>( 689 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 690 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st); 691 } 692 template <> 693 inline void 694 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb, 695 kmp_int64 ub, kmp_int64 st) { 696 __kmp_dispatch_init_hierarchy<kmp_int64>( 697 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 698 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st); 699 } 700 template <> 701 inline void 702 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb, 703 kmp_uint64 ub, kmp_int64 st) { 704 __kmp_dispatch_init_hierarchy<kmp_uint64>( 705 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 706 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st); 707 } 708 709 // free all the hierarchy scheduling memory associated with the team 710 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) { 711 int num_disp_buff = team->t.t_max_nproc > 1 ? 
__kmp_dispatch_num_buffers : 2; 712 for (int i = 0; i < num_disp_buff; ++i) { 713 // type does not matter here so use kmp_int32 714 auto sh = 715 reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>( 716 &team->t.t_disp_buffer[i]); 717 if (sh->hier) { 718 sh->hier->deallocate(); 719 __kmp_free(sh->hier); 720 } 721 } 722 } 723 #endif 724 725 // UT - unsigned flavor of T, ST - signed flavor of T, 726 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8 727 template <typename T> 728 static void 729 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb, 730 T ub, typename traits_t<T>::signed_t st, 731 typename traits_t<T>::signed_t chunk, int push_ws) { 732 typedef typename traits_t<T>::unsigned_t UT; 733 734 int active; 735 kmp_info_t *th; 736 kmp_team_t *team; 737 kmp_uint32 my_buffer_index; 738 dispatch_private_info_template<T> *pr; 739 dispatch_shared_info_template<T> volatile *sh; 740 741 KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) == 742 sizeof(dispatch_private_info)); 743 KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) == 744 sizeof(dispatch_shared_info)); 745 746 if (!TCR_4(__kmp_init_parallel)) 747 __kmp_parallel_initialize(); 748 749 #if OMP_50_ENABLED 750 __kmp_resume_if_soft_paused(); 751 #endif 752 753 #if INCLUDE_SSC_MARKS 754 SSC_MARK_DISPATCH_INIT(); 755 #endif 756 #ifdef KMP_DEBUG 757 typedef typename traits_t<T>::signed_t ST; 758 { 759 char *buff; 760 // create format specifiers before the debug output 761 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d " 762 "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n", 763 traits_t<ST>::spec, traits_t<T>::spec, 764 traits_t<T>::spec, traits_t<ST>::spec); 765 KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st)); 766 __kmp_str_free(&buff); 767 } 768 #endif 769 /* setup data */ 770 th = __kmp_threads[gtid]; 771 team = th->th.th_team; 772 active = !team->t.t_serialized; 773 th->th.th_ident = loc; 774 775 // Any half-decent optimizer will remove this test when the blocks are empty 776 // since the macros expand to nothing 777 // when statistics are disabled. 778 if (schedule == __kmp_static) { 779 KMP_COUNT_BLOCK(OMP_LOOP_STATIC); 780 } else { 781 KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC); 782 } 783 784 #if KMP_USE_HIER_SCHED 785 // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable 786 // Hierarchical scheduling does not work with ordered, so if ordered is 787 // detected, then revert back to threaded scheduling. 788 bool ordered; 789 enum sched_type my_sched = schedule; 790 my_buffer_index = th->th.th_dispatch->th_disp_index; 791 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 792 &th->th.th_dispatch 793 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 794 my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched); 795 if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper)) 796 my_sched = 797 (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower)); 798 ordered = (kmp_ord_lower & my_sched); 799 if (pr->flags.use_hier) { 800 if (ordered) { 801 KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. 
" 802 "Disabling hierarchical scheduling.\n", 803 gtid)); 804 pr->flags.use_hier = FALSE; 805 } 806 } 807 if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) { 808 // Don't use hierarchical for ordered parallel loops and don't 809 // use the runtime hierarchy if one was specified in the program 810 if (!ordered && !pr->flags.use_hier) 811 __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st); 812 } 813 #endif // KMP_USE_HIER_SCHED 814 815 #if USE_ITT_BUILD 816 kmp_uint64 cur_chunk = chunk; 817 int itt_need_metadata_reporting = __itt_metadata_add_ptr && 818 __kmp_forkjoin_frames_mode == 3 && 819 KMP_MASTER_GTID(gtid) && 820 #if OMP_40_ENABLED 821 th->th.th_teams_microtask == NULL && 822 #endif 823 team->t.t_active_level == 1; 824 #endif 825 if (!active) { 826 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 827 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 828 } else { 829 KMP_DEBUG_ASSERT(th->th.th_dispatch == 830 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 831 832 my_buffer_index = th->th.th_dispatch->th_disp_index++; 833 834 /* What happens when number of threads changes, need to resize buffer? */ 835 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 836 &th->th.th_dispatch 837 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 838 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 839 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 840 KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid, 841 my_buffer_index)); 842 } 843 844 __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st, 845 #if USE_ITT_BUILD 846 &cur_chunk, 847 #endif 848 chunk, (T)th->th.th_team_nproc, 849 (T)th->th.th_info.ds.ds_tid); 850 if (active) { 851 if (pr->flags.ordered == 0) { 852 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error; 853 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error; 854 } else { 855 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>; 856 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>; 857 } 858 } 859 860 if (active) { 861 /* The name of this buffer should be my_buffer_index when it's free to use 862 * it */ 863 864 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d " 865 "sh->buffer_index:%d\n", 866 gtid, my_buffer_index, sh->buffer_index)); 867 __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index, 868 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL)); 869 // Note: KMP_WAIT() cannot be used there: buffer index and 870 // my_buffer_index are *always* 32-bit integers. 871 KMP_MB(); /* is this necessary? 
*/ 872 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d " 873 "sh->buffer_index:%d\n", 874 gtid, my_buffer_index, sh->buffer_index)); 875 876 th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr; 877 th->th.th_dispatch->th_dispatch_sh_current = 878 CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh); 879 #if USE_ITT_BUILD 880 if (pr->flags.ordered) { 881 __kmp_itt_ordered_init(gtid); 882 } 883 // Report loop metadata 884 if (itt_need_metadata_reporting) { 885 // Only report metadata by master of active team at level 1 886 kmp_uint64 schedtype = 0; 887 switch (schedule) { 888 case kmp_sch_static_chunked: 889 case kmp_sch_static_balanced: // Chunk is calculated in the switch above 890 break; 891 case kmp_sch_static_greedy: 892 cur_chunk = pr->u.p.parm1; 893 break; 894 case kmp_sch_dynamic_chunked: 895 schedtype = 1; 896 break; 897 case kmp_sch_guided_iterative_chunked: 898 case kmp_sch_guided_analytical_chunked: 899 #if OMP_45_ENABLED 900 case kmp_sch_guided_simd: 901 #endif 902 schedtype = 2; 903 break; 904 default: 905 // Should we put this case under "static"? 906 // case kmp_sch_static_steal: 907 schedtype = 3; 908 break; 909 } 910 __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk); 911 } 912 #if KMP_USE_HIER_SCHED 913 if (pr->flags.use_hier) { 914 pr->u.p.count = 0; 915 pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0; 916 } 917 #endif // KMP_USER_HIER_SCHED 918 #endif /* USE_ITT_BUILD */ 919 } 920 921 #ifdef KMP_DEBUG 922 { 923 char *buff; 924 // create format specifiers before the debug output 925 buff = __kmp_str_format( 926 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s " 927 "lb:%%%s ub:%%%s" 928 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" 929 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n", 930 traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec, 931 traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec, 932 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec, 933 traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec); 934 KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb, 935 pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count, 936 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1, 937 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4)); 938 __kmp_str_free(&buff); 939 } 940 #endif 941 #if (KMP_STATIC_STEAL_ENABLED) 942 // It cannot be guaranteed that after execution of a loop with some other 943 // schedule kind all the parm3 variables will contain the same value. Even if 944 // all parm3 will be the same, it still exists a bad case like using 0 and 1 945 // rather than program life-time increment. So the dedicated variable is 946 // required. The 'static_steal_counter' is used. 947 if (schedule == kmp_sch_static_steal) { 948 // Other threads will inspect this variable when searching for a victim. 949 // This is a flag showing that other threads may steal from this thread 950 // since then. 
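// [Editor's illustrative sketch, not part of the runtime build] The counter
// bumped below acts as a per-loop "epoch": a thief only trusts a victim whose
// descriptor carries the same counter value as its own, so it never steals
// from a descriptor left over from a previous loop or not yet initialized for
// the current one. A minimal standalone model of that idea, with hypothetical
// names (the runtime itself uses pr->u.p.static_steal_counter):
#if 0
#include <atomic>
#include <cstdint>

struct WorkDescriptor {
  std::atomic<uint32_t> epoch{0}; // bumped once per loop that uses stealing
  std::atomic<uint64_t> count{0}; // next chunk index
  uint64_t ub = 0;                // one past the last owned chunk index
};

// A thief may only steal from a victim whose epoch matches its own; otherwise
// the victim's (count, ub) may still describe an older loop instance.
static bool may_steal_from(const WorkDescriptor &mine,
                           const WorkDescriptor &victim) {
  return victim.epoch.load(std::memory_order_acquire) ==
         mine.epoch.load(std::memory_order_relaxed);
}
#endif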
951 volatile T *p = &pr->u.p.static_steal_counter; 952 *p = *p + 1; 953 } 954 #endif // ( KMP_STATIC_STEAL_ENABLED ) 955 956 #if OMPT_SUPPORT && OMPT_OPTIONAL 957 if (ompt_enabled.ompt_callback_work) { 958 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 959 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 960 ompt_callbacks.ompt_callback(ompt_callback_work)( 961 ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data), 962 &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid)); 963 } 964 #endif 965 KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic); 966 } 967 968 /* For ordered loops, either __kmp_dispatch_finish() should be called after 969 * every iteration, or __kmp_dispatch_finish_chunk() should be called after 970 * every chunk of iterations. If the ordered section(s) were not executed 971 * for this iteration (or every iteration in this chunk), we need to set the 972 * ordered iteration counters so that the next thread can proceed. */ 973 template <typename UT> 974 static void __kmp_dispatch_finish(int gtid, ident_t *loc) { 975 typedef typename traits_t<UT>::signed_t ST; 976 kmp_info_t *th = __kmp_threads[gtid]; 977 978 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid)); 979 if (!th->th.th_team->t.t_serialized) { 980 981 dispatch_private_info_template<UT> *pr = 982 reinterpret_cast<dispatch_private_info_template<UT> *>( 983 th->th.th_dispatch->th_dispatch_pr_current); 984 dispatch_shared_info_template<UT> volatile *sh = 985 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 986 th->th.th_dispatch->th_dispatch_sh_current); 987 KMP_DEBUG_ASSERT(pr); 988 KMP_DEBUG_ASSERT(sh); 989 KMP_DEBUG_ASSERT(th->th.th_dispatch == 990 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 991 992 if (pr->ordered_bumped) { 993 KD_TRACE( 994 1000, 995 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 996 gtid)); 997 pr->ordered_bumped = 0; 998 } else { 999 UT lower = pr->u.p.ordered_lower; 1000 1001 #ifdef KMP_DEBUG 1002 { 1003 char *buff; 1004 // create format specifiers before the debug output 1005 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: " 1006 "ordered_iteration:%%%s lower:%%%s\n", 1007 traits_t<UT>::spec, traits_t<UT>::spec); 1008 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 1009 __kmp_str_free(&buff); 1010 } 1011 #endif 1012 1013 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower, 1014 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 1015 KMP_MB(); /* is this necessary? 
*/ 1016 #ifdef KMP_DEBUG 1017 { 1018 char *buff; 1019 // create format specifiers before the debug output 1020 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: " 1021 "ordered_iteration:%%%s lower:%%%s\n", 1022 traits_t<UT>::spec, traits_t<UT>::spec); 1023 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 1024 __kmp_str_free(&buff); 1025 } 1026 #endif 1027 1028 test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration); 1029 } // if 1030 } // if 1031 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid)); 1032 } 1033 1034 #ifdef KMP_GOMP_COMPAT 1035 1036 template <typename UT> 1037 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) { 1038 typedef typename traits_t<UT>::signed_t ST; 1039 kmp_info_t *th = __kmp_threads[gtid]; 1040 1041 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid)); 1042 if (!th->th.th_team->t.t_serialized) { 1043 // int cid; 1044 dispatch_private_info_template<UT> *pr = 1045 reinterpret_cast<dispatch_private_info_template<UT> *>( 1046 th->th.th_dispatch->th_dispatch_pr_current); 1047 dispatch_shared_info_template<UT> volatile *sh = 1048 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 1049 th->th.th_dispatch->th_dispatch_sh_current); 1050 KMP_DEBUG_ASSERT(pr); 1051 KMP_DEBUG_ASSERT(sh); 1052 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1053 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1054 1055 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) { 1056 UT lower = pr->u.p.ordered_lower; 1057 UT upper = pr->u.p.ordered_upper; 1058 UT inc = upper - lower + 1; 1059 1060 if (pr->ordered_bumped == inc) { 1061 KD_TRACE( 1062 1000, 1063 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1064 gtid)); 1065 pr->ordered_bumped = 0; 1066 } else { 1067 inc -= pr->ordered_bumped; 1068 1069 #ifdef KMP_DEBUG 1070 { 1071 char *buff; 1072 // create format specifiers before the debug output 1073 buff = __kmp_str_format( 1074 "__kmp_dispatch_finish_chunk: T#%%d before wait: " 1075 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n", 1076 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec); 1077 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper)); 1078 __kmp_str_free(&buff); 1079 } 1080 #endif 1081 1082 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower, 1083 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 1084 1085 KMP_MB(); /* is this necessary? */ 1086 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting " 1087 "ordered_bumped to zero\n", 1088 gtid)); 1089 pr->ordered_bumped = 0; 1090 //!!!!! TODO check if the inc should be unsigned, or signed??? 
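// [Editor's illustrative sketch, not part of the runtime build] A simplified
// model of the ordered bookkeeping used by __kmp_dispatch_finish and
// __kmp_dispatch_finish_chunk: a chunk [lower, upper] waits until the shared
// ordered counter has reached its lower bound, then credits the iterations it
// is responsible for so threads owning later chunks can proceed. Names and the
// std::atomic spin are assumptions of the sketch, not the runtime's primitives.
#if 0
#include <atomic>
#include <cstdint>

static std::atomic<uint64_t> ordered_iteration{0}; // ~ sh->u.s.ordered_iteration

// Called when a thread is done with (or skipped) the ordered work of the chunk
// [lower, upper]; 'already_bumped' plays the role of pr->ordered_bumped.
static void finish_ordered_chunk(uint64_t lower, uint64_t upper,
                                 uint64_t already_bumped) {
  uint64_t inc = upper - lower + 1 - already_bumped;
  // wait until every earlier iteration has been credited
  while (ordered_iteration.load(std::memory_order_acquire) < lower) {
    // spin / yield
  }
  // credit this chunk's remaining iterations in one shot
  ordered_iteration.fetch_add(inc, std::memory_order_release);
}
#endif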
1091 #ifdef KMP_DEBUG 1092 { 1093 char *buff; 1094 // create format specifiers before the debug output 1095 buff = __kmp_str_format( 1096 "__kmp_dispatch_finish_chunk: T#%%d after wait: " 1097 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n", 1098 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec, 1099 traits_t<UT>::spec); 1100 KD_TRACE(1000, 1101 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper)); 1102 __kmp_str_free(&buff); 1103 } 1104 #endif 1105 1106 test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc); 1107 } 1108 // } 1109 } 1110 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid)); 1111 } 1112 1113 #endif /* KMP_GOMP_COMPAT */ 1114 1115 template <typename T> 1116 int __kmp_dispatch_next_algorithm(int gtid, 1117 dispatch_private_info_template<T> *pr, 1118 dispatch_shared_info_template<T> volatile *sh, 1119 kmp_int32 *p_last, T *p_lb, T *p_ub, 1120 typename traits_t<T>::signed_t *p_st, T nproc, 1121 T tid) { 1122 typedef typename traits_t<T>::unsigned_t UT; 1123 typedef typename traits_t<T>::signed_t ST; 1124 typedef typename traits_t<T>::floating_t DBL; 1125 int status = 0; 1126 kmp_int32 last = 0; 1127 T start; 1128 ST incr; 1129 UT limit, trip, init; 1130 kmp_info_t *th = __kmp_threads[gtid]; 1131 kmp_team_t *team = th->th.th_team; 1132 1133 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1134 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1135 KMP_DEBUG_ASSERT(pr); 1136 KMP_DEBUG_ASSERT(sh); 1137 KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc); 1138 #ifdef KMP_DEBUG 1139 { 1140 char *buff; 1141 // create format specifiers before the debug output 1142 buff = 1143 __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p " 1144 "sh:%%p nproc:%%%s tid:%%%s\n", 1145 traits_t<T>::spec, traits_t<T>::spec); 1146 KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid)); 1147 __kmp_str_free(&buff); 1148 } 1149 #endif 1150 1151 // zero trip count 1152 if (pr->u.p.tc == 0) { 1153 KD_TRACE(10, 1154 ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is " 1155 "zero status:%d\n", 1156 gtid, status)); 1157 return 0; 1158 } 1159 1160 switch (pr->schedule) { 1161 #if (KMP_STATIC_STEAL_ENABLED) 1162 case kmp_sch_static_steal: { 1163 T chunk = pr->u.p.parm1; 1164 1165 KD_TRACE(100, 1166 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n", 1167 gtid)); 1168 1169 trip = pr->u.p.tc - 1; 1170 1171 if (traits_t<T>::type_size > 4) { 1172 // use lock for 8-byte and CAS for 4-byte induction 1173 // variable. TODO (optional): check and use 16-byte CAS 1174 kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock; 1175 KMP_DEBUG_ASSERT(lck != NULL); 1176 if (pr->u.p.count < (UT)pr->u.p.ub) { 1177 __kmp_acquire_lock(lck, gtid); 1178 // try to get own chunk of iterations 1179 init = (pr->u.p.count)++; 1180 status = (init < (UT)pr->u.p.ub); 1181 __kmp_release_lock(lck, gtid); 1182 } else { 1183 status = 0; // no own chunks 1184 } 1185 if (!status) { // try to steal 1186 kmp_info_t **other_threads = team->t.t_threads; 1187 int while_limit = nproc; // nproc attempts to find a victim 1188 int while_index = 0; 1189 // TODO: algorithm of searching for a victim 1190 // should be cleaned up and measured 1191 while ((!status) && (while_limit != ++while_index)) { 1192 T remaining; 1193 T victimIdx = pr->u.p.parm4; 1194 T oldVictimIdx = victimIdx ? 
              victimIdx - 1 : nproc - 1;
          dispatch_private_info_template<T> *victim =
              reinterpret_cast<dispatch_private_info_template<T> *>(
                  other_threads[victimIdx]
                      ->th.th_dispatch->th_dispatch_pr_current);
          while ((victim == NULL || victim == pr ||
                  (*(volatile T *)&victim->u.p.static_steal_counter !=
                   *(volatile T *)&pr->u.p.static_steal_counter)) &&
                 oldVictimIdx != victimIdx) {
            victimIdx = (victimIdx + 1) % nproc;
            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
                other_threads[victimIdx]
                    ->th.th_dispatch->th_dispatch_pr_current);
          }
          if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
                          *(volatile T *)&pr->u.p.static_steal_counter)) {
            continue; // try once more (nproc attempts in total)
            // no victim is ready yet to participate in stealing
            // because all victims are still in kmp_init_dispatch
          }
          if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
            pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
            continue; // not enough chunks to steal, goto next victim
          }

          lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
          KMP_ASSERT(lck != NULL);
          __kmp_acquire_lock(lck, gtid);
          limit = victim->u.p.ub; // keep initial ub
          if (victim->u.p.count >= limit ||
              (remaining = limit - victim->u.p.count) < 2) {
            __kmp_release_lock(lck, gtid);
            pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
            continue; // not enough chunks to steal
          }
          // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or
          // by 1
          if (remaining > 3) {
            // steal 1/4 of remaining
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
            init = (victim->u.p.ub -= (remaining >> 2));
          } else {
            // steal 1 chunk of 2 or 3 remaining
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
            init = (victim->u.p.ub -= 1);
          }
          __kmp_release_lock(lck, gtid);

          KMP_DEBUG_ASSERT(init + 1 <= limit);
          pr->u.p.parm4 = victimIdx; // remember victim to steal from
          status = 1;
          while_index = 0;
          // now update own count and ub with stolen range but init chunk
          __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
          pr->u.p.count = init + 1;
          pr->u.p.ub = limit;
          __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
        } // while (search for victim)
      } // if (try to find victim and steal)
    } else {
      // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
      typedef union {
        struct {
          UT count;
          T ub;
        } p;
        kmp_int64 b;
      } union_i4;
      // All operations on 'count' or 'ub' must be combined atomically
      // together.
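      // [Editor's illustrative sketch, not part of the runtime build] For the
      // 4-byte case the (count, ub) pair is packed into one 64-bit word so that
      // claiming a chunk (count++) and a thief shrinking the range from the
      // other end (lowering ub) serialize through a single compare-and-swap.
      // A minimal standalone model using std::atomic; names are hypothetical,
      // the runtime uses its own union_i4 and KMP_COMPARE_AND_STORE_ACQ64.
#if 0
#include <atomic>
#include <cstdint>

union PackedRange {
  struct {
    uint32_t count; // next chunk index this thread will execute
    uint32_t ub;    // one past the last chunk index it still owns
  } p;
  uint64_t bits;
};

// Owner side: try to claim the next chunk; returns false when the range is
// exhausted, e.g. because thieves lowered 'ub' concurrently.
static bool claim_own_chunk(std::atomic<uint64_t> &range, uint32_t &chunk_idx) {
  PackedRange oldv, newv;
  oldv.bits = range.load(std::memory_order_relaxed);
  do {
    if (oldv.p.count >= oldv.p.ub)
      return false; // nothing left
    newv = oldv;
    newv.p.count++; // claim exactly one chunk
  } while (!range.compare_exchange_weak(oldv.bits, newv.bits,
                                        std::memory_order_acquire));
  chunk_idx = oldv.p.count; // the index we successfully claimed
  return true;
}
#endif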
1264 { 1265 union_i4 vold, vnew; 1266 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 1267 vnew = vold; 1268 vnew.p.count++; 1269 while (!KMP_COMPARE_AND_STORE_ACQ64( 1270 (volatile kmp_int64 *)&pr->u.p.count, 1271 *VOLATILE_CAST(kmp_int64 *) & vold.b, 1272 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 1273 KMP_CPU_PAUSE(); 1274 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 1275 vnew = vold; 1276 vnew.p.count++; 1277 } 1278 vnew = vold; 1279 init = vnew.p.count; 1280 status = (init < (UT)vnew.p.ub); 1281 } 1282 1283 if (!status) { 1284 kmp_info_t **other_threads = team->t.t_threads; 1285 int while_limit = nproc; // nproc attempts to find a victim 1286 int while_index = 0; 1287 1288 // TODO: algorithm of searching for a victim 1289 // should be cleaned up and measured 1290 while ((!status) && (while_limit != ++while_index)) { 1291 union_i4 vold, vnew; 1292 kmp_int32 remaining; 1293 T victimIdx = pr->u.p.parm4; 1294 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1; 1295 dispatch_private_info_template<T> *victim = 1296 reinterpret_cast<dispatch_private_info_template<T> *>( 1297 other_threads[victimIdx] 1298 ->th.th_dispatch->th_dispatch_pr_current); 1299 while ((victim == NULL || victim == pr || 1300 (*(volatile T *)&victim->u.p.static_steal_counter != 1301 *(volatile T *)&pr->u.p.static_steal_counter)) && 1302 oldVictimIdx != victimIdx) { 1303 victimIdx = (victimIdx + 1) % nproc; 1304 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1305 other_threads[victimIdx] 1306 ->th.th_dispatch->th_dispatch_pr_current); 1307 } 1308 if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter != 1309 *(volatile T *)&pr->u.p.static_steal_counter)) { 1310 continue; // try once more (nproc attempts in total) 1311 // no victim is ready yet to participate in stealing 1312 // because all victims are still in kmp_init_dispatch 1313 } 1314 pr->u.p.parm4 = victimIdx; // new victim found 1315 while (1) { // CAS loop if victim has enough chunks to steal 1316 vold.b = *(volatile kmp_int64 *)(&victim->u.p.count); 1317 vnew = vold; 1318 1319 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1320 if (vnew.p.count >= (UT)vnew.p.ub || 1321 (remaining = vnew.p.ub - vnew.p.count) < 2) { 1322 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id 1323 break; // not enough chunks to steal, goto next victim 1324 } 1325 if (remaining > 3) { 1326 vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining 1327 } else { 1328 vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining 1329 } 1330 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1331 // TODO: Should this be acquire or release? 
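                // [Editor's illustrative sketch, not part of the runtime
                // build] The stealing policy above takes a quarter of the
                // victim's remaining chunks per successful steal (or a single
                // chunk when only 2 or 3 remain), so a loaded victim loses work
                // geometrically rather than one chunk at a time. A standalone
                // illustration of that decay with a hypothetical helper:
#if 0
#include <cstdio>

// How many chunks a thief takes when 'remaining' chunks are left at the victim.
static unsigned chunks_to_steal(unsigned remaining) {
  if (remaining < 2)
    return 0; // not worth stealing
  return remaining > 3 ? remaining / 4 : 1;
}

int main() {
  unsigned remaining = 1000; // victim starts with 1000 unclaimed chunks
  for (int steal = 1; remaining >= 2; ++steal) {
    unsigned taken = chunks_to_steal(remaining);
    remaining -= taken;
    std::printf("steal %2d: took %3u, victim keeps %3u\n", steal, taken,
                remaining);
  }
  // the victim is whittled down to its last chunk in a couple dozen steals
  return 0;
}
#endif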
1332 if (KMP_COMPARE_AND_STORE_ACQ64( 1333 (volatile kmp_int64 *)&victim->u.p.count, 1334 *VOLATILE_CAST(kmp_int64 *) & vold.b, 1335 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 1336 // stealing succedded 1337 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1338 vold.p.ub - vnew.p.ub); 1339 status = 1; 1340 while_index = 0; 1341 // now update own count and ub 1342 init = vnew.p.ub; 1343 vold.p.count = init + 1; 1344 #if KMP_ARCH_X86 1345 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b); 1346 #else 1347 *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b; 1348 #endif 1349 break; 1350 } // if (check CAS result) 1351 KMP_CPU_PAUSE(); // CAS failed, repeate attempt 1352 } // while (try to steal from particular victim) 1353 } // while (search for victim) 1354 } // if (try to find victim and steal) 1355 } // if (4-byte induction variable) 1356 if (!status) { 1357 *p_lb = 0; 1358 *p_ub = 0; 1359 if (p_st != NULL) 1360 *p_st = 0; 1361 } else { 1362 start = pr->u.p.parm2; 1363 init *= chunk; 1364 limit = chunk + init - 1; 1365 incr = pr->u.p.st; 1366 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1); 1367 1368 KMP_DEBUG_ASSERT(init <= trip); 1369 if ((last = (limit >= trip)) != 0) 1370 limit = trip; 1371 if (p_st != NULL) 1372 *p_st = incr; 1373 1374 if (incr == 1) { 1375 *p_lb = start + init; 1376 *p_ub = start + limit; 1377 } else { 1378 *p_lb = start + init * incr; 1379 *p_ub = start + limit * incr; 1380 } 1381 1382 if (pr->flags.ordered) { 1383 pr->u.p.ordered_lower = init; 1384 pr->u.p.ordered_upper = limit; 1385 } // if 1386 } // if 1387 break; 1388 } // case 1389 #endif // ( KMP_STATIC_STEAL_ENABLED ) 1390 case kmp_sch_static_balanced: { 1391 KD_TRACE( 1392 10, 1393 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n", 1394 gtid)); 1395 /* check if thread has any iteration to do */ 1396 if ((status = !pr->u.p.count) != 0) { 1397 pr->u.p.count = 1; 1398 *p_lb = pr->u.p.lb; 1399 *p_ub = pr->u.p.ub; 1400 last = pr->u.p.parm1; 1401 if (p_st != NULL) 1402 *p_st = pr->u.p.st; 1403 } else { /* no iterations to do */ 1404 pr->u.p.lb = pr->u.p.ub + pr->u.p.st; 1405 } 1406 } // case 1407 break; 1408 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was 1409 merged here */ 1410 case kmp_sch_static_chunked: { 1411 T parm1; 1412 1413 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d " 1414 "kmp_sch_static_[affinity|chunked] case\n", 1415 gtid)); 1416 parm1 = pr->u.p.parm1; 1417 1418 trip = pr->u.p.tc - 1; 1419 init = parm1 * (pr->u.p.count + tid); 1420 1421 if ((status = (init <= trip)) != 0) { 1422 start = pr->u.p.lb; 1423 incr = pr->u.p.st; 1424 limit = parm1 + init - 1; 1425 1426 if ((last = (limit >= trip)) != 0) 1427 limit = trip; 1428 1429 if (p_st != NULL) 1430 *p_st = incr; 1431 1432 pr->u.p.count += nproc; 1433 1434 if (incr == 1) { 1435 *p_lb = start + init; 1436 *p_ub = start + limit; 1437 } else { 1438 *p_lb = start + init * incr; 1439 *p_ub = start + limit * incr; 1440 } 1441 1442 if (pr->flags.ordered) { 1443 pr->u.p.ordered_lower = init; 1444 pr->u.p.ordered_upper = limit; 1445 } // if 1446 } // if 1447 } // case 1448 break; 1449 1450 case kmp_sch_dynamic_chunked: { 1451 T chunk = pr->u.p.parm1; 1452 1453 KD_TRACE( 1454 100, 1455 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n", 1456 gtid)); 1457 1458 init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 1459 trip = pr->u.p.tc - 1; 1460 1461 if ((status = (init <= trip)) == 0) { 1462 *p_lb = 0; 1463 *p_ub = 0; 1464 if (p_st != NULL) 1465 *p_st = 
0; 1466 } else { 1467 start = pr->u.p.lb; 1468 limit = chunk + init - 1; 1469 incr = pr->u.p.st; 1470 1471 if ((last = (limit >= trip)) != 0) 1472 limit = trip; 1473 1474 if (p_st != NULL) 1475 *p_st = incr; 1476 1477 if (incr == 1) { 1478 *p_lb = start + init; 1479 *p_ub = start + limit; 1480 } else { 1481 *p_lb = start + init * incr; 1482 *p_ub = start + limit * incr; 1483 } 1484 1485 if (pr->flags.ordered) { 1486 pr->u.p.ordered_lower = init; 1487 pr->u.p.ordered_upper = limit; 1488 } // if 1489 } // if 1490 } // case 1491 break; 1492 1493 case kmp_sch_guided_iterative_chunked: { 1494 T chunkspec = pr->u.p.parm1; 1495 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked " 1496 "iterative case\n", 1497 gtid)); 1498 trip = pr->u.p.tc; 1499 // Start atomic part of calculations 1500 while (1) { 1501 ST remaining; // signed, because can be < 0 1502 init = sh->u.s.iteration; // shared value 1503 remaining = trip - init; 1504 if (remaining <= 0) { // AC: need to compare with 0 first 1505 // nothing to do, don't try atomic op 1506 status = 0; 1507 break; 1508 } 1509 if ((T)remaining < 1510 pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default 1511 // use dynamic-style shcedule 1512 // atomically inrement iterations, get old value 1513 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1514 (ST)chunkspec); 1515 remaining = trip - init; 1516 if (remaining <= 0) { 1517 status = 0; // all iterations got by other threads 1518 } else { 1519 // got some iterations to work on 1520 status = 1; 1521 if ((T)remaining > chunkspec) { 1522 limit = init + chunkspec - 1; 1523 } else { 1524 last = 1; // the last chunk 1525 limit = init + remaining - 1; 1526 } // if 1527 } // if 1528 break; 1529 } // if 1530 limit = init + 1531 (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc 1532 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1533 (ST)init, (ST)limit)) { 1534 // CAS was successful, chunk obtained 1535 status = 1; 1536 --limit; 1537 break; 1538 } // if 1539 } // while 1540 if (status != 0) { 1541 start = pr->u.p.lb; 1542 incr = pr->u.p.st; 1543 if (p_st != NULL) 1544 *p_st = incr; 1545 *p_lb = start + init * incr; 1546 *p_ub = start + limit * incr; 1547 if (pr->flags.ordered) { 1548 pr->u.p.ordered_lower = init; 1549 pr->u.p.ordered_upper = limit; 1550 } // if 1551 } else { 1552 *p_lb = 0; 1553 *p_ub = 0; 1554 if (p_st != NULL) 1555 *p_st = 0; 1556 } // if 1557 } // case 1558 break; 1559 1560 #if OMP_45_ENABLED 1561 case kmp_sch_guided_simd: { 1562 // same as iterative but curr-chunk adjusted to be multiple of given 1563 // chunk 1564 T chunk = pr->u.p.parm1; 1565 KD_TRACE(100, 1566 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n", 1567 gtid)); 1568 trip = pr->u.p.tc; 1569 // Start atomic part of calculations 1570 while (1) { 1571 ST remaining; // signed, because can be < 0 1572 init = sh->u.s.iteration; // shared value 1573 remaining = trip - init; 1574 if (remaining <= 0) { // AC: need to compare with 0 first 1575 status = 0; // nothing to do, don't try atomic op 1576 break; 1577 } 1578 KMP_DEBUG_ASSERT(init % chunk == 0); 1579 // compare with K*nproc*(chunk+1), K=2 by default 1580 if ((T)remaining < pr->u.p.parm2) { 1581 // use dynamic-style shcedule 1582 // atomically inrement iterations, get old value 1583 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1584 (ST)chunk); 1585 remaining = trip - init; 1586 if (remaining <= 0) { 1587 status = 0; // all iterations got by other threads 1588 
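        // [Editor's illustrative sketch, not part of the runtime build] The
        // guided schedules here (kmp_sch_guided_iterative_chunked above and
        // kmp_sch_guided_simd, whose case continues below) hand out roughly
        // remaining / (K * nproc) iterations per grab, with K = 2 by default
        // (the factor stored in parm3), and fall back to plain dynamic chunks
        // of the requested size once fewer than parm2 = K * nproc * (chunk + 1)
        // iterations remain. A standalone model of the resulting chunk-size
        // decay; the constants are assumptions chosen for illustration.
#if 0
#include <cstdio>

int main() {
  const unsigned long trip = 100000; // total iterations
  const unsigned nproc = 8;          // threads
  const unsigned chunk = 4;          // schedule(guided, 4)
  const unsigned K = 2;              // assumed default decay constant
  const double factor = 1.0 / (K * (double)nproc);          // ~parm3
  const unsigned long threshold = K * nproc * (chunk + 1);  // ~parm2

  unsigned long handed_out = 0;
  while (handed_out < trip) {
    unsigned long remaining = trip - handed_out;
    unsigned long size = (remaining < threshold)
                             ? (remaining > chunk ? chunk : remaining)
                             : (unsigned long)(remaining * factor);
    std::printf("next chunk: %lu iterations (%lu remaining)\n", size,
                remaining);
    handed_out += size;
  }
  return 0;
}
#endif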
} else { 1589 // got some iterations to work on 1590 status = 1; 1591 if ((T)remaining > chunk) { 1592 limit = init + chunk - 1; 1593 } else { 1594 last = 1; // the last chunk 1595 limit = init + remaining - 1; 1596 } // if 1597 } // if 1598 break; 1599 } // if 1600 // divide by K*nproc 1601 UT span = remaining * (*(double *)&pr->u.p.parm3); 1602 UT rem = span % chunk; 1603 if (rem) // adjust so that span%chunk == 0 1604 span += chunk - rem; 1605 limit = init + span; 1606 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1607 (ST)init, (ST)limit)) { 1608 // CAS was successful, chunk obtained 1609 status = 1; 1610 --limit; 1611 break; 1612 } // if 1613 } // while 1614 if (status != 0) { 1615 start = pr->u.p.lb; 1616 incr = pr->u.p.st; 1617 if (p_st != NULL) 1618 *p_st = incr; 1619 *p_lb = start + init * incr; 1620 *p_ub = start + limit * incr; 1621 if (pr->flags.ordered) { 1622 pr->u.p.ordered_lower = init; 1623 pr->u.p.ordered_upper = limit; 1624 } // if 1625 } else { 1626 *p_lb = 0; 1627 *p_ub = 0; 1628 if (p_st != NULL) 1629 *p_st = 0; 1630 } // if 1631 } // case 1632 break; 1633 #endif // OMP_45_ENABLED 1634 1635 case kmp_sch_guided_analytical_chunked: { 1636 T chunkspec = pr->u.p.parm1; 1637 UT chunkIdx; 1638 #if KMP_USE_X87CONTROL 1639 /* for storing original FPCW value for Windows* OS on 1640 IA-32 architecture 8-byte version */ 1641 unsigned int oldFpcw; 1642 unsigned int fpcwSet = 0; 1643 #endif 1644 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d " 1645 "kmp_sch_guided_analytical_chunked case\n", 1646 gtid)); 1647 1648 trip = pr->u.p.tc; 1649 1650 KMP_DEBUG_ASSERT(nproc > 1); 1651 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip); 1652 1653 while (1) { /* this while loop is a safeguard against unexpected zero 1654 chunk sizes */ 1655 chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 1656 if (chunkIdx >= (UT)pr->u.p.parm2) { 1657 --trip; 1658 /* use dynamic-style scheduling */ 1659 init = chunkIdx * chunkspec + pr->u.p.count; 1660 /* need to verify init > 0 in case of overflow in the above 1661 * calculation */ 1662 if ((status = (init > 0 && init <= trip)) != 0) { 1663 limit = init + chunkspec - 1; 1664 1665 if ((last = (limit >= trip)) != 0) 1666 limit = trip; 1667 } 1668 break; 1669 } else { 1670 /* use exponential-style scheduling */ 1671 /* The following check is to workaround the lack of long double precision on 1672 Windows* OS. 1673 This check works around the possible effect that init != 0 for chunkIdx == 0. 
1674 */ 1675 #if KMP_USE_X87CONTROL 1676 /* If we haven't already done so, save original 1677 FPCW and set precision to 64-bit, as Windows* OS 1678 on IA-32 architecture defaults to 53-bit */ 1679 if (!fpcwSet) { 1680 oldFpcw = _control87(0, 0); 1681 _control87(_PC_64, _MCW_PC); 1682 fpcwSet = 0x30000; 1683 } 1684 #endif 1685 if (chunkIdx) { 1686 init = __kmp_dispatch_guided_remaining<T>( 1687 trip, *(DBL *)&pr->u.p.parm3, chunkIdx); 1688 KMP_DEBUG_ASSERT(init); 1689 init = trip - init; 1690 } else 1691 init = 0; 1692 limit = trip - __kmp_dispatch_guided_remaining<T>( 1693 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1); 1694 KMP_ASSERT(init <= limit); 1695 if (init < limit) { 1696 KMP_DEBUG_ASSERT(limit <= trip); 1697 --limit; 1698 status = 1; 1699 break; 1700 } // if 1701 } // if 1702 } // while (1) 1703 #if KMP_USE_X87CONTROL 1704 /* restore FPCW if necessary 1705 AC: check fpcwSet flag first because oldFpcw can be uninitialized here 1706 */ 1707 if (fpcwSet && (oldFpcw & fpcwSet)) 1708 _control87(oldFpcw, _MCW_PC); 1709 #endif 1710 if (status != 0) { 1711 start = pr->u.p.lb; 1712 incr = pr->u.p.st; 1713 if (p_st != NULL) 1714 *p_st = incr; 1715 *p_lb = start + init * incr; 1716 *p_ub = start + limit * incr; 1717 if (pr->flags.ordered) { 1718 pr->u.p.ordered_lower = init; 1719 pr->u.p.ordered_upper = limit; 1720 } 1721 } else { 1722 *p_lb = 0; 1723 *p_ub = 0; 1724 if (p_st != NULL) 1725 *p_st = 0; 1726 } 1727 } // case 1728 break; 1729 1730 case kmp_sch_trapezoidal: { 1731 UT index; 1732 T parm2 = pr->u.p.parm2; 1733 T parm3 = pr->u.p.parm3; 1734 T parm4 = pr->u.p.parm4; 1735 KD_TRACE(100, 1736 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n", 1737 gtid)); 1738 1739 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration); 1740 1741 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2; 1742 trip = pr->u.p.tc - 1; 1743 1744 if ((status = ((T)index < parm3 && init <= trip)) == 0) { 1745 *p_lb = 0; 1746 *p_ub = 0; 1747 if (p_st != NULL) 1748 *p_st = 0; 1749 } else { 1750 start = pr->u.p.lb; 1751 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1; 1752 incr = pr->u.p.st; 1753 1754 if ((last = (limit >= trip)) != 0) 1755 limit = trip; 1756 1757 if (p_st != NULL) 1758 *p_st = incr; 1759 1760 if (incr == 1) { 1761 *p_lb = start + init; 1762 *p_ub = start + limit; 1763 } else { 1764 *p_lb = start + init * incr; 1765 *p_ub = start + limit * incr; 1766 } 1767 1768 if (pr->flags.ordered) { 1769 pr->u.p.ordered_lower = init; 1770 pr->u.p.ordered_upper = limit; 1771 } // if 1772 } // if 1773 } // case 1774 break; 1775 default: { 1776 status = 0; // to avoid complaints on uninitialized variable use 1777 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 1778 KMP_HNT(GetNewerLibrary), // Hint 1779 __kmp_msg_null // Variadic argument list terminator 1780 ); 1781 } break; 1782 } // switch 1783 if (p_last) 1784 *p_last = last; 1785 #ifdef KMP_DEBUG 1786 if (pr->flags.ordered) { 1787 char *buff; 1788 // create format specifiers before the debug output 1789 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d " 1790 "ordered_lower:%%%s ordered_upper:%%%s\n", 1791 traits_t<UT>::spec, traits_t<UT>::spec); 1792 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper)); 1793 __kmp_str_free(&buff); 1794 } 1795 { 1796 char *buff; 1797 // create format specifiers before the debug output 1798 buff = __kmp_str_format( 1799 "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d " 1800 "p_lb:%%%s p_ub:%%%s p_st:%%%s\n", 1801 
  default: {
    status = 0; // to avoid complaints on uninitialized variable use
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                KMP_HNT(GetNewerLibrary), // Hint
                __kmp_msg_null // Variadic argument list terminator
    );
  } break;
  } // switch
  if (p_last)
    *p_last = last;
#ifdef KMP_DEBUG
  if (pr->flags.ordered) {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
                            "ordered_lower:%%%s ordered_upper:%%%s\n",
                            traits_t<UT>::spec, traits_t<UT>::spec);
    KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
    __kmp_str_free(&buff);
  }
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
        "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
    __kmp_str_free(&buff);
  }
#endif
  return status;
}

/* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
   work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
   is not called. */
#if OMPT_SUPPORT && OMPT_OPTIONAL
#define OMPT_LOOP_END                                                          \
  if (status == 0) {                                                           \
    if (ompt_enabled.ompt_callback_work) {                                     \
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
      ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
          ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
          &(task_info->task_data), 0, codeptr);                                \
    }                                                                          \
  }
// TODO: implement count
#else
#define OMPT_LOOP_END // no-op
#endif
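
/* Example (illustrative only, not part of this file): a minimal first-party
   OMPT tool that would receive the ompt_callback_work event emitted by
   OMPT_LOOP_END above. The signatures follow the OpenMP 5.0 <omp-tools.h>
   interface; the tool-side names (my_work_cb, my_init, my_fini) are invented
   for the sketch.

     #include <omp-tools.h>
     #include <stdio.h>

     // Fired at the begin/end of worksharing constructs; OMPT_LOOP_END above
     // invokes it with (ompt_work_loop, ompt_scope_end, ...).
     static void my_work_cb(ompt_work_t wstype, ompt_scope_endpoint_t endpoint,
                            ompt_data_t *parallel_data, ompt_data_t *task_data,
                            uint64_t count, const void *codeptr_ra) {
       if (wstype == ompt_work_loop && endpoint == ompt_scope_end)
         printf("loop finished near %p\n", codeptr_ra);
     }

     static int my_init(ompt_function_lookup_t lookup, int initial_device_num,
                        ompt_data_t *tool_data) {
       ompt_set_callback_t set_cb =
           (ompt_set_callback_t)lookup("ompt_set_callback");
       set_cb(ompt_callback_work, (ompt_callback_t)my_work_cb);
       return 1; // nonzero keeps the tool active
     }

     static void my_fini(ompt_data_t *tool_data) {}

     ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
                                               const char *runtime_version) {
       static ompt_start_tool_result_t result = {&my_init, &my_fini, {0}};
       return &result;
     }
*/
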
#if KMP_STATS_ENABLED
#define KMP_STATS_LOOP_END                                                     \
  {                                                                            \
    kmp_int64 u, l, t, i;                                                      \
    l = (kmp_int64)(*p_lb);                                                    \
    u = (kmp_int64)(*p_ub);                                                    \
    i = (kmp_int64)(pr->u.p.st);                                               \
    if (status == 0) {                                                         \
      t = 0;                                                                   \
      KMP_POP_PARTITIONED_TIMER();                                             \
    } else if (i == 1) {                                                       \
      if (u >= l)                                                              \
        t = u - l + 1;                                                         \
      else                                                                     \
        t = 0;                                                                 \
    } else if (i < 0) {                                                        \
      if (l >= u)                                                              \
        t = (l - u) / (-i) + 1;                                                \
      else                                                                     \
        t = 0;                                                                 \
    } else {                                                                   \
      if (u >= l)                                                              \
        t = (u - l) / i + 1;                                                   \
      else                                                                     \
        t = 0;                                                                 \
    }                                                                          \
    KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
  }
#else
#define KMP_STATS_LOOP_END /* Nothing */
#endif
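
/* Worked example of the trip-count arithmetic above (numbers invented for
   illustration): for a returned chunk with *p_lb == 12, *p_ub == 24 and a
   stride of 3, the macro counts t = (24 - 12) / 3 + 1 = 5 iterations
   (12, 15, 18, 21, 24); for a negative stride, e.g. lb 20, ub 11, st -3,
   it counts t = (20 - 11) / 3 + 1 = 4 iterations (20, 17, 14, 11). */
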
template <typename T>
static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
                               T *p_lb, T *p_ub,
                               typename traits_t<T>::signed_t *p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                               ,
                               void *codeptr
#endif
                               ) {

  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  // This is potentially slightly misleading: schedule(runtime) will appear
  // here even if the actual runtime schedule is static. (Which points out a
  // disadvantage of schedule(runtime): even when static scheduling is used it
  // costs more than a compile time choice to use static scheduling would.)
  KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);

  int status;
  dispatch_private_info_template<T> *pr;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
  KD_TRACE(
      1000,
      ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
       gtid, p_lb, p_ub, p_st, p_last));

  if (team->t.t_serialized) {
    /* NOTE: serialize this dispatch because we are not at the active level */
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
    KMP_DEBUG_ASSERT(pr);

    if ((status = (pr->u.p.tc != 0)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      // if ( p_last != NULL )
      //   *p_last = 0;
      if (p_st != NULL)
        *p_st = 0;
      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        }
      }
    } else if (pr->flags.nomerge) {
      kmp_int32 last;
      T start;
      UT limit, trip, init;
      ST incr;
      T chunk = pr->u.p.parm1;

      KD_TRACE(100,
               ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
                gtid));

      init = chunk * pr->u.p.count++;
      trip = pr->u.p.tc - 1;

      if ((status = (init <= trip)) == 0) {
        *p_lb = 0;
        *p_ub = 0;
        // if ( p_last != NULL )
        //   *p_last = 0;
        if (p_st != NULL)
          *p_st = 0;
        if (__kmp_env_consistency_check) {
          if (pr->pushed_ws != ct_none) {
            pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
          }
        }
      } else {
        start = pr->u.p.lb;
        limit = chunk + init - 1;
        incr = pr->u.p.st;

        if ((last = (limit >= trip)) != 0) {
          limit = trip;
#if KMP_OS_WINDOWS
          pr->u.p.last_upper = pr->u.p.ub;
#endif /* KMP_OS_WINDOWS */
        }
        if (p_last != NULL)
          *p_last = last;
        if (p_st != NULL)
          *p_st = incr;
        if (incr == 1) {
          *p_lb = start + init;
          *p_ub = start + limit;
        } else {
          *p_lb = start + init * incr;
          *p_ub = start + limit * incr;
        }

        if (pr->flags.ordered) {
          pr->u.p.ordered_lower = init;
          pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
          {
            char *buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
                                    "ordered_lower:%%%s ordered_upper:%%%s\n",
                                    traits_t<UT>::spec, traits_t<UT>::spec);
            KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
                            pr->u.p.ordered_upper));
            __kmp_str_free(&buff);
          }
#endif
        } // if
      } // if
    } else {
      pr->u.p.tc = 0;
      *p_lb = pr->u.p.lb;
      *p_ub = pr->u.p.ub;
#if KMP_OS_WINDOWS
      pr->u.p.last_upper = *p_ub;
#endif /* KMP_OS_WINDOWS */
      if (p_last != NULL)
        *p_last = TRUE;
      if (p_st != NULL)
        *p_st = pr->u.p.st;
    } // if
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format(
          "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
          "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
          traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
      KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
      __kmp_str_free(&buff);
    }
#endif
#if INCLUDE_SSC_MARKS
    SSC_MARK_DISPATCH_NEXT();
#endif
    OMPT_LOOP_END;
    KMP_STATS_LOOP_END;
    return status;
  } else {
    kmp_int32 last = 0;
    dispatch_shared_info_template<T> volatile *sh;

    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    KMP_DEBUG_ASSERT(pr);
    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
        th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(sh);

#if KMP_USE_HIER_SCHED
    if (pr->flags.use_hier)
      status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
    else
#endif // KMP_USE_HIER_SCHED
      status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
                                                p_st, th->th.th_team_nproc,
                                                th->th.th_info.ds.ds_tid);
    // status == 0: no more iterations to execute
    if (status == 0) {
      UT num_done;

      num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
            traits_t<UT>::spec);
        KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
        __kmp_str_free(&buff);
      }
#endif

#if KMP_USE_HIER_SCHED
      pr->flags.use_hier = FALSE;
#endif
      if ((ST)num_done == th->th.th_team_nproc - 1) {
#if (KMP_STATIC_STEAL_ENABLED)
        if (pr->schedule == kmp_sch_static_steal &&
            traits_t<T>::type_size > 4) {
          int i;
          kmp_info_t **other_threads = team->t.t_threads;
          // loop complete, safe to destroy locks used for stealing
          for (i = 0; i < th->th.th_team_nproc; ++i) {
            kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
            KMP_ASSERT(lck != NULL);
            __kmp_destroy_lock(lck);
            __kmp_free(lck);
            other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
          }
        }
#endif
        /* NOTE: release this buffer to be reused */

        KMP_MB(); /* Flush all pending memory write invalidates. */

        sh->u.s.num_done = 0;
        sh->u.s.iteration = 0;

        /* TODO replace with general release procedure? */
        if (pr->flags.ordered) {
          sh->u.s.ordered_iteration = 0;
        }

        KMP_MB(); /* Flush all pending memory write invalidates. */

        sh->buffer_index += __kmp_dispatch_num_buffers;
        KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
                       gtid, sh->buffer_index));

        KMP_MB(); /* Flush all pending memory write invalidates. */

      } // if
      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        }
      }

      th->th.th_dispatch->th_deo_fcn = NULL;
      th->th.th_dispatch->th_dxo_fcn = NULL;
      th->th.th_dispatch->th_dispatch_sh_current = NULL;
      th->th.th_dispatch->th_dispatch_pr_current = NULL;
    } // if (status == 0)
#if KMP_OS_WINDOWS
    else if (last) {
      pr->u.p.last_upper = pr->u.p.ub;
    }
#endif /* KMP_OS_WINDOWS */
    if (p_last != NULL && status != 0)
      *p_last = last;
  } // if

#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_next: T#%%d normal case: "
        "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
                  (p_last ? *p_last : 0), status));
    __kmp_str_free(&buff);
  }
#endif
#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_NEXT();
#endif
  OMPT_LOOP_END;
  KMP_STATS_LOOP_END;
  return status;
}
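
/* Note on the end-of-loop handshake in the non-serialized branch above
   (numbers are only an illustration): test_then_inc returns the previous
   value of sh->u.s.num_done, so with a team of 4 threads the finishers
   observe 0, 1, 2 and 3 in some order, and only the thread that sees
   th_team_nproc - 1 == 3 (the last one to run out of work) resets the shared
   counters and advances sh->buffer_index so the buffer slot can be reused by
   a later dynamically scheduled loop. */
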
template <typename T>
static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
                                  kmp_int32 *plastiter, T *plower, T *pupper,
                                  typename traits_t<T>::signed_t incr) {
  typedef typename traits_t<T>::unsigned_t UT;
  kmp_uint32 team_id;
  kmp_uint32 nteams;
  UT trip_count;
  kmp_team_t *team;
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
                            "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<T>::spec);
    KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
    __kmp_str_free(&buff);
  }
#endif

  if (__kmp_env_consistency_check) {
    if (incr == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
                            loc);
    }
    if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
      // The loop is illegal.
      // Some zero-trip loops maintained by compiler, e.g.:
      //   for(i=10;i<0;++i) // lower >= upper - run-time check
      //   for(i=0;i>10;--i) // lower <= upper - run-time check
      //   for(i=0;i>10;++i) // incr > 0     - compile-time check
      //   for(i=10;i<0;--i) // incr < 0     - compile-time check
      // Compiler does not check the following illegal loops:
      //   for(i=0;i<10;i+=incr) // where incr<0
      //   for(i=10;i>0;i-=incr) // where incr<0
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
    }
  }
  th = __kmp_threads[gtid];
  team = th->th.th_team;
#if OMP_40_ENABLED
  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
  nteams = th->th.th_teams_size.nteams;
#endif
  team_id = team->t.t_master_tid;
  KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);

  // compute global trip count
  if (incr == 1) {
    trip_count = *pupper - *plower + 1;
  } else if (incr == -1) {
    trip_count = *plower - *pupper + 1;
  } else if (incr > 0) {
    // upper-lower can exceed the limit of signed type
    trip_count = (UT)(*pupper - *plower) / incr + 1;
  } else {
    trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
  }

  if (trip_count <= nteams) {
    KMP_DEBUG_ASSERT(
        __kmp_static == kmp_sch_static_greedy ||
        __kmp_static ==
            kmp_sch_static_balanced); // Unknown static scheduling type.
    // only some teams get single iteration, others get nothing
    if (team_id < trip_count) {
      *pupper = *plower = *plower + team_id * incr;
    } else {
      *plower = *pupper + incr; // zero-trip loop
    }
    if (plastiter != NULL)
      *plastiter = (team_id == trip_count - 1);
  } else {
    if (__kmp_static == kmp_sch_static_balanced) {
      UT chunk = trip_count / nteams;
      UT extras = trip_count % nteams;
      *plower +=
          incr * (team_id * chunk + (team_id < extras ? team_id : extras));
      *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
      if (plastiter != NULL)
        *plastiter = (team_id == nteams - 1);
    } else {
      T chunk_inc_count =
          (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
      T upper = *pupper;
      KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
      // Unknown static scheduling type.
      *plower += team_id * chunk_inc_count;
      *pupper = *plower + chunk_inc_count - incr;
      // Check/correct bounds if needed
      if (incr > 0) {
        if (*pupper < *plower)
          *pupper = traits_t<T>::max_value;
        if (plastiter != NULL)
          *plastiter = *plower <= upper && *pupper > upper - incr;
        if (*pupper > upper)
          *pupper = upper; // tracker C73258
      } else {
        if (*pupper > *plower)
          *pupper = traits_t<T>::min_value;
        if (plastiter != NULL)
          *plastiter = *plower >= upper && *pupper < upper - incr;
        if (*pupper < upper)
          *pupper = upper; // tracker C73258
      }
    }
  }
}
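
/* Worked example of the balanced per-team partitioning above (values invented
   for illustration): with trip_count = 10, nteams = 3 and incr = 1, we get
   chunk = 3 and extras = 1, so team 0 receives 4 iterations and teams 1 and 2
   receive 3 each (bounds shown relative to the original *plower):
       team 0: lower += 0,            upper = lower + 3      -> iterations 0..3
       team 1: lower += 1*3 + 1 = 4,  upper = lower + 3 - 1  -> iterations 4..6
       team 2: lower += 2*3 + 1 = 7,  upper = lower + 3 - 1  -> iterations 7..9 */
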
//-----------------------------------------------------------------------------
// Dispatch routines
// Transfer call to template< type T >
// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
//                      T lb, T ub, ST st, ST chunk )
extern "C" {

/*!
@ingroup WORK_SHARING
@{
@param loc Source location
@param gtid Global thread id
@param schedule Schedule type
@param lb  Lower bound
@param ub  Upper bound
@param st  Step (or increment if you prefer)
@param chunk The chunk size to block with

This function prepares the runtime to start a dynamically scheduled for loop,
saving the loop arguments.
These functions are all identical apart from the types of the arguments.
*/
void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int32 lb,
                            kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
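
/* Illustration (not part of this file) of how a compiler typically drives the
   __kmpc_dispatch_* entry points for a worksharing loop such as
   "#pragma omp for schedule(dynamic, 4)". The chunk loop shown is only a
   sketch of the generated code; kmp_sch_dynamic_chunked stands for the
   schedule value the compiler would encode, and body() is a placeholder:

     kmp_int32 lb, ub, st, last;
     __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked,
                            0, n - 1, 1, 4);
     while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
       for (kmp_int32 i = lb; i <= ub; i += st)
         body(i);
     }

   __kmpc_dispatch_next_4 hands out chunks until it returns zero, at which
   point the runtime has already released the dispatch buffer for this loop. */
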
/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint32 lb,
                             kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int64 lb,
                            kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint64 lb,
                             kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4

These functions differ from the __kmpc_dispatch_init set in that they are
called for the composite distribute parallel for construct. Thus, before
dispatching the regular iterations, the per-team iteration space has to be
computed.

These functions are all identical apart from the types of the arguments.
*/
void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
                                 kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
                                  kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
                                 kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
                                  kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
@param loc Source code location
@param gtid Global thread id
@param p_last Pointer to a flag set to one if this is the last chunk or zero
otherwise
@param p_lb Pointer to the lower bound for the next chunk of work
@param p_ub Pointer to the upper bound for the next chunk of work
@param p_st Pointer to the stride for the next chunk of work
@return one if there is work to be done, zero otherwise

Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then the lb, ub and stride need not be modified.
*/
int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                        ,
                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint32 *p_lb, kmp_uint32 *p_ub,
                            kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                         ,
                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                        ,
                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint64 *p_lb, kmp_uint64 *p_ub,
                            kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                         ,
                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}

/*!
@param loc Source code location
@param gtid Global thread id

Mark the end of a dynamic loop.
*/
void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}
/*! @} */

//-----------------------------------------------------------------------------
// Non-template routines from kmp_dispatch.cpp used in other sources

kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value == checker;
}

kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value != checker;
}

kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
  return value < checker;
}

kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
  return value >= checker;
}

kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
  return value <= checker;
}

kmp_uint32
__kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
             kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
             void *obj // Higher-level synchronization object, or NULL.
             ) {
  // note: we may not belong to a team at this point
  volatile kmp_uint32 *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
  kmp_uint32 r;

  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(r = TCR_4(*spin), check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* GEH - remove this since it was accidentally introduced when kmp_wait
       was split. It causes problems with infinite recursion because of exit
       lock */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
        __kmp_abort_thread(); */
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}
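
/* Usage note for __kmp_wait_4 above (the call is shown only as an
   illustration): the predicate receives the freshly read value of *spinner
   and the checker, and the wait ends as soon as the predicate is true,
   returning the value that satisfied it. For example,
       kmp_uint32 seen = __kmp_wait_4(&flag, 1, __kmp_ge_4, NULL);
   spins until flag >= 1 and returns the observed value of flag. */
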
void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
                      kmp_uint32 (*pred)(void *, kmp_uint32),
                      void *obj // Higher-level synchronization object, or NULL.
                      ) {
  // note: we may not belong to a team at this point
  void *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(void *, kmp_uint32) = pred;

  KMP_FSYNC_SPIN_INIT(obj, spin);
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* if we have waited a bit, or are oversubscribed, yield */
    /* pause is in the following code */
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
}

} // extern "C"

#ifdef KMP_GOMP_COMPAT

void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int32 lb,
                               kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint32 lb,
                                kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int64 lb,
                               kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint64 lb,
                                kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

#endif /* KMP_GOMP_COMPAT */

/* ------------------------------------------------------------------------ */