1 /* 2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch. 3 */ 4 5 //===----------------------------------------------------------------------===// 6 // 7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 8 // See https://llvm.org/LICENSE.txt for license information. 9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 10 // 11 //===----------------------------------------------------------------------===// 12 13 /* Dynamic scheduling initialization and dispatch. 14 * 15 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however 16 * it may change values between parallel regions. __kmp_max_nth 17 * is the largest value __kmp_nth may take, 1 is the smallest. 18 */ 19 20 #include "kmp.h" 21 #include "kmp_error.h" 22 #include "kmp_i18n.h" 23 #include "kmp_itt.h" 24 #include "kmp_stats.h" 25 #include "kmp_str.h" 26 #if KMP_USE_X87CONTROL 27 #include <float.h> 28 #endif 29 #include "kmp_lock.h" 30 #include "kmp_dispatch.h" 31 #if KMP_USE_HIER_SCHED 32 #include "kmp_dispatch_hier.h" 33 #endif 34 35 #if OMPT_SUPPORT 36 #include "ompt-specific.h" 37 #endif 38 39 /* ------------------------------------------------------------------------ */ 40 /* ------------------------------------------------------------------------ */ 41 42 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 43 kmp_info_t *th; 44 45 KMP_DEBUG_ASSERT(gtid_ref); 46 47 if (__kmp_env_consistency_check) { 48 th = __kmp_threads[*gtid_ref]; 49 if (th->th.th_root->r.r_active && 50 (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) { 51 #if KMP_USE_DYNAMIC_LOCK 52 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0); 53 #else 54 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL); 55 #endif 56 } 57 } 58 } 59 60 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 61 kmp_info_t *th; 62 63 if (__kmp_env_consistency_check) { 64 th = __kmp_threads[*gtid_ref]; 65 if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) { 66 __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref); 67 } 68 } 69 } 70 71 // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC 72 static inline int __kmp_get_monotonicity(enum sched_type schedule, 73 bool use_hier = false) { 74 // Pick up the nonmonotonic/monotonic bits from the scheduling type 75 int monotonicity; 76 // default to monotonic 77 monotonicity = SCHEDULE_MONOTONIC; 78 if (SCHEDULE_HAS_NONMONOTONIC(schedule)) 79 monotonicity = SCHEDULE_NONMONOTONIC; 80 else if (SCHEDULE_HAS_MONOTONIC(schedule)) 81 monotonicity = SCHEDULE_MONOTONIC; 82 return monotonicity; 83 } 84 85 // Initialize a dispatch_private_info_template<T> buffer for a particular 86 // type of schedule,chunk. The loop description is found in lb (lower bound), 87 // ub (upper bound), and st (stride). nproc is the number of threads relevant 88 // to the scheduling (often the number of threads in a team, but not always if 89 // hierarchical scheduling is used). tid is the id of the thread calling 90 // the function within the group of nproc threads. It will have a value 91 // between 0 and nproc - 1. This is often just the thread id within a team, but 92 // is not necessarily the case when using hierarchical scheduling. 
93 // loc is the source file location of the corresponding loop 94 // gtid is the global thread id 95 template <typename T> 96 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, 97 dispatch_private_info_template<T> *pr, 98 enum sched_type schedule, T lb, T ub, 99 typename traits_t<T>::signed_t st, 100 #if USE_ITT_BUILD 101 kmp_uint64 *cur_chunk, 102 #endif 103 typename traits_t<T>::signed_t chunk, 104 T nproc, T tid) { 105 typedef typename traits_t<T>::unsigned_t UT; 106 typedef typename traits_t<T>::floating_t DBL; 107 108 int active; 109 T tc; 110 kmp_info_t *th; 111 kmp_team_t *team; 112 int monotonicity; 113 bool use_hier; 114 115 #ifdef KMP_DEBUG 116 typedef typename traits_t<T>::signed_t ST; 117 { 118 char *buff; 119 // create format specifiers before the debug output 120 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called " 121 "pr:%%p lb:%%%s ub:%%%s st:%%%s " 122 "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n", 123 traits_t<T>::spec, traits_t<T>::spec, 124 traits_t<ST>::spec, traits_t<ST>::spec, 125 traits_t<T>::spec, traits_t<T>::spec); 126 KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid)); 127 __kmp_str_free(&buff); 128 } 129 #endif 130 /* setup data */ 131 th = __kmp_threads[gtid]; 132 team = th->th.th_team; 133 active = !team->t.t_serialized; 134 135 #if USE_ITT_BUILD 136 int itt_need_metadata_reporting = __itt_metadata_add_ptr && 137 __kmp_forkjoin_frames_mode == 3 && 138 KMP_MASTER_GTID(gtid) && 139 #if OMP_40_ENABLED 140 th->th.th_teams_microtask == NULL && 141 #endif 142 team->t.t_active_level == 1; 143 #endif 144 145 #if KMP_USE_HIER_SCHED 146 use_hier = pr->flags.use_hier; 147 #else 148 use_hier = false; 149 #endif 150 151 /* Pick up the nonmonotonic/monotonic bits from the scheduling type */ 152 monotonicity = __kmp_get_monotonicity(schedule, use_hier); 153 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 154 155 /* Pick up the nomerge/ordered bits from the scheduling type */ 156 if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) { 157 pr->flags.nomerge = TRUE; 158 schedule = 159 (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); 160 } else { 161 pr->flags.nomerge = FALSE; 162 } 163 pr->type_size = traits_t<T>::type_size; // remember the size of variables 164 if (kmp_ord_lower & schedule) { 165 pr->flags.ordered = TRUE; 166 schedule = 167 (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); 168 } else { 169 pr->flags.ordered = FALSE; 170 } 171 // Ordered overrides nonmonotonic 172 if (pr->flags.ordered) { 173 monotonicity = SCHEDULE_MONOTONIC; 174 } 175 176 if (schedule == kmp_sch_static) { 177 schedule = __kmp_static; 178 } else { 179 if (schedule == kmp_sch_runtime) { 180 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if 181 // not specified) 182 schedule = team->t.t_sched.r_sched_type; 183 monotonicity = __kmp_get_monotonicity(schedule, use_hier); 184 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 185 // Detail the schedule if needed (global controls are differentiated 186 // appropriately) 187 if (schedule == kmp_sch_guided_chunked) { 188 schedule = __kmp_guided; 189 } else if (schedule == kmp_sch_static) { 190 schedule = __kmp_static; 191 } 192 // Use the chunk size specified by OMP_SCHEDULE (or default if not 193 // specified) 194 chunk = team->t.t_sched.chunk; 195 #if USE_ITT_BUILD 196 if (cur_chunk) 197 *cur_chunk = chunk; 198 #endif 199 #ifdef KMP_DEBUG 200 { 201 char *buff; 202 // create format specifiers before the debug output 203 buff = 
__kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: " 204 "schedule:%%d chunk:%%%s\n", 205 traits_t<ST>::spec); 206 KD_TRACE(10, (buff, gtid, schedule, chunk)); 207 __kmp_str_free(&buff); 208 } 209 #endif 210 } else { 211 if (schedule == kmp_sch_guided_chunked) { 212 schedule = __kmp_guided; 213 } 214 if (chunk <= 0) { 215 chunk = KMP_DEFAULT_CHUNK; 216 } 217 } 218 219 if (schedule == kmp_sch_auto) { 220 // mapping and differentiation: in the __kmp_do_serial_initialize() 221 schedule = __kmp_auto; 222 #ifdef KMP_DEBUG 223 { 224 char *buff; 225 // create format specifiers before the debug output 226 buff = __kmp_str_format( 227 "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: " 228 "schedule:%%d chunk:%%%s\n", 229 traits_t<ST>::spec); 230 KD_TRACE(10, (buff, gtid, schedule, chunk)); 231 __kmp_str_free(&buff); 232 } 233 #endif 234 } 235 #if KMP_STATIC_STEAL_ENABLED 236 // map nonmonotonic:dynamic to static steal 237 if (schedule == kmp_sch_dynamic_chunked) { 238 if (monotonicity == SCHEDULE_NONMONOTONIC) 239 schedule = kmp_sch_static_steal; 240 } 241 #endif 242 /* guided analytical not safe for too many threads */ 243 if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) { 244 schedule = kmp_sch_guided_iterative_chunked; 245 KMP_WARNING(DispatchManyThreads); 246 } 247 #if OMP_45_ENABLED 248 if (schedule == kmp_sch_runtime_simd) { 249 // compiler provides simd_width in the chunk parameter 250 schedule = team->t.t_sched.r_sched_type; 251 monotonicity = __kmp_get_monotonicity(schedule, use_hier); 252 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 253 // Detail the schedule if needed (global controls are differentiated 254 // appropriately) 255 if (schedule == kmp_sch_static || schedule == kmp_sch_auto || 256 schedule == __kmp_static) { 257 schedule = kmp_sch_static_balanced_chunked; 258 } else { 259 if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) { 260 schedule = kmp_sch_guided_simd; 261 } 262 chunk = team->t.t_sched.chunk * chunk; 263 } 264 #if USE_ITT_BUILD 265 if (cur_chunk) 266 *cur_chunk = chunk; 267 #endif 268 #ifdef KMP_DEBUG 269 { 270 char *buff; 271 // create format specifiers before the debug output 272 buff = __kmp_str_format( 273 "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d" 274 " chunk:%%%s\n", 275 traits_t<ST>::spec); 276 KD_TRACE(10, (buff, gtid, schedule, chunk)); 277 __kmp_str_free(&buff); 278 } 279 #endif 280 } 281 #endif // OMP_45_ENABLED 282 pr->u.p.parm1 = chunk; 283 } 284 KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper), 285 "unknown scheduling type"); 286 287 pr->u.p.count = 0; 288 289 if (__kmp_env_consistency_check) { 290 if (st == 0) { 291 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, 292 (pr->flags.ordered ? 
ct_pdo_ordered : ct_pdo), loc); 293 } 294 } 295 // compute trip count 296 if (st == 1) { // most common case 297 if (ub >= lb) { 298 tc = ub - lb + 1; 299 } else { // ub < lb 300 tc = 0; // zero-trip 301 } 302 } else if (st < 0) { 303 if (lb >= ub) { 304 // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B), 305 // where the division needs to be unsigned regardless of the result type 306 tc = (UT)(lb - ub) / (-st) + 1; 307 } else { // lb < ub 308 tc = 0; // zero-trip 309 } 310 } else { // st > 0 311 if (ub >= lb) { 312 // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B), 313 // where the division needs to be unsigned regardless of the result type 314 tc = (UT)(ub - lb) / st + 1; 315 } else { // ub < lb 316 tc = 0; // zero-trip 317 } 318 } 319 320 #if KMP_STATS_ENABLED 321 if (KMP_MASTER_GTID(gtid)) { 322 KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc); 323 } 324 #endif 325 326 pr->u.p.lb = lb; 327 pr->u.p.ub = ub; 328 pr->u.p.st = st; 329 pr->u.p.tc = tc; 330 331 #if KMP_OS_WINDOWS 332 pr->u.p.last_upper = ub + st; 333 #endif /* KMP_OS_WINDOWS */ 334 335 /* NOTE: only the active parallel region(s) has active ordered sections */ 336 337 if (active) { 338 if (pr->flags.ordered) { 339 pr->ordered_bumped = 0; 340 pr->u.p.ordered_lower = 1; 341 pr->u.p.ordered_upper = 0; 342 } 343 } 344 345 switch (schedule) { 346 #if (KMP_STATIC_STEAL_ENABLED) 347 case kmp_sch_static_steal: { 348 T ntc, init; 349 350 KD_TRACE(100, 351 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n", 352 gtid)); 353 354 ntc = (tc % chunk ? 1 : 0) + tc / chunk; 355 if (nproc > 1 && ntc >= nproc) { 356 KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL); 357 T id = tid; 358 T small_chunk, extras; 359 360 small_chunk = ntc / nproc; 361 extras = ntc % nproc; 362 363 init = id * small_chunk + (id < extras ? id : extras); 364 pr->u.p.count = init; 365 pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0); 366 367 pr->u.p.parm2 = lb; 368 // parm3 is the number of times to attempt stealing which is 369 // proportional to the number of chunks per thread up until 370 // the maximum value of nproc. 371 pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc); 372 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid 373 pr->u.p.st = st; 374 if (traits_t<T>::type_size > 4) { 375 // AC: TODO: check if 16-byte CAS available and use it to 376 // improve performance (probably wait for explicit request 377 // before spending time on this). 378 // For now use dynamically allocated per-thread lock, 379 // free memory in __kmp_dispatch_next when status==0. 
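        // Worked example (illustrative numbers only, not from the original
        // source): with tc = 100 iterations, chunk = 7 and nproc = 4 threads,
        // ntc = 15 chunks, small_chunk = 3 and extras = 3, so the initial
        // ranges of chunk indices [count, ub) are
        //   tid 0: [0, 4)   tid 1: [4, 8)   tid 2: [8, 12)   tid 3: [12, 15)
        // and parm3 = KMP_MIN(small_chunk + extras, nproc) = 4 bounds the
        // number of steal attempts per call to __kmp_dispatch_next_algorithm.
        // For induction variables wider than 4 bytes, the per-thread lock
        // allocated just below protects each (count, ub) pair while stealing.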
380 KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL); 381 th->th.th_dispatch->th_steal_lock = 382 (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t)); 383 __kmp_init_lock(th->th.th_dispatch->th_steal_lock); 384 } 385 break; 386 } else { 387 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 388 "kmp_sch_static_balanced\n", 389 gtid)); 390 schedule = kmp_sch_static_balanced; 391 /* too few iterations: fall-through to kmp_sch_static_balanced */ 392 } // if 393 /* FALL-THROUGH to static balanced */ 394 KMP_FALLTHROUGH(); 395 } // case 396 #endif 397 case kmp_sch_static_balanced: { 398 T init, limit; 399 400 KD_TRACE( 401 100, 402 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n", 403 gtid)); 404 405 if (nproc > 1) { 406 T id = tid; 407 408 if (tc < nproc) { 409 if (id < tc) { 410 init = id; 411 limit = id; 412 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ 413 } else { 414 pr->u.p.count = 1; /* means no more chunks to execute */ 415 pr->u.p.parm1 = FALSE; 416 break; 417 } 418 } else { 419 T small_chunk = tc / nproc; 420 T extras = tc % nproc; 421 init = id * small_chunk + (id < extras ? id : extras); 422 limit = init + small_chunk - (id < extras ? 0 : 1); 423 pr->u.p.parm1 = (id == nproc - 1); 424 } 425 } else { 426 if (tc > 0) { 427 init = 0; 428 limit = tc - 1; 429 pr->u.p.parm1 = TRUE; 430 } else { 431 // zero trip count 432 pr->u.p.count = 1; /* means no more chunks to execute */ 433 pr->u.p.parm1 = FALSE; 434 break; 435 } 436 } 437 #if USE_ITT_BUILD 438 // Calculate chunk for metadata report 439 if (itt_need_metadata_reporting) 440 if (cur_chunk) 441 *cur_chunk = limit - init + 1; 442 #endif 443 if (st == 1) { 444 pr->u.p.lb = lb + init; 445 pr->u.p.ub = lb + limit; 446 } else { 447 // calculated upper bound, "ub" is user-defined upper bound 448 T ub_tmp = lb + limit * st; 449 pr->u.p.lb = lb + init * st; 450 // adjust upper bound to "ub" if needed, so that MS lastprivate will match 451 // it exactly 452 if (st > 0) { 453 pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp); 454 } else { 455 pr->u.p.ub = (ub_tmp + st < ub ? 
ub : ub_tmp); 456 } 457 } 458 if (pr->flags.ordered) { 459 pr->u.p.ordered_lower = init; 460 pr->u.p.ordered_upper = limit; 461 } 462 break; 463 } // case 464 #if OMP_45_ENABLED 465 case kmp_sch_static_balanced_chunked: { 466 // similar to balanced, but chunk adjusted to multiple of simd width 467 T nth = nproc; 468 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)" 469 " -> falling-through to static_greedy\n", 470 gtid)); 471 schedule = kmp_sch_static_greedy; 472 if (nth > 1) 473 pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1); 474 else 475 pr->u.p.parm1 = tc; 476 break; 477 } // case 478 case kmp_sch_guided_simd: 479 #endif // OMP_45_ENABLED 480 case kmp_sch_guided_iterative_chunked: { 481 KD_TRACE( 482 100, 483 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked" 484 " case\n", 485 gtid)); 486 487 if (nproc > 1) { 488 if ((2L * chunk + 1) * nproc >= tc) { 489 /* chunk size too large, switch to dynamic */ 490 schedule = kmp_sch_dynamic_chunked; 491 } else { 492 // when remaining iters become less than parm2 - switch to dynamic 493 pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1); 494 *(double *)&pr->u.p.parm3 = 495 guided_flt_param / nproc; // may occupy parm3 and parm4 496 } 497 } else { 498 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 499 "kmp_sch_static_greedy\n", 500 gtid)); 501 schedule = kmp_sch_static_greedy; 502 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 503 KD_TRACE( 504 100, 505 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n", 506 gtid)); 507 pr->u.p.parm1 = tc; 508 } // if 509 } // case 510 break; 511 case kmp_sch_guided_analytical_chunked: { 512 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d " 513 "kmp_sch_guided_analytical_chunked case\n", 514 gtid)); 515 516 if (nproc > 1) { 517 if ((2L * chunk + 1) * nproc >= tc) { 518 /* chunk size too large, switch to dynamic */ 519 schedule = kmp_sch_dynamic_chunked; 520 } else { 521 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 522 DBL x; 523 524 #if KMP_USE_X87CONTROL 525 /* Linux* OS already has 64-bit computation by default for long double, 526 and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On 527 Windows* OS on IA-32 architecture, we need to set precision to 64-bit 528 instead of the default 53-bit. Even though long double doesn't work 529 on Windows* OS on Intel(R) 64, the resulting lack of precision is not 530 expected to impact the correctness of the algorithm, but this has not 531 been mathematically proven. 
*/ 532 // save original FPCW and set precision to 64-bit, as 533 // Windows* OS on IA-32 architecture defaults to 53-bit 534 unsigned int oldFpcw = _control87(0, 0); 535 _control87(_PC_64, _MCW_PC); // 0,0x30000 536 #endif 537 /* value used for comparison in solver for cross-over point */ 538 long double target = ((long double)chunk * 2 + 1) * nproc / tc; 539 540 /* crossover point--chunk indexes equal to or greater than 541 this point switch to dynamic-style scheduling */ 542 UT cross; 543 544 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 545 x = (long double)1.0 - (long double)0.5 / nproc; 546 547 #ifdef KMP_DEBUG 548 { // test natural alignment 549 struct _test_a { 550 char a; 551 union { 552 char b; 553 DBL d; 554 }; 555 } t; 556 ptrdiff_t natural_alignment = 557 (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; 558 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long 559 // long)natural_alignment ); 560 KMP_DEBUG_ASSERT( 561 (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0); 562 } 563 #endif // KMP_DEBUG 564 565 /* save the term in thread private dispatch structure */ 566 *(DBL *)&pr->u.p.parm3 = x; 567 568 /* solve for the crossover point to the nearest integer i for which C_i 569 <= chunk */ 570 { 571 UT left, right, mid; 572 long double p; 573 574 /* estimate initial upper and lower bound */ 575 576 /* doesn't matter what value right is as long as it is positive, but 577 it affects performance of the solver */ 578 right = 229; 579 p = __kmp_pow<UT>(x, right); 580 if (p > target) { 581 do { 582 p *= p; 583 right <<= 1; 584 } while (p > target && right < (1 << 27)); 585 /* lower bound is previous (failed) estimate of upper bound */ 586 left = right >> 1; 587 } else { 588 left = 0; 589 } 590 591 /* bisection root-finding method */ 592 while (left + 1 < right) { 593 mid = (left + right) / 2; 594 if (__kmp_pow<UT>(x, mid) > target) { 595 left = mid; 596 } else { 597 right = mid; 598 } 599 } // while 600 cross = right; 601 } 602 /* assert sanity of computed crossover point */ 603 KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target && 604 __kmp_pow<UT>(x, cross) <= target); 605 606 /* save the crossover point in thread private dispatch structure */ 607 pr->u.p.parm2 = cross; 608 609 // C75803 610 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8)) 611 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3) 612 #else 613 #define GUIDED_ANALYTICAL_WORKAROUND (x) 614 #endif 615 /* dynamic-style scheduling offset */ 616 pr->u.p.count = tc - __kmp_dispatch_guided_remaining( 617 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - 618 cross * chunk; 619 #if KMP_USE_X87CONTROL 620 // restore FPCW 621 _control87(oldFpcw, _MCW_PC); 622 #endif 623 } // if 624 } else { 625 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 626 "kmp_sch_static_greedy\n", 627 gtid)); 628 schedule = kmp_sch_static_greedy; 629 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 630 pr->u.p.parm1 = tc; 631 } // if 632 } // case 633 break; 634 case kmp_sch_static_greedy: 635 KD_TRACE( 636 100, 637 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n", 638 gtid)); 639 pr->u.p.parm1 = (nproc > 1) ? 
(tc + nproc - 1) / nproc : tc; 640 break; 641 case kmp_sch_static_chunked: 642 case kmp_sch_dynamic_chunked: 643 if (pr->u.p.parm1 <= 0) { 644 pr->u.p.parm1 = KMP_DEFAULT_CHUNK; 645 } 646 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d " 647 "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", 648 gtid)); 649 break; 650 case kmp_sch_trapezoidal: { 651 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ 652 653 T parm1, parm2, parm3, parm4; 654 KD_TRACE(100, 655 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n", 656 gtid)); 657 658 parm1 = chunk; 659 660 /* F : size of the first cycle */ 661 parm2 = (tc / (2 * nproc)); 662 663 if (parm2 < 1) { 664 parm2 = 1; 665 } 666 667 /* L : size of the last cycle. Make sure the last cycle is not larger 668 than the first cycle. */ 669 if (parm1 < 1) { 670 parm1 = 1; 671 } else if (parm1 > parm2) { 672 parm1 = parm2; 673 } 674 675 /* N : number of cycles */ 676 parm3 = (parm2 + parm1); 677 parm3 = (2 * tc + parm3 - 1) / parm3; 678 679 if (parm3 < 2) { 680 parm3 = 2; 681 } 682 683 /* sigma : decreasing incr of the trapezoid */ 684 parm4 = (parm3 - 1); 685 parm4 = (parm2 - parm1) / parm4; 686 687 // pointless check, because parm4 >= 0 always 688 // if ( parm4 < 0 ) { 689 // parm4 = 0; 690 //} 691 692 pr->u.p.parm1 = parm1; 693 pr->u.p.parm2 = parm2; 694 pr->u.p.parm3 = parm3; 695 pr->u.p.parm4 = parm4; 696 } // case 697 break; 698 699 default: { 700 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 701 KMP_HNT(GetNewerLibrary), // Hint 702 __kmp_msg_null // Variadic argument list terminator 703 ); 704 } break; 705 } // switch 706 pr->schedule = schedule; 707 } 708 709 #if KMP_USE_HIER_SCHED 710 template <typename T> 711 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub, 712 typename traits_t<T>::signed_t st); 713 template <> 714 inline void 715 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb, 716 kmp_int32 ub, kmp_int32 st) { 717 __kmp_dispatch_init_hierarchy<kmp_int32>( 718 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 719 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st); 720 } 721 template <> 722 inline void 723 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb, 724 kmp_uint32 ub, kmp_int32 st) { 725 __kmp_dispatch_init_hierarchy<kmp_uint32>( 726 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 727 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st); 728 } 729 template <> 730 inline void 731 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb, 732 kmp_int64 ub, kmp_int64 st) { 733 __kmp_dispatch_init_hierarchy<kmp_int64>( 734 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 735 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st); 736 } 737 template <> 738 inline void 739 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb, 740 kmp_uint64 ub, kmp_int64 st) { 741 __kmp_dispatch_init_hierarchy<kmp_uint64>( 742 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 743 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st); 744 } 745 746 // free all the hierarchy scheduling memory associated with the team 747 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) { 748 int num_disp_buff = team->t.t_max_nproc > 1 ? 
__kmp_dispatch_num_buffers : 2; 749 for (int i = 0; i < num_disp_buff; ++i) { 750 // type does not matter here so use kmp_int32 751 auto sh = 752 reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>( 753 &team->t.t_disp_buffer[i]); 754 if (sh->hier) { 755 sh->hier->deallocate(); 756 __kmp_free(sh->hier); 757 } 758 } 759 } 760 #endif 761 762 // UT - unsigned flavor of T, ST - signed flavor of T, 763 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8 764 template <typename T> 765 static void 766 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb, 767 T ub, typename traits_t<T>::signed_t st, 768 typename traits_t<T>::signed_t chunk, int push_ws) { 769 typedef typename traits_t<T>::unsigned_t UT; 770 771 int active; 772 kmp_info_t *th; 773 kmp_team_t *team; 774 kmp_uint32 my_buffer_index; 775 dispatch_private_info_template<T> *pr; 776 dispatch_shared_info_template<T> volatile *sh; 777 778 KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) == 779 sizeof(dispatch_private_info)); 780 KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) == 781 sizeof(dispatch_shared_info)); 782 783 if (!TCR_4(__kmp_init_parallel)) 784 __kmp_parallel_initialize(); 785 786 #if OMP_50_ENABLED 787 __kmp_resume_if_soft_paused(); 788 #endif 789 790 #if INCLUDE_SSC_MARKS 791 SSC_MARK_DISPATCH_INIT(); 792 #endif 793 #ifdef KMP_DEBUG 794 typedef typename traits_t<T>::signed_t ST; 795 { 796 char *buff; 797 // create format specifiers before the debug output 798 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d " 799 "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n", 800 traits_t<ST>::spec, traits_t<T>::spec, 801 traits_t<T>::spec, traits_t<ST>::spec); 802 KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st)); 803 __kmp_str_free(&buff); 804 } 805 #endif 806 /* setup data */ 807 th = __kmp_threads[gtid]; 808 team = th->th.th_team; 809 active = !team->t.t_serialized; 810 th->th.th_ident = loc; 811 812 // Any half-decent optimizer will remove this test when the blocks are empty 813 // since the macros expand to nothing 814 // when statistics are disabled. 815 if (schedule == __kmp_static) { 816 KMP_COUNT_BLOCK(OMP_LOOP_STATIC); 817 } else { 818 KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC); 819 } 820 821 #if KMP_USE_HIER_SCHED 822 // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable 823 // Hierarchical scheduling does not work with ordered, so if ordered is 824 // detected, then revert back to threaded scheduling. 825 bool ordered; 826 enum sched_type my_sched = schedule; 827 my_buffer_index = th->th.th_dispatch->th_disp_index; 828 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 829 &th->th.th_dispatch 830 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 831 my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched); 832 if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper)) 833 my_sched = 834 (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower)); 835 ordered = (kmp_ord_lower & my_sched); 836 if (pr->flags.use_hier) { 837 if (ordered) { 838 KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. 
" 839 "Disabling hierarchical scheduling.\n", 840 gtid)); 841 pr->flags.use_hier = FALSE; 842 } 843 } 844 if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) { 845 // Don't use hierarchical for ordered parallel loops and don't 846 // use the runtime hierarchy if one was specified in the program 847 if (!ordered && !pr->flags.use_hier) 848 __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st); 849 } 850 #endif // KMP_USE_HIER_SCHED 851 852 #if USE_ITT_BUILD 853 kmp_uint64 cur_chunk = chunk; 854 int itt_need_metadata_reporting = __itt_metadata_add_ptr && 855 __kmp_forkjoin_frames_mode == 3 && 856 KMP_MASTER_GTID(gtid) && 857 #if OMP_40_ENABLED 858 th->th.th_teams_microtask == NULL && 859 #endif 860 team->t.t_active_level == 1; 861 #endif 862 if (!active) { 863 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 864 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 865 } else { 866 KMP_DEBUG_ASSERT(th->th.th_dispatch == 867 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 868 869 my_buffer_index = th->th.th_dispatch->th_disp_index++; 870 871 /* What happens when number of threads changes, need to resize buffer? */ 872 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 873 &th->th.th_dispatch 874 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 875 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 876 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 877 KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid, 878 my_buffer_index)); 879 } 880 881 __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st, 882 #if USE_ITT_BUILD 883 &cur_chunk, 884 #endif 885 chunk, (T)th->th.th_team_nproc, 886 (T)th->th.th_info.ds.ds_tid); 887 if (active) { 888 if (pr->flags.ordered == 0) { 889 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error; 890 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error; 891 } else { 892 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>; 893 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>; 894 } 895 } 896 897 if (active) { 898 /* The name of this buffer should be my_buffer_index when it's free to use 899 * it */ 900 901 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d " 902 "sh->buffer_index:%d\n", 903 gtid, my_buffer_index, sh->buffer_index)); 904 __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index, 905 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL)); 906 // Note: KMP_WAIT() cannot be used there: buffer index and 907 // my_buffer_index are *always* 32-bit integers. 908 KMP_MB(); /* is this necessary? 
*/ 909 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d " 910 "sh->buffer_index:%d\n", 911 gtid, my_buffer_index, sh->buffer_index)); 912 913 th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr; 914 th->th.th_dispatch->th_dispatch_sh_current = 915 CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh); 916 #if USE_ITT_BUILD 917 if (pr->flags.ordered) { 918 __kmp_itt_ordered_init(gtid); 919 } 920 // Report loop metadata 921 if (itt_need_metadata_reporting) { 922 // Only report metadata by master of active team at level 1 923 kmp_uint64 schedtype = 0; 924 switch (schedule) { 925 case kmp_sch_static_chunked: 926 case kmp_sch_static_balanced: // Chunk is calculated in the switch above 927 break; 928 case kmp_sch_static_greedy: 929 cur_chunk = pr->u.p.parm1; 930 break; 931 case kmp_sch_dynamic_chunked: 932 schedtype = 1; 933 break; 934 case kmp_sch_guided_iterative_chunked: 935 case kmp_sch_guided_analytical_chunked: 936 #if OMP_45_ENABLED 937 case kmp_sch_guided_simd: 938 #endif 939 schedtype = 2; 940 break; 941 default: 942 // Should we put this case under "static"? 943 // case kmp_sch_static_steal: 944 schedtype = 3; 945 break; 946 } 947 __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk); 948 } 949 #if KMP_USE_HIER_SCHED 950 if (pr->flags.use_hier) { 951 pr->u.p.count = 0; 952 pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0; 953 } 954 #endif // KMP_USER_HIER_SCHED 955 #endif /* USE_ITT_BUILD */ 956 } 957 958 #ifdef KMP_DEBUG 959 { 960 char *buff; 961 // create format specifiers before the debug output 962 buff = __kmp_str_format( 963 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s " 964 "lb:%%%s ub:%%%s" 965 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" 966 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n", 967 traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec, 968 traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec, 969 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec, 970 traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec); 971 KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb, 972 pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count, 973 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1, 974 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4)); 975 __kmp_str_free(&buff); 976 } 977 #endif 978 #if (KMP_STATIC_STEAL_ENABLED) 979 // It cannot be guaranteed that after execution of a loop with some other 980 // schedule kind all the parm3 variables will contain the same value. Even if 981 // all parm3 will be the same, it still exists a bad case like using 0 and 1 982 // rather than program life-time increment. So the dedicated variable is 983 // required. The 'static_steal_counter' is used. 984 if (schedule == kmp_sch_static_steal) { 985 // Other threads will inspect this variable when searching for a victim. 986 // This is a flag showing that other threads may steal from this thread 987 // since then. 
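    // Informal illustration (not from the original source): if this thread
    // has already moved on to a later static_steal loop while a peer is still
    // scanning for victims in an earlier one, the peer sees a mismatched
    // static_steal_counter and skips this thread's buffer instead of stealing
    // iterations that belong to a different loop instance.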
988 volatile T *p = &pr->u.p.static_steal_counter; 989 *p = *p + 1; 990 } 991 #endif // ( KMP_STATIC_STEAL_ENABLED ) 992 993 #if OMPT_SUPPORT && OMPT_OPTIONAL 994 if (ompt_enabled.ompt_callback_work) { 995 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 996 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 997 ompt_callbacks.ompt_callback(ompt_callback_work)( 998 ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data), 999 &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid)); 1000 } 1001 #endif 1002 KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic); 1003 } 1004 1005 /* For ordered loops, either __kmp_dispatch_finish() should be called after 1006 * every iteration, or __kmp_dispatch_finish_chunk() should be called after 1007 * every chunk of iterations. If the ordered section(s) were not executed 1008 * for this iteration (or every iteration in this chunk), we need to set the 1009 * ordered iteration counters so that the next thread can proceed. */ 1010 template <typename UT> 1011 static void __kmp_dispatch_finish(int gtid, ident_t *loc) { 1012 typedef typename traits_t<UT>::signed_t ST; 1013 kmp_info_t *th = __kmp_threads[gtid]; 1014 1015 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid)); 1016 if (!th->th.th_team->t.t_serialized) { 1017 1018 dispatch_private_info_template<UT> *pr = 1019 reinterpret_cast<dispatch_private_info_template<UT> *>( 1020 th->th.th_dispatch->th_dispatch_pr_current); 1021 dispatch_shared_info_template<UT> volatile *sh = 1022 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 1023 th->th.th_dispatch->th_dispatch_sh_current); 1024 KMP_DEBUG_ASSERT(pr); 1025 KMP_DEBUG_ASSERT(sh); 1026 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1027 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1028 1029 if (pr->ordered_bumped) { 1030 KD_TRACE( 1031 1000, 1032 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1033 gtid)); 1034 pr->ordered_bumped = 0; 1035 } else { 1036 UT lower = pr->u.p.ordered_lower; 1037 1038 #ifdef KMP_DEBUG 1039 { 1040 char *buff; 1041 // create format specifiers before the debug output 1042 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: " 1043 "ordered_iteration:%%%s lower:%%%s\n", 1044 traits_t<UT>::spec, traits_t<UT>::spec); 1045 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 1046 __kmp_str_free(&buff); 1047 } 1048 #endif 1049 1050 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower, 1051 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 1052 KMP_MB(); /* is this necessary? 
*/ 1053 #ifdef KMP_DEBUG 1054 { 1055 char *buff; 1056 // create format specifiers before the debug output 1057 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: " 1058 "ordered_iteration:%%%s lower:%%%s\n", 1059 traits_t<UT>::spec, traits_t<UT>::spec); 1060 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 1061 __kmp_str_free(&buff); 1062 } 1063 #endif 1064 1065 test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration); 1066 } // if 1067 } // if 1068 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid)); 1069 } 1070 1071 #ifdef KMP_GOMP_COMPAT 1072 1073 template <typename UT> 1074 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) { 1075 typedef typename traits_t<UT>::signed_t ST; 1076 kmp_info_t *th = __kmp_threads[gtid]; 1077 1078 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid)); 1079 if (!th->th.th_team->t.t_serialized) { 1080 // int cid; 1081 dispatch_private_info_template<UT> *pr = 1082 reinterpret_cast<dispatch_private_info_template<UT> *>( 1083 th->th.th_dispatch->th_dispatch_pr_current); 1084 dispatch_shared_info_template<UT> volatile *sh = 1085 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 1086 th->th.th_dispatch->th_dispatch_sh_current); 1087 KMP_DEBUG_ASSERT(pr); 1088 KMP_DEBUG_ASSERT(sh); 1089 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1090 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1091 1092 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) { 1093 UT lower = pr->u.p.ordered_lower; 1094 UT upper = pr->u.p.ordered_upper; 1095 UT inc = upper - lower + 1; 1096 1097 if (pr->ordered_bumped == inc) { 1098 KD_TRACE( 1099 1000, 1100 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1101 gtid)); 1102 pr->ordered_bumped = 0; 1103 } else { 1104 inc -= pr->ordered_bumped; 1105 1106 #ifdef KMP_DEBUG 1107 { 1108 char *buff; 1109 // create format specifiers before the debug output 1110 buff = __kmp_str_format( 1111 "__kmp_dispatch_finish_chunk: T#%%d before wait: " 1112 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n", 1113 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec); 1114 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper)); 1115 __kmp_str_free(&buff); 1116 } 1117 #endif 1118 1119 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower, 1120 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 1121 1122 KMP_MB(); /* is this necessary? */ 1123 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting " 1124 "ordered_bumped to zero\n", 1125 gtid)); 1126 pr->ordered_bumped = 0; 1127 //!!!!! TODO check if the inc should be unsigned, or signed??? 
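      // Worked example (illustrative numbers only, not from the original
      // source): if this chunk covers ordered iterations lower = 10 ..
      // upper = 13, then inc starts at 4; if the thread has already passed
      // through one ordered region for this chunk (ordered_bumped == 1), only
      // the remaining 3 counts are added to sh->u.s.ordered_iteration below,
      // so threads waiting on later ordered iterations can proceed.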
1128 #ifdef KMP_DEBUG 1129 { 1130 char *buff; 1131 // create format specifiers before the debug output 1132 buff = __kmp_str_format( 1133 "__kmp_dispatch_finish_chunk: T#%%d after wait: " 1134 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n", 1135 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec, 1136 traits_t<UT>::spec); 1137 KD_TRACE(1000, 1138 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper)); 1139 __kmp_str_free(&buff); 1140 } 1141 #endif 1142 1143 test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc); 1144 } 1145 // } 1146 } 1147 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid)); 1148 } 1149 1150 #endif /* KMP_GOMP_COMPAT */ 1151 1152 template <typename T> 1153 int __kmp_dispatch_next_algorithm(int gtid, 1154 dispatch_private_info_template<T> *pr, 1155 dispatch_shared_info_template<T> volatile *sh, 1156 kmp_int32 *p_last, T *p_lb, T *p_ub, 1157 typename traits_t<T>::signed_t *p_st, T nproc, 1158 T tid) { 1159 typedef typename traits_t<T>::unsigned_t UT; 1160 typedef typename traits_t<T>::signed_t ST; 1161 typedef typename traits_t<T>::floating_t DBL; 1162 int status = 0; 1163 kmp_int32 last = 0; 1164 T start; 1165 ST incr; 1166 UT limit, trip, init; 1167 kmp_info_t *th = __kmp_threads[gtid]; 1168 kmp_team_t *team = th->th.th_team; 1169 1170 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1171 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1172 KMP_DEBUG_ASSERT(pr); 1173 KMP_DEBUG_ASSERT(sh); 1174 KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc); 1175 #ifdef KMP_DEBUG 1176 { 1177 char *buff; 1178 // create format specifiers before the debug output 1179 buff = 1180 __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p " 1181 "sh:%%p nproc:%%%s tid:%%%s\n", 1182 traits_t<T>::spec, traits_t<T>::spec); 1183 KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid)); 1184 __kmp_str_free(&buff); 1185 } 1186 #endif 1187 1188 // zero trip count 1189 if (pr->u.p.tc == 0) { 1190 KD_TRACE(10, 1191 ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is " 1192 "zero status:%d\n", 1193 gtid, status)); 1194 return 0; 1195 } 1196 1197 switch (pr->schedule) { 1198 #if (KMP_STATIC_STEAL_ENABLED) 1199 case kmp_sch_static_steal: { 1200 T chunk = pr->u.p.parm1; 1201 1202 KD_TRACE(100, 1203 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n", 1204 gtid)); 1205 1206 trip = pr->u.p.tc - 1; 1207 1208 if (traits_t<T>::type_size > 4) { 1209 // use lock for 8-byte and CAS for 4-byte induction 1210 // variable. TODO (optional): check and use 16-byte CAS 1211 kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock; 1212 KMP_DEBUG_ASSERT(lck != NULL); 1213 if (pr->u.p.count < (UT)pr->u.p.ub) { 1214 __kmp_acquire_lock(lck, gtid); 1215 // try to get own chunk of iterations 1216 init = (pr->u.p.count)++; 1217 status = (init < (UT)pr->u.p.ub); 1218 __kmp_release_lock(lck, gtid); 1219 } else { 1220 status = 0; // no own chunks 1221 } 1222 if (!status) { // try to steal 1223 kmp_info_t **other_threads = team->t.t_threads; 1224 int while_limit = pr->u.p.parm3; 1225 int while_index = 0; 1226 // TODO: algorithm of searching for a victim 1227 // should be cleaned up and measured 1228 while ((!status) && (while_limit != ++while_index)) { 1229 T remaining; 1230 T victimIdx = pr->u.p.parm4; 1231 T oldVictimIdx = victimIdx ? 
              victimIdx - 1 : nproc - 1;
          dispatch_private_info_template<T> *victim =
              reinterpret_cast<dispatch_private_info_template<T> *>(
                  other_threads[victimIdx]
                      ->th.th_dispatch->th_dispatch_pr_current);
          while ((victim == NULL || victim == pr ||
                  (*(volatile T *)&victim->u.p.static_steal_counter !=
                   *(volatile T *)&pr->u.p.static_steal_counter)) &&
                 oldVictimIdx != victimIdx) {
            victimIdx = (victimIdx + 1) % nproc;
            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
                other_threads[victimIdx]
                    ->th.th_dispatch->th_dispatch_pr_current);
          }
          if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
                          *(volatile T *)&pr->u.p.static_steal_counter)) {
            continue; // try once more (nproc attempts in total)
            // no victim is ready yet to participate in stealing
            // because all victims are still in kmp_init_dispatch
          }
          if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
            pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
            continue; // not enough chunks to steal, goto next victim
          }

          lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
          KMP_ASSERT(lck != NULL);
          __kmp_acquire_lock(lck, gtid);
          limit = victim->u.p.ub; // keep initial ub
          if (victim->u.p.count >= limit ||
              (remaining = limit - victim->u.p.count) < 2) {
            __kmp_release_lock(lck, gtid);
            pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
            continue; // not enough chunks to steal
          }
          // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or
          // by 1
          if (remaining > 3) {
            // steal 1/4 of remaining
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
            init = (victim->u.p.ub -= (remaining >> 2));
          } else {
            // steal 1 chunk of 2 or 3 remaining
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
            init = (victim->u.p.ub -= 1);
          }
          __kmp_release_lock(lck, gtid);

          KMP_DEBUG_ASSERT(init + 1 <= limit);
          pr->u.p.parm4 = victimIdx; // remember victim to steal from
          status = 1;
          while_index = 0;
          // now update own count and ub with stolen range but init chunk
          __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
          pr->u.p.count = init + 1;
          pr->u.p.ub = limit;
          __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
        } // while (search for victim)
      } // if (try to find victim and steal)
    } else {
      // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
      typedef union {
        struct {
          UT count;
          T ub;
        } p;
        kmp_int64 b;
      } union_i4;
      // All operations on 'count' or 'ub' must be combined atomically
      // together.
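#if 0
      // Illustration only (never compiled): the same claim-one-chunk idea as
      // the KMP_COMPARE_AND_STORE_ACQ64 loop that follows, expressed with
      // std::atomic on a packed 64-bit (count, ub) word. The names
      // PackedCountUb and example_claim_chunk are hypothetical and exist only
      // to document the pattern; the runtime itself uses union_i4 above.
      #include <atomic>
      #include <cstdint>
      union PackedCountUb {
        struct {
          uint32_t count; // next chunk index this thread would take
          uint32_t ub;    // first chunk index that is no longer ours
        } p;
        uint64_t b; // both fields viewed as one CAS-able word
      };
      // Atomically reserves the next chunk index; returns true if the
      // reserved index was still below ub at the moment of the increment.
      static bool example_claim_chunk(std::atomic<uint64_t> &word,
                                      uint32_t &claimed) {
        PackedCountUb vold, vnew;
        vold.b = word.load(std::memory_order_relaxed);
        do {
          vnew = vold;
          vnew.p.count++;
        } while (!word.compare_exchange_weak(vold.b, vnew.b));
        claimed = vold.p.count;
        return vold.p.count < vold.p.ub;
      }
#endif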
1301 { 1302 union_i4 vold, vnew; 1303 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 1304 vnew = vold; 1305 vnew.p.count++; 1306 while (!KMP_COMPARE_AND_STORE_ACQ64( 1307 (volatile kmp_int64 *)&pr->u.p.count, 1308 *VOLATILE_CAST(kmp_int64 *) & vold.b, 1309 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 1310 KMP_CPU_PAUSE(); 1311 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 1312 vnew = vold; 1313 vnew.p.count++; 1314 } 1315 vnew = vold; 1316 init = vnew.p.count; 1317 status = (init < (UT)vnew.p.ub); 1318 } 1319 1320 if (!status) { 1321 kmp_info_t **other_threads = team->t.t_threads; 1322 int while_limit = pr->u.p.parm3; 1323 int while_index = 0; 1324 1325 // TODO: algorithm of searching for a victim 1326 // should be cleaned up and measured 1327 while ((!status) && (while_limit != ++while_index)) { 1328 union_i4 vold, vnew; 1329 kmp_int32 remaining; 1330 T victimIdx = pr->u.p.parm4; 1331 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1; 1332 dispatch_private_info_template<T> *victim = 1333 reinterpret_cast<dispatch_private_info_template<T> *>( 1334 other_threads[victimIdx] 1335 ->th.th_dispatch->th_dispatch_pr_current); 1336 while ((victim == NULL || victim == pr || 1337 (*(volatile T *)&victim->u.p.static_steal_counter != 1338 *(volatile T *)&pr->u.p.static_steal_counter)) && 1339 oldVictimIdx != victimIdx) { 1340 victimIdx = (victimIdx + 1) % nproc; 1341 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1342 other_threads[victimIdx] 1343 ->th.th_dispatch->th_dispatch_pr_current); 1344 } 1345 if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter != 1346 *(volatile T *)&pr->u.p.static_steal_counter)) { 1347 continue; // try once more (nproc attempts in total) 1348 // no victim is ready yet to participate in stealing 1349 // because all victims are still in kmp_init_dispatch 1350 } 1351 pr->u.p.parm4 = victimIdx; // new victim found 1352 while (1) { // CAS loop if victim has enough chunks to steal 1353 vold.b = *(volatile kmp_int64 *)(&victim->u.p.count); 1354 vnew = vold; 1355 1356 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1357 if (vnew.p.count >= (UT)vnew.p.ub || 1358 (remaining = vnew.p.ub - vnew.p.count) < 2) { 1359 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id 1360 break; // not enough chunks to steal, goto next victim 1361 } 1362 if (remaining > 3) { 1363 vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining 1364 } else { 1365 vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining 1366 } 1367 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1368 // TODO: Should this be acquire or release? 
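            // Worked example (illustrative numbers only, not from the
            // original source): if the victim currently holds count = 2 and
            // ub = 10, then remaining = 8 and the thief lowers the victim's
            // ub to 8 (stealing remaining >> 2 = 2 chunks). On a successful
            // CAS the thief immediately executes chunk index 8 and installs
            // (count = 9, ub = 10) for itself, so chunk 9 stays on its own
            // queue; the victim keeps chunk indices 2 .. 7.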
            if (KMP_COMPARE_AND_STORE_ACQ64(
                    (volatile kmp_int64 *)&victim->u.p.count,
                    *VOLATILE_CAST(kmp_int64 *) & vold.b,
                    *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
              // stealing succeeded
              KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
                                        vold.p.ub - vnew.p.ub);
              status = 1;
              while_index = 0;
              // now update own count and ub
              init = vnew.p.ub;
              vold.p.count = init + 1;
#if KMP_ARCH_X86
              KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
#else
              *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
#endif
              break;
            } // if (check CAS result)
            KMP_CPU_PAUSE(); // CAS failed, repeat attempt
          } // while (try to steal from particular victim)
        } // while (search for victim)
      } // if (try to find victim and steal)
    } // if (4-byte induction variable)
    if (!status) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.parm2;
      init *= chunk;
      limit = chunk + init - 1;
      incr = pr->u.p.st;
      KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);

      KMP_DEBUG_ASSERT(init <= trip);
      if ((last = (limit >= trip)) != 0)
        limit = trip;
      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
    break;
  } // case
#endif // ( KMP_STATIC_STEAL_ENABLED )
  case kmp_sch_static_balanced: {
    KD_TRACE(
        10,
        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));
    /* check if thread has any iteration to do */
    if ((status = !pr->u.p.count) != 0) {
      pr->u.p.count = 1;
      *p_lb = pr->u.p.lb;
      *p_ub = pr->u.p.ub;
      last = pr->u.p.parm1;
      if (p_st != NULL)
        *p_st = pr->u.p.st;
    } else { /* no iterations to do */
      pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
    }
  } // case
  break;
  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
                                 merged here */
  case kmp_sch_static_chunked: {
    T parm1;

    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_static_[affinity|chunked] case\n",
                   gtid));
    parm1 = pr->u.p.parm1;

    trip = pr->u.p.tc - 1;
    init = parm1 * (pr->u.p.count + tid);

    if ((status = (init <= trip)) != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      limit = parm1 + init - 1;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      pr->u.p.count += nproc;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
  } // case
  break;

  case kmp_sch_dynamic_chunked: {
    T chunk = pr->u.p.parm1;

    KD_TRACE(
        100,
        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
         gtid));

    init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
    trip = pr->u.p.tc - 1;

    if ((status = (init <= trip)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.lb;
      limit = chunk + init - 1;
      incr = pr->u.p.st;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
  } // case
  break;

  case kmp_sch_guided_iterative_chunked: {
    T chunkspec = pr->u.p.parm1;
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
                   "iterative case\n",
                   gtid));
    trip = pr->u.p.tc;
    // Start atomic part of calculations
    while (1) {
      ST remaining; // signed, because can be < 0
      init = sh->u.s.iteration; // shared value
      remaining = trip - init;
      if (remaining <= 0) { // AC: need to compare with 0 first
        // nothing to do, don't try atomic op
        status = 0;
        break;
      }
      if ((T)remaining <
          pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
        // use dynamic-style schedule
        // atomically increment iterations, get old value
        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                 (ST)chunkspec);
        remaining = trip - init;
        if (remaining <= 0) {
          status = 0; // all iterations got by other threads
        } else {
          // got some iterations to work on
          status = 1;
          if ((T)remaining > chunkspec) {
            limit = init + chunkspec - 1;
          } else {
            last = 1; // the last chunk
            limit = init + remaining - 1;
          } // if
        } // if
        break;
      } // if
      limit = init +
              (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                               (ST)init, (ST)limit)) {
        // CAS was successful, chunk obtained
        status = 1;
        --limit;
        break;
      } // if
    } // while
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } // if
  } // case
  break;

#if OMP_45_ENABLED
  case kmp_sch_guided_simd: {
    // same as iterative but curr-chunk adjusted to be multiple of given
    // chunk
    T chunk = pr->u.p.parm1;
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
              gtid));
    trip = pr->u.p.tc;
    // Start atomic part of calculations
    while (1) {
      ST remaining; // signed, because can be < 0
      init = sh->u.s.iteration; // shared value
      remaining = trip - init;
      if (remaining <= 0) { // AC: need to compare with 0 first
        status = 0; // nothing to do, don't try atomic op
        break;
      }
      KMP_DEBUG_ASSERT(init % chunk == 0);
      // compare with K*nproc*(chunk+1), K=2 by default
      if ((T)remaining < pr->u.p.parm2) {
        // use dynamic-style schedule
        // atomically increment iterations, get old value
        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                 (ST)chunk);
        remaining = trip - init;
        if (remaining <= 0) {
          status = 0; // all iterations got by other threads
} else { 1626 // got some iterations to work on 1627 status = 1; 1628 if ((T)remaining > chunk) { 1629 limit = init + chunk - 1; 1630 } else { 1631 last = 1; // the last chunk 1632 limit = init + remaining - 1; 1633 } // if 1634 } // if 1635 break; 1636 } // if 1637 // divide by K*nproc 1638 UT span = remaining * (*(double *)&pr->u.p.parm3); 1639 UT rem = span % chunk; 1640 if (rem) // adjust so that span%chunk == 0 1641 span += chunk - rem; 1642 limit = init + span; 1643 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1644 (ST)init, (ST)limit)) { 1645 // CAS was successful, chunk obtained 1646 status = 1; 1647 --limit; 1648 break; 1649 } // if 1650 } // while 1651 if (status != 0) { 1652 start = pr->u.p.lb; 1653 incr = pr->u.p.st; 1654 if (p_st != NULL) 1655 *p_st = incr; 1656 *p_lb = start + init * incr; 1657 *p_ub = start + limit * incr; 1658 if (pr->flags.ordered) { 1659 pr->u.p.ordered_lower = init; 1660 pr->u.p.ordered_upper = limit; 1661 } // if 1662 } else { 1663 *p_lb = 0; 1664 *p_ub = 0; 1665 if (p_st != NULL) 1666 *p_st = 0; 1667 } // if 1668 } // case 1669 break; 1670 #endif // OMP_45_ENABLED 1671 1672 case kmp_sch_guided_analytical_chunked: { 1673 T chunkspec = pr->u.p.parm1; 1674 UT chunkIdx; 1675 #if KMP_USE_X87CONTROL 1676 /* for storing original FPCW value for Windows* OS on 1677 IA-32 architecture 8-byte version */ 1678 unsigned int oldFpcw; 1679 unsigned int fpcwSet = 0; 1680 #endif 1681 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d " 1682 "kmp_sch_guided_analytical_chunked case\n", 1683 gtid)); 1684 1685 trip = pr->u.p.tc; 1686 1687 KMP_DEBUG_ASSERT(nproc > 1); 1688 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip); 1689 1690 while (1) { /* this while loop is a safeguard against unexpected zero 1691 chunk sizes */ 1692 chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 1693 if (chunkIdx >= (UT)pr->u.p.parm2) { 1694 --trip; 1695 /* use dynamic-style scheduling */ 1696 init = chunkIdx * chunkspec + pr->u.p.count; 1697 /* need to verify init > 0 in case of overflow in the above 1698 * calculation */ 1699 if ((status = (init > 0 && init <= trip)) != 0) { 1700 limit = init + chunkspec - 1; 1701 1702 if ((last = (limit >= trip)) != 0) 1703 limit = trip; 1704 } 1705 break; 1706 } else { 1707 /* use exponential-style scheduling */ 1708 /* The following check is to workaround the lack of long double precision on 1709 Windows* OS. 1710 This check works around the possible effect that init != 0 for chunkIdx == 0. 
1711 */ 1712 #if KMP_USE_X87CONTROL 1713 /* If we haven't already done so, save original 1714 FPCW and set precision to 64-bit, as Windows* OS 1715 on IA-32 architecture defaults to 53-bit */ 1716 if (!fpcwSet) { 1717 oldFpcw = _control87(0, 0); 1718 _control87(_PC_64, _MCW_PC); 1719 fpcwSet = 0x30000; 1720 } 1721 #endif 1722 if (chunkIdx) { 1723 init = __kmp_dispatch_guided_remaining<T>( 1724 trip, *(DBL *)&pr->u.p.parm3, chunkIdx); 1725 KMP_DEBUG_ASSERT(init); 1726 init = trip - init; 1727 } else 1728 init = 0; 1729 limit = trip - __kmp_dispatch_guided_remaining<T>( 1730 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1); 1731 KMP_ASSERT(init <= limit); 1732 if (init < limit) { 1733 KMP_DEBUG_ASSERT(limit <= trip); 1734 --limit; 1735 status = 1; 1736 break; 1737 } // if 1738 } // if 1739 } // while (1) 1740 #if KMP_USE_X87CONTROL 1741 /* restore FPCW if necessary 1742 AC: check fpcwSet flag first because oldFpcw can be uninitialized here 1743 */ 1744 if (fpcwSet && (oldFpcw & fpcwSet)) 1745 _control87(oldFpcw, _MCW_PC); 1746 #endif 1747 if (status != 0) { 1748 start = pr->u.p.lb; 1749 incr = pr->u.p.st; 1750 if (p_st != NULL) 1751 *p_st = incr; 1752 *p_lb = start + init * incr; 1753 *p_ub = start + limit * incr; 1754 if (pr->flags.ordered) { 1755 pr->u.p.ordered_lower = init; 1756 pr->u.p.ordered_upper = limit; 1757 } 1758 } else { 1759 *p_lb = 0; 1760 *p_ub = 0; 1761 if (p_st != NULL) 1762 *p_st = 0; 1763 } 1764 } // case 1765 break; 1766 1767 case kmp_sch_trapezoidal: { 1768 UT index; 1769 T parm2 = pr->u.p.parm2; 1770 T parm3 = pr->u.p.parm3; 1771 T parm4 = pr->u.p.parm4; 1772 KD_TRACE(100, 1773 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n", 1774 gtid)); 1775 1776 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration); 1777 1778 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2; 1779 trip = pr->u.p.tc - 1; 1780 1781 if ((status = ((T)index < parm3 && init <= trip)) == 0) { 1782 *p_lb = 0; 1783 *p_ub = 0; 1784 if (p_st != NULL) 1785 *p_st = 0; 1786 } else { 1787 start = pr->u.p.lb; 1788 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1; 1789 incr = pr->u.p.st; 1790 1791 if ((last = (limit >= trip)) != 0) 1792 limit = trip; 1793 1794 if (p_st != NULL) 1795 *p_st = incr; 1796 1797 if (incr == 1) { 1798 *p_lb = start + init; 1799 *p_ub = start + limit; 1800 } else { 1801 *p_lb = start + init * incr; 1802 *p_ub = start + limit * incr; 1803 } 1804 1805 if (pr->flags.ordered) { 1806 pr->u.p.ordered_lower = init; 1807 pr->u.p.ordered_upper = limit; 1808 } // if 1809 } // if 1810 } // case 1811 break; 1812 default: { 1813 status = 0; // to avoid complaints on uninitialized variable use 1814 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 1815 KMP_HNT(GetNewerLibrary), // Hint 1816 __kmp_msg_null // Variadic argument list terminator 1817 ); 1818 } break; 1819 } // switch 1820 if (p_last) 1821 *p_last = last; 1822 #ifdef KMP_DEBUG 1823 if (pr->flags.ordered) { 1824 char *buff; 1825 // create format specifiers before the debug output 1826 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d " 1827 "ordered_lower:%%%s ordered_upper:%%%s\n", 1828 traits_t<UT>::spec, traits_t<UT>::spec); 1829 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper)); 1830 __kmp_str_free(&buff); 1831 } 1832 { 1833 char *buff; 1834 // create format specifiers before the debug output 1835 buff = __kmp_str_format( 1836 "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d " 1837 "p_lb:%%%s p_ub:%%%s p_st:%%%s\n", 1838 
traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 1839 KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st)); 1840 __kmp_str_free(&buff); 1841 } 1842 #endif 1843 return status; 1844 } 1845 1846 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more 1847 work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini() 1848 is not called. */ 1849 #if OMPT_SUPPORT && OMPT_OPTIONAL 1850 #define OMPT_LOOP_END \ 1851 if (status == 0) { \ 1852 if (ompt_enabled.ompt_callback_work) { \ 1853 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \ 1854 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \ 1855 ompt_callbacks.ompt_callback(ompt_callback_work)( \ 1856 ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \ 1857 &(task_info->task_data), 0, codeptr); \ 1858 } \ 1859 } 1860 // TODO: implement count 1861 #else 1862 #define OMPT_LOOP_END // no-op 1863 #endif 1864 1865 #if KMP_STATS_ENABLED 1866 #define KMP_STATS_LOOP_END \ 1867 { \ 1868 kmp_int64 u, l, t, i; \ 1869 l = (kmp_int64)(*p_lb); \ 1870 u = (kmp_int64)(*p_ub); \ 1871 i = (kmp_int64)(pr->u.p.st); \ 1872 if (status == 0) { \ 1873 t = 0; \ 1874 KMP_POP_PARTITIONED_TIMER(); \ 1875 } else if (i == 1) { \ 1876 if (u >= l) \ 1877 t = u - l + 1; \ 1878 else \ 1879 t = 0; \ 1880 } else if (i < 0) { \ 1881 if (l >= u) \ 1882 t = (l - u) / (-i) + 1; \ 1883 else \ 1884 t = 0; \ 1885 } else { \ 1886 if (u >= l) \ 1887 t = (u - l) / i + 1; \ 1888 else \ 1889 t = 0; \ 1890 } \ 1891 KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \ 1892 } 1893 #else 1894 #define KMP_STATS_LOOP_END /* Nothing */ 1895 #endif 1896 1897 template <typename T> 1898 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, 1899 T *p_lb, T *p_ub, 1900 typename traits_t<T>::signed_t *p_st 1901 #if OMPT_SUPPORT && OMPT_OPTIONAL 1902 , 1903 void *codeptr 1904 #endif 1905 ) { 1906 1907 typedef typename traits_t<T>::unsigned_t UT; 1908 typedef typename traits_t<T>::signed_t ST; 1909 // This is potentially slightly misleading, schedule(runtime) will appear here 1910 // even if the actual runtime schedule is static. (Which points out a 1911 // disadvantage of schedule(runtime): even when static scheduling is used it 1912 // costs more than a compile-time choice to use static scheduling would.)
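/* Illustrative sketch (not part of the runtime; n, chunk and body() are
   placeholders): a compiler-lowered dynamically scheduled loop typically
   drives this routine through the __kmpc_dispatch_* entry points defined at
   the end of this file, roughly as follows:

     __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, 0, n - 1, 1,
                            chunk);
     kmp_int32 lb, ub, st, last;
     while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
       for (kmp_int32 i = lb; i <= ub; i += st)
         body(i); // user loop body for one chunk
     }

   Each successful call hands back one chunk as an inclusive range [lb, ub];
   a return value of 0 means the iteration space is exhausted. */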
1913 KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling); 1914 1915 int status; 1916 dispatch_private_info_template<T> *pr; 1917 kmp_info_t *th = __kmp_threads[gtid]; 1918 kmp_team_t *team = th->th.th_team; 1919 1920 KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL 1921 KD_TRACE( 1922 1000, 1923 ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n", 1924 gtid, p_lb, p_ub, p_st, p_last)); 1925 1926 if (team->t.t_serialized) { 1927 /* NOTE: serialize this dispatch becase we are not at the active level */ 1928 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 1929 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 1930 KMP_DEBUG_ASSERT(pr); 1931 1932 if ((status = (pr->u.p.tc != 0)) == 0) { 1933 *p_lb = 0; 1934 *p_ub = 0; 1935 // if ( p_last != NULL ) 1936 // *p_last = 0; 1937 if (p_st != NULL) 1938 *p_st = 0; 1939 if (__kmp_env_consistency_check) { 1940 if (pr->pushed_ws != ct_none) { 1941 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 1942 } 1943 } 1944 } else if (pr->flags.nomerge) { 1945 kmp_int32 last; 1946 T start; 1947 UT limit, trip, init; 1948 ST incr; 1949 T chunk = pr->u.p.parm1; 1950 1951 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", 1952 gtid)); 1953 1954 init = chunk * pr->u.p.count++; 1955 trip = pr->u.p.tc - 1; 1956 1957 if ((status = (init <= trip)) == 0) { 1958 *p_lb = 0; 1959 *p_ub = 0; 1960 // if ( p_last != NULL ) 1961 // *p_last = 0; 1962 if (p_st != NULL) 1963 *p_st = 0; 1964 if (__kmp_env_consistency_check) { 1965 if (pr->pushed_ws != ct_none) { 1966 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 1967 } 1968 } 1969 } else { 1970 start = pr->u.p.lb; 1971 limit = chunk + init - 1; 1972 incr = pr->u.p.st; 1973 1974 if ((last = (limit >= trip)) != 0) { 1975 limit = trip; 1976 #if KMP_OS_WINDOWS 1977 pr->u.p.last_upper = pr->u.p.ub; 1978 #endif /* KMP_OS_WINDOWS */ 1979 } 1980 if (p_last != NULL) 1981 *p_last = last; 1982 if (p_st != NULL) 1983 *p_st = incr; 1984 if (incr == 1) { 1985 *p_lb = start + init; 1986 *p_ub = start + limit; 1987 } else { 1988 *p_lb = start + init * incr; 1989 *p_ub = start + limit * incr; 1990 } 1991 1992 if (pr->flags.ordered) { 1993 pr->u.p.ordered_lower = init; 1994 pr->u.p.ordered_upper = limit; 1995 #ifdef KMP_DEBUG 1996 { 1997 char *buff; 1998 // create format specifiers before the debug output 1999 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 2000 "ordered_lower:%%%s ordered_upper:%%%s\n", 2001 traits_t<UT>::spec, traits_t<UT>::spec); 2002 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 2003 pr->u.p.ordered_upper)); 2004 __kmp_str_free(&buff); 2005 } 2006 #endif 2007 } // if 2008 } // if 2009 } else { 2010 pr->u.p.tc = 0; 2011 *p_lb = pr->u.p.lb; 2012 *p_ub = pr->u.p.ub; 2013 #if KMP_OS_WINDOWS 2014 pr->u.p.last_upper = *p_ub; 2015 #endif /* KMP_OS_WINDOWS */ 2016 if (p_last != NULL) 2017 *p_last = TRUE; 2018 if (p_st != NULL) 2019 *p_st = pr->u.p.st; 2020 } // if 2021 #ifdef KMP_DEBUG 2022 { 2023 char *buff; 2024 // create format specifiers before the debug output 2025 buff = __kmp_str_format( 2026 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " 2027 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", 2028 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 2029 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status)); 2030 __kmp_str_free(&buff); 2031 } 2032 #endif 2033 #if INCLUDE_SSC_MARKS 2034 SSC_MARK_DISPATCH_NEXT(); 2035 #endif 2036 OMPT_LOOP_END; 2037 
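/* Worked example for the serialized nomerge path above (illustrative numbers
   only): with pr->u.p.tc == 10, chunk == 4 and a unit stride, trip == 9 and
   successive calls compute init = 0, 4, 8, returning the chunks [lb, lb+3],
   [lb+4, lb+7] and the clipped final chunk [lb+8, lb+9]; the next call sees
   init == 12 > trip and returns status 0. */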
KMP_STATS_LOOP_END; 2038 return status; 2039 } else { 2040 kmp_int32 last = 0; 2041 dispatch_shared_info_template<T> volatile *sh; 2042 2043 KMP_DEBUG_ASSERT(th->th.th_dispatch == 2044 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 2045 2046 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 2047 th->th.th_dispatch->th_dispatch_pr_current); 2048 KMP_DEBUG_ASSERT(pr); 2049 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 2050 th->th.th_dispatch->th_dispatch_sh_current); 2051 KMP_DEBUG_ASSERT(sh); 2052 2053 #if KMP_USE_HIER_SCHED 2054 if (pr->flags.use_hier) 2055 status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st); 2056 else 2057 #endif // KMP_USE_HIER_SCHED 2058 status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub, 2059 p_st, th->th.th_team_nproc, 2060 th->th.th_info.ds.ds_tid); 2061 // status == 0: no more iterations to execute 2062 if (status == 0) { 2063 UT num_done; 2064 2065 num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done); 2066 #ifdef KMP_DEBUG 2067 { 2068 char *buff; 2069 // create format specifiers before the debug output 2070 buff = __kmp_str_format( 2071 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", 2072 traits_t<UT>::spec); 2073 KD_TRACE(10, (buff, gtid, sh->u.s.num_done)); 2074 __kmp_str_free(&buff); 2075 } 2076 #endif 2077 2078 #if KMP_USE_HIER_SCHED 2079 pr->flags.use_hier = FALSE; 2080 #endif 2081 if ((ST)num_done == th->th.th_team_nproc - 1) { 2082 #if (KMP_STATIC_STEAL_ENABLED) 2083 if (pr->schedule == kmp_sch_static_steal && 2084 traits_t<T>::type_size > 4) { 2085 int i; 2086 kmp_info_t **other_threads = team->t.t_threads; 2087 // loop complete, safe to destroy locks used for stealing 2088 for (i = 0; i < th->th.th_team_nproc; ++i) { 2089 kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock; 2090 KMP_ASSERT(lck != NULL); 2091 __kmp_destroy_lock(lck); 2092 __kmp_free(lck); 2093 other_threads[i]->th.th_dispatch->th_steal_lock = NULL; 2094 } 2095 } 2096 #endif 2097 /* NOTE: release this buffer to be reused */ 2098 2099 KMP_MB(); /* Flush all pending memory write invalidates. */ 2100 2101 sh->u.s.num_done = 0; 2102 sh->u.s.iteration = 0; 2103 2104 /* TODO replace with general release procedure? */ 2105 if (pr->flags.ordered) { 2106 sh->u.s.ordered_iteration = 0; 2107 } 2108 2109 KMP_MB(); /* Flush all pending memory write invalidates. */ 2110 2111 sh->buffer_index += __kmp_dispatch_num_buffers; 2112 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", 2113 gtid, sh->buffer_index)); 2114 2115 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 2116 2117 } // if 2118 if (__kmp_env_consistency_check) { 2119 if (pr->pushed_ws != ct_none) { 2120 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 2121 } 2122 } 2123 2124 th->th.th_dispatch->th_deo_fcn = NULL; 2125 th->th.th_dispatch->th_dxo_fcn = NULL; 2126 th->th.th_dispatch->th_dispatch_sh_current = NULL; 2127 th->th.th_dispatch->th_dispatch_pr_current = NULL; 2128 } // if (status == 0) 2129 #if KMP_OS_WINDOWS 2130 else if (last) { 2131 pr->u.p.last_upper = pr->u.p.ub; 2132 } 2133 #endif /* KMP_OS_WINDOWS */ 2134 if (p_last != NULL && status != 0) 2135 *p_last = last; 2136 } // if 2137 2138 #ifdef KMP_DEBUG 2139 { 2140 char *buff; 2141 // create format specifiers before the debug output 2142 buff = __kmp_str_format( 2143 "__kmp_dispatch_next: T#%%d normal case: " 2144 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n", 2145 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 2146 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, 2147 (p_last ? *p_last : 0), status)); 2148 __kmp_str_free(&buff); 2149 } 2150 #endif 2151 #if INCLUDE_SSC_MARKS 2152 SSC_MARK_DISPATCH_NEXT(); 2153 #endif 2154 OMPT_LOOP_END; 2155 KMP_STATS_LOOP_END; 2156 return status; 2157 } 2158 2159 template <typename T> 2160 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid, 2161 kmp_int32 *plastiter, T *plower, T *pupper, 2162 typename traits_t<T>::signed_t incr) { 2163 typedef typename traits_t<T>::unsigned_t UT; 2164 kmp_uint32 team_id; 2165 kmp_uint32 nteams; 2166 UT trip_count; 2167 kmp_team_t *team; 2168 kmp_info_t *th; 2169 2170 KMP_DEBUG_ASSERT(plastiter && plower && pupper); 2171 KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid)); 2172 #ifdef KMP_DEBUG 2173 typedef typename traits_t<T>::signed_t ST; 2174 { 2175 char *buff; 2176 // create format specifiers before the debug output 2177 buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d " 2178 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n", 2179 traits_t<T>::spec, traits_t<T>::spec, 2180 traits_t<ST>::spec, traits_t<T>::spec); 2181 KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr)); 2182 __kmp_str_free(&buff); 2183 } 2184 #endif 2185 2186 if (__kmp_env_consistency_check) { 2187 if (incr == 0) { 2188 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, 2189 loc); 2190 } 2191 if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) { 2192 // The loop is illegal. 
2193 // Some zero-trip loops maintained by compiler, e.g.: 2194 // for(i=10;i<0;++i) // lower >= upper - run-time check 2195 // for(i=0;i>10;--i) // lower <= upper - run-time check 2196 // for(i=0;i>10;++i) // incr > 0 - compile-time check 2197 // for(i=10;i<0;--i) // incr < 0 - compile-time check 2198 // Compiler does not check the following illegal loops: 2199 // for(i=0;i<10;i+=incr) // where incr<0 2200 // for(i=10;i>0;i-=incr) // where incr<0 2201 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc); 2202 } 2203 } 2204 th = __kmp_threads[gtid]; 2205 team = th->th.th_team; 2206 #if OMP_40_ENABLED 2207 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct 2208 nteams = th->th.th_teams_size.nteams; 2209 #endif 2210 team_id = team->t.t_master_tid; 2211 KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc); 2212 2213 // compute global trip count 2214 if (incr == 1) { 2215 trip_count = *pupper - *plower + 1; 2216 } else if (incr == -1) { 2217 trip_count = *plower - *pupper + 1; 2218 } else if (incr > 0) { 2219 // upper-lower can exceed the limit of signed type 2220 trip_count = (UT)(*pupper - *plower) / incr + 1; 2221 } else { 2222 trip_count = (UT)(*plower - *pupper) / (-incr) + 1; 2223 } 2224 2225 if (trip_count <= nteams) { 2226 KMP_DEBUG_ASSERT( 2227 __kmp_static == kmp_sch_static_greedy || 2228 __kmp_static == 2229 kmp_sch_static_balanced); // Unknown static scheduling type. 2230 // only some teams get single iteration, others get nothing 2231 if (team_id < trip_count) { 2232 *pupper = *plower = *plower + team_id * incr; 2233 } else { 2234 *plower = *pupper + incr; // zero-trip loop 2235 } 2236 if (plastiter != NULL) 2237 *plastiter = (team_id == trip_count - 1); 2238 } else { 2239 if (__kmp_static == kmp_sch_static_balanced) { 2240 UT chunk = trip_count / nteams; 2241 UT extras = trip_count % nteams; 2242 *plower += 2243 incr * (team_id * chunk + (team_id < extras ? team_id : extras)); 2244 *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr); 2245 if (plastiter != NULL) 2246 *plastiter = (team_id == nteams - 1); 2247 } else { 2248 T chunk_inc_count = 2249 (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr; 2250 T upper = *pupper; 2251 KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy); 2252 // Unknown static scheduling type. 2253 *plower += team_id * chunk_inc_count; 2254 *pupper = *plower + chunk_inc_count - incr; 2255 // Check/correct bounds if needed 2256 if (incr > 0) { 2257 if (*pupper < *plower) 2258 *pupper = traits_t<T>::max_value; 2259 if (plastiter != NULL) 2260 *plastiter = *plower <= upper && *pupper > upper - incr; 2261 if (*pupper > upper) 2262 *pupper = upper; // tracker C73258 2263 } else { 2264 if (*pupper > *plower) 2265 *pupper = traits_t<T>::min_value; 2266 if (plastiter != NULL) 2267 *plastiter = *plower >= upper && *pupper < upper - incr; 2268 if (*pupper < upper) 2269 *pupper = upper; // tracker C73258 2270 } 2271 } 2272 } 2273 } 2274 2275 //----------------------------------------------------------------------------- 2276 // Dispatch routines 2277 // Transfer call to template< type T > 2278 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule, 2279 // T lb, T ub, ST st, ST chunk ) 2280 extern "C" { 2281 2282 /*! 
2283 @ingroup WORK_SHARING 2284 @{ 2285 @param loc Source location 2286 @param gtid Global thread id 2287 @param schedule Schedule type 2288 @param lb Lower bound 2289 @param ub Upper bound 2290 @param st Step (or increment) 2291 @param chunk The chunk size to block with 2292 2293 This function prepares the runtime to start a dynamically scheduled for loop, 2294 saving the loop arguments. 2295 These functions are all identical apart from the types of the arguments. 2296 */ 2297 2298 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2299 enum sched_type schedule, kmp_int32 lb, 2300 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) { 2301 KMP_DEBUG_ASSERT(__kmp_init_serial); 2302 #if OMPT_SUPPORT && OMPT_OPTIONAL 2303 OMPT_STORE_RETURN_ADDRESS(gtid); 2304 #endif 2305 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2306 } 2307 /*! 2308 See @ref __kmpc_dispatch_init_4 2309 */ 2310 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2311 enum sched_type schedule, kmp_uint32 lb, 2312 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) { 2313 KMP_DEBUG_ASSERT(__kmp_init_serial); 2314 #if OMPT_SUPPORT && OMPT_OPTIONAL 2315 OMPT_STORE_RETURN_ADDRESS(gtid); 2316 #endif 2317 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2318 } 2319 2320 /*! 2321 See @ref __kmpc_dispatch_init_4 2322 */ 2323 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2324 enum sched_type schedule, kmp_int64 lb, 2325 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) { 2326 KMP_DEBUG_ASSERT(__kmp_init_serial); 2327 #if OMPT_SUPPORT && OMPT_OPTIONAL 2328 OMPT_STORE_RETURN_ADDRESS(gtid); 2329 #endif 2330 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2331 } 2332 2333 /*! 2334 See @ref __kmpc_dispatch_init_4 2335 */ 2336 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2337 enum sched_type schedule, kmp_uint64 lb, 2338 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) { 2339 KMP_DEBUG_ASSERT(__kmp_init_serial); 2340 #if OMPT_SUPPORT && OMPT_OPTIONAL 2341 OMPT_STORE_RETURN_ADDRESS(gtid); 2342 #endif 2343 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2344 } 2345 2346 /*! 2347 See @ref __kmpc_dispatch_init_4 2348 2349 These functions differ from the __kmpc_dispatch_init set in that they are 2350 called for the composite distribute parallel for construct. Thus, before the 2351 regular iterations are dispatched, the per-team iteration space must be computed. 2352 2353 These functions are all identical apart from the types of the arguments.
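
For example (illustrative numbers only, assuming the balanced split selected
when __kmp_static == kmp_sch_static_balanced): 10 iterations with lb=0, ub=9,
st=1 spread over 4 teams give chunk = 10/4 = 2 and extras = 10%4 = 2 in
__kmp_dist_get_bounds above, so teams 0..3 receive the per-team sub-ranges
[0,2], [3,5], [6,7] and [8,9] before the regular dispatching of each
sub-range starts.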
2354 */ 2355 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2356 enum sched_type schedule, kmp_int32 *p_last, 2357 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, 2358 kmp_int32 chunk) { 2359 KMP_DEBUG_ASSERT(__kmp_init_serial); 2360 #if OMPT_SUPPORT && OMPT_OPTIONAL 2361 OMPT_STORE_RETURN_ADDRESS(gtid); 2362 #endif 2363 __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st); 2364 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2365 } 2366 2367 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2368 enum sched_type schedule, kmp_int32 *p_last, 2369 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, 2370 kmp_int32 chunk) { 2371 KMP_DEBUG_ASSERT(__kmp_init_serial); 2372 #if OMPT_SUPPORT && OMPT_OPTIONAL 2373 OMPT_STORE_RETURN_ADDRESS(gtid); 2374 #endif 2375 __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st); 2376 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2377 } 2378 2379 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2380 enum sched_type schedule, kmp_int32 *p_last, 2381 kmp_int64 lb, kmp_int64 ub, kmp_int64 st, 2382 kmp_int64 chunk) { 2383 KMP_DEBUG_ASSERT(__kmp_init_serial); 2384 #if OMPT_SUPPORT && OMPT_OPTIONAL 2385 OMPT_STORE_RETURN_ADDRESS(gtid); 2386 #endif 2387 __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st); 2388 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2389 } 2390 2391 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2392 enum sched_type schedule, kmp_int32 *p_last, 2393 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, 2394 kmp_int64 chunk) { 2395 KMP_DEBUG_ASSERT(__kmp_init_serial); 2396 #if OMPT_SUPPORT && OMPT_OPTIONAL 2397 OMPT_STORE_RETURN_ADDRESS(gtid); 2398 #endif 2399 __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st); 2400 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2401 } 2402 2403 /*! 2404 @param loc Source code location 2405 @param gtid Global thread id 2406 @param p_last Pointer to a flag set to one if this is the last chunk or zero 2407 otherwise 2408 @param p_lb Pointer to the lower bound for the next chunk of work 2409 @param p_ub Pointer to the upper bound for the next chunk of work 2410 @param p_st Pointer to the stride for the next chunk of work 2411 @return one if there is work to be done, zero otherwise 2412 2413 Get the next dynamically allocated chunk of work for this thread. 2414 If there is no more work, then the lb,ub and stride need not be modified. 2415 */ 2416 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2417 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) { 2418 #if OMPT_SUPPORT && OMPT_OPTIONAL 2419 OMPT_STORE_RETURN_ADDRESS(gtid); 2420 #endif 2421 return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st 2422 #if OMPT_SUPPORT && OMPT_OPTIONAL 2423 , 2424 OMPT_LOAD_RETURN_ADDRESS(gtid) 2425 #endif 2426 ); 2427 } 2428 2429 /*! 2430 See @ref __kmpc_dispatch_next_4 2431 */ 2432 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2433 kmp_uint32 *p_lb, kmp_uint32 *p_ub, 2434 kmp_int32 *p_st) { 2435 #if OMPT_SUPPORT && OMPT_OPTIONAL 2436 OMPT_STORE_RETURN_ADDRESS(gtid); 2437 #endif 2438 return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st 2439 #if OMPT_SUPPORT && OMPT_OPTIONAL 2440 , 2441 OMPT_LOAD_RETURN_ADDRESS(gtid) 2442 #endif 2443 ); 2444 } 2445 2446 /*! 
2447 See @ref __kmpc_dispatch_next_4 2448 */ 2449 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2450 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) { 2451 #if OMPT_SUPPORT && OMPT_OPTIONAL 2452 OMPT_STORE_RETURN_ADDRESS(gtid); 2453 #endif 2454 return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st 2455 #if OMPT_SUPPORT && OMPT_OPTIONAL 2456 , 2457 OMPT_LOAD_RETURN_ADDRESS(gtid) 2458 #endif 2459 ); 2460 } 2461 2462 /*! 2463 See @ref __kmpc_dispatch_next_4 2464 */ 2465 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2466 kmp_uint64 *p_lb, kmp_uint64 *p_ub, 2467 kmp_int64 *p_st) { 2468 #if OMPT_SUPPORT && OMPT_OPTIONAL 2469 OMPT_STORE_RETURN_ADDRESS(gtid); 2470 #endif 2471 return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st 2472 #if OMPT_SUPPORT && OMPT_OPTIONAL 2473 , 2474 OMPT_LOAD_RETURN_ADDRESS(gtid) 2475 #endif 2476 ); 2477 } 2478 2479 /*! 2480 @param loc Source code location 2481 @param gtid Global thread id 2482 2483 Mark the end of a dynamic loop. 2484 */ 2485 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) { 2486 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2487 } 2488 2489 /*! 2490 See @ref __kmpc_dispatch_fini_4 2491 */ 2492 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) { 2493 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2494 } 2495 2496 /*! 2497 See @ref __kmpc_dispatch_fini_4 2498 */ 2499 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) { 2500 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2501 } 2502 2503 /*! 2504 See @ref __kmpc_dispatch_fini_4 2505 */ 2506 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) { 2507 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2508 } 2509 /*! @} */ 2510 2511 //----------------------------------------------------------------------------- 2512 // Non-template routines from kmp_dispatch.cpp used in other sources 2513 2514 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) { 2515 return value == checker; 2516 } 2517 2518 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) { 2519 return value != checker; 2520 } 2521 2522 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) { 2523 return value < checker; 2524 } 2525 2526 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) { 2527 return value >= checker; 2528 } 2529 2530 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) { 2531 return value <= checker; 2532 } 2533 2534 kmp_uint32 2535 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker, 2536 kmp_uint32 (*pred)(kmp_uint32, kmp_uint32), 2537 void *obj // Higher-level synchronization object, or NULL. 2538 ) { 2539 // note: we may not belong to a team at this point 2540 volatile kmp_uint32 *spin = spinner; 2541 kmp_uint32 check = checker; 2542 kmp_uint32 spins; 2543 kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred; 2544 kmp_uint32 r; 2545 2546 KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin)); 2547 KMP_INIT_YIELD(spins); 2548 // main wait spin loop 2549 while (!f(r = TCR_4(*spin), check)) { 2550 KMP_FSYNC_SPIN_PREPARE(obj); 2551 /* GEH - remove this since it was accidentally introduced when kmp_wait was 2552 split. 
It causes problems with infinite recursion because of exit lock */ 2553 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 2554 __kmp_abort_thread(); */ 2555 KMP_YIELD_OVERSUB_ELSE_SPIN(spins); 2556 } 2557 KMP_FSYNC_SPIN_ACQUIRED(obj); 2558 return r; 2559 } 2560 2561 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker, 2562 kmp_uint32 (*pred)(void *, kmp_uint32), 2563 void *obj // Higher-level synchronization object, or NULL. 2564 ) { 2565 // note: we may not belong to a team at this point 2566 void *spin = spinner; 2567 kmp_uint32 check = checker; 2568 kmp_uint32 spins; 2569 kmp_uint32 (*f)(void *, kmp_uint32) = pred; 2570 2571 KMP_FSYNC_SPIN_INIT(obj, spin); 2572 KMP_INIT_YIELD(spins); 2573 // main wait spin loop 2574 while (!f(spin, check)) { 2575 KMP_FSYNC_SPIN_PREPARE(obj); 2576 /* if we have waited a bit, or are noversubscribed, yield */ 2577 /* pause is in the following code */ 2578 KMP_YIELD_OVERSUB_ELSE_SPIN(spins); 2579 } 2580 KMP_FSYNC_SPIN_ACQUIRED(obj); 2581 } 2582 2583 } // extern "C" 2584 2585 #ifdef KMP_GOMP_COMPAT 2586 2587 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2588 enum sched_type schedule, kmp_int32 lb, 2589 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk, 2590 int push_ws) { 2591 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, 2592 push_ws); 2593 } 2594 2595 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2596 enum sched_type schedule, kmp_uint32 lb, 2597 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk, 2598 int push_ws) { 2599 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, 2600 push_ws); 2601 } 2602 2603 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2604 enum sched_type schedule, kmp_int64 lb, 2605 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk, 2606 int push_ws) { 2607 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, 2608 push_ws); 2609 } 2610 2611 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2612 enum sched_type schedule, kmp_uint64 lb, 2613 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk, 2614 int push_ws) { 2615 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, 2616 push_ws); 2617 } 2618 2619 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) { 2620 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc); 2621 } 2622 2623 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) { 2624 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc); 2625 } 2626 2627 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) { 2628 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc); 2629 } 2630 2631 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) { 2632 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc); 2633 } 2634 2635 #endif /* KMP_GOMP_COMPAT */ 2636 2637 /* ------------------------------------------------------------------------ */ 2638
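/* Illustrative sketch (not part of the runtime): the 4-byte predicates and
   __kmp_wait_4() above are meant to be combined by other parts of the library
   roughly as follows; "release_flag" is a hypothetical shared counter used
   only for this example.

     volatile kmp_uint32 release_flag = 0;

     // Waiter: spin (with the yield/pause backoff inside __kmp_wait_4) until
     // another thread stores a value >= 1 into release_flag; the value that
     // finally satisfied the predicate is returned.
     kmp_uint32 observed = __kmp_wait_4(&release_flag, 1, __kmp_ge_4, NULL);

   Passing NULL as the last argument simply means there is no higher-level
   synchronization object to associate with the spin. */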