/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

/* Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside any dispatch loop; however, it may
 *       change between parallel regions. __kmp_max_nth is the largest value
 *       __kmp_nth may take, 1 is the smallest.
 */

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_USE_X87CONTROL
#include <float.h>
#endif
#include "kmp_lock.h"
#include "kmp_dispatch.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(gtid_ref);

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_root->r.r_active &&
        (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }
}

void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
      __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
    }
  }
}

// Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
                                         bool use_hier = false) {
  // Pick up the nonmonotonic/monotonic bits from the scheduling type
  // TODO: make nonmonotonic when static_steal is fixed
  int monotonicity = SCHEDULE_MONOTONIC;

  // Let the default be monotonic for executables
  // compiled by OpenMP* 4.5 or earlier compilers
  if (loc->get_openmp_version() < 50)
    monotonicity = SCHEDULE_MONOTONIC;

  if (use_hier || __kmp_force_monotonic)
    monotonicity = SCHEDULE_MONOTONIC;
  else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
    monotonicity = SCHEDULE_NONMONOTONIC;
  else if (SCHEDULE_HAS_MONOTONIC(schedule))
    monotonicity = SCHEDULE_MONOTONIC;

  return monotonicity;
}
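
// Illustrative note (added commentary, not part of the original code): with
// the rules above, schedule(nonmonotonic : dynamic, 4) maps to
// SCHEDULE_NONMONOTONIC and schedule(monotonic : guided) to
// SCHEDULE_MONOTONIC, while an unmodified clause currently defaults to
// SCHEDULE_MONOTONIC. use_hier and __kmp_force_monotonic take precedence and
// always force SCHEDULE_MONOTONIC; ordered loops are forced monotonic by the
// caller below.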

// Initialize a dispatch_private_info_template<T> buffer for a particular
// type of schedule and chunk.  The loop description is found in lb (lower
// bound), ub (upper bound), and st (stride).  nproc is the number of threads
// relevant to the scheduling (often the number of threads in a team, but not
// always if hierarchical scheduling is used).  tid is the id of the thread
// calling the function within the group of nproc threads.  It will have a
// value between 0 and nproc - 1.  This is often just the thread id within a
// team, but is not necessarily the case when using hierarchical scheduling.
// loc is the source file location of the corresponding loop
// gtid is the global thread id
template <typename T>
void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                   dispatch_private_info_template<T> *pr,
                                   enum sched_type schedule, T lb, T ub,
                                   typename traits_t<T>::signed_t st,
#if USE_ITT_BUILD
                                   kmp_uint64 *cur_chunk,
#endif
                                   typename traits_t<T>::signed_t chunk,
                                   T nproc, T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::floating_t DBL;

  int active;
  T tc;
  kmp_info_t *th;
  kmp_team_t *team;
  int monotonicity;
  bool use_hier;

#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
                            "pr:%%p lb:%%%s ub:%%%s st:%%%s "
                            "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<ST>::spec,
                            traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;

#if USE_ITT_BUILD
  int itt_need_metadata_reporting =
      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
      team->t.t_active_level == 1;
#endif

#if KMP_USE_HIER_SCHED
  use_hier = pr->flags.use_hier;
#else
  use_hier = false;
#endif

  /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
  monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

  /* Pick up the nomerge/ordered bits from the scheduling type */
  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
    pr->flags.nomerge = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
  } else {
    pr->flags.nomerge = FALSE;
  }
  pr->type_size = traits_t<T>::type_size; // remember the size of variables
  if (kmp_ord_lower & schedule) {
    pr->flags.ordered = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
  } else {
    pr->flags.ordered = FALSE;
  }
  // Ordered overrides nonmonotonic
  if (pr->flags.ordered) {
    monotonicity = SCHEDULE_MONOTONIC;
  }

  if (schedule == kmp_sch_static) {
    schedule = __kmp_static;
  } else {
    if (schedule == kmp_sch_runtime) {
      // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
      // not specified)
      schedule = team->t.t_sched.r_sched_type;
      monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
      // Detail the schedule if needed (global controls are differentiated
      // appropriately)
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      } else if (schedule == kmp_sch_static) {
        schedule = __kmp_static;
      }
      // Use the chunk size specified by OMP_SCHEDULE (or default if not
      // specified)
      chunk = team->t.t_sched.chunk;
#if USE_ITT_BUILD
      if (cur_chunk)
        *cur_chunk = chunk;
#endif
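      // Illustrative example (added commentary, not part of the original
      // code): with OMP_SCHEDULE="guided,64" the lines above typically leave
      // schedule == __kmp_guided and chunk == 64 at this point; with an unset
      // OMP_SCHEDULE the library default recorded in team->t.t_sched is used
      // instead.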
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
                                "schedule:%%d chunk:%%%s\n",
                                traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    } else {
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      }
      if (chunk <= 0) {
        chunk = KMP_DEFAULT_CHUNK;
      }
    }

    if (schedule == kmp_sch_auto) {
      // mapping and differentiation: in the __kmp_do_serial_initialize()
      schedule = __kmp_auto;
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
            "schedule:%%d chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }
#if KMP_STATIC_STEAL_ENABLED
    // map nonmonotonic:dynamic to static steal
    if (schedule == kmp_sch_dynamic_chunked) {
      if (monotonicity == SCHEDULE_NONMONOTONIC)
        schedule = kmp_sch_static_steal;
    }
#endif
    /* guided analytical not safe for too many threads */
    if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
      schedule = kmp_sch_guided_iterative_chunked;
      KMP_WARNING(DispatchManyThreads);
    }
    if (schedule == kmp_sch_runtime_simd) {
      // compiler provides simd_width in the chunk parameter
      schedule = team->t.t_sched.r_sched_type;
      monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
      // Detail the schedule if needed (global controls are differentiated
      // appropriately)
      if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
          schedule == __kmp_static) {
        schedule = kmp_sch_static_balanced_chunked;
      } else {
        if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
          schedule = kmp_sch_guided_simd;
        }
        chunk = team->t.t_sched.chunk * chunk;
      }
#if USE_ITT_BUILD
      if (cur_chunk)
        *cur_chunk = chunk;
#endif
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
            " chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }
    pr->u.p.parm1 = chunk;
  }
  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
              "unknown scheduling type");

  pr->u.p.count = 0;

  if (__kmp_env_consistency_check) {
    if (st == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
                            (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
    }
  }
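  // Worked example (added commentary, not part of the original code): for
  // lb = 0, ub = 9, st = 2 the block below yields tc = (9 - 0) / 2 + 1 = 5
  // iterations; for lb = 10, ub = 1, st = -3 it yields (10 - 1) / 3 + 1 = 4.
  // The casts to the unsigned type UT keep the subtraction well defined when
  // the bounds span more than half of the signed range.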
  // compute trip count
  if (st == 1) { // most common case
    if (ub >= lb) {
      tc = ub - lb + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  } else if (st < 0) {
    if (lb >= ub) {
      // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(lb - ub) / (-st) + 1;
    } else { // lb < ub
      tc = 0; // zero-trip
    }
  } else { // st > 0
    if (ub >= lb) {
      // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(ub - lb) / st + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  }

#if KMP_STATS_ENABLED
  if (KMP_MASTER_GTID(gtid)) {
    KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
  }
#endif

  pr->u.p.lb = lb;
  pr->u.p.ub = ub;
  pr->u.p.st = st;
  pr->u.p.tc = tc;

#if KMP_OS_WINDOWS
  pr->u.p.last_upper = ub + st;
#endif /* KMP_OS_WINDOWS */

  /* NOTE: only the active parallel region(s) have active ordered sections */

  if (active) {
    if (pr->flags.ordered) {
      pr->ordered_bumped = 0;
      pr->u.p.ordered_lower = 1;
      pr->u.p.ordered_upper = 0;
    }
  }

  switch (schedule) {
#if (KMP_STATIC_STEAL_ENABLED)
  case kmp_sch_static_steal: {
    T ntc, init;

    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    ntc = (tc % chunk ? 1 : 0) + tc / chunk;
    if (nproc > 1 && ntc >= nproc) {
      KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
      T id = tid;
      T small_chunk, extras;

      small_chunk = ntc / nproc;
      extras = ntc % nproc;

      init = id * small_chunk + (id < extras ? id : extras);
      pr->u.p.count = init;
      pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);

      pr->u.p.parm2 = lb;
      // parm3 is the number of times to attempt stealing, which is
      // proportional to the number of chunks per thread, capped at nproc.
      pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc);
      pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
      pr->u.p.st = st;
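      // Worked example (added commentary, not part of the original code):
      // with tc = 100, chunk = 7 and nproc = 4 there are ntc = 15 chunks;
      // small_chunk = 3 and extras = 3, so threads 0..3 start owning chunk
      // ranges [0,4), [4,8), [8,12) and [12,15) (count is the first owned
      // chunk index, ub the exclusive upper index); any remaining imbalance
      // is evened out later by stealing.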
      if (traits_t<T>::type_size > 4) {
        // AC: TODO: check if 16-byte CAS available and use it to
        // improve performance (probably wait for explicit request
        // before spending time on this).
        // For now use dynamically allocated per-thread lock,
        // free memory in __kmp_dispatch_next when status==0.
        KMP_DEBUG_ASSERT(pr->u.p.th_steal_lock == NULL);
        pr->u.p.th_steal_lock =
            (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
        __kmp_init_lock(pr->u.p.th_steal_lock);
      }
      break;
    } else {
      /* too few chunks: switching to kmp_sch_dynamic_chunked */
      schedule = kmp_sch_dynamic_chunked;
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
                     "kmp_sch_dynamic_chunked\n",
                     gtid));
      goto dynamic_init;
      break;
    } // if
  } // case
#endif
  case kmp_sch_static_balanced: {
    T init, limit;

    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));

    if (nproc > 1) {
      T id = tid;

      if (tc < nproc) {
        if (id < tc) {
          init = id;
          limit = id;
          pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
        } else {
          pr->u.p.count = 1; /* means no more chunks to execute */
          pr->u.p.parm1 = FALSE;
          break;
        }
      } else {
        T small_chunk = tc / nproc;
        T extras = tc % nproc;
        init = id * small_chunk + (id < extras ? id : extras);
        limit = init + small_chunk - (id < extras ? 0 : 1);
        pr->u.p.parm1 = (id == nproc - 1);
      }
    } else {
      if (tc > 0) {
        init = 0;
        limit = tc - 1;
        pr->u.p.parm1 = TRUE;
      } else {
        // zero trip count
        pr->u.p.count = 1; /* means no more chunks to execute */
        pr->u.p.parm1 = FALSE;
        break;
      }
    }
#if USE_ITT_BUILD
    // Calculate chunk for metadata report
    if (itt_need_metadata_reporting)
      if (cur_chunk)
        *cur_chunk = limit - init + 1;
#endif
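    // Worked example (added commentary, not part of the original code):
    // tc = 10 iterations on nproc = 4 threads gives small_chunk = 2 and
    // extras = 2, so the per-thread [init, limit] ranges computed above are
    // [0,2], [3,5], [6,7] and [8,9]; the code below converts these iteration
    // indexes into actual loop bounds using lb and st.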
    if (st == 1) {
      pr->u.p.lb = lb + init;
      pr->u.p.ub = lb + limit;
    } else {
      // calculated upper bound, "ub" is user-defined upper bound
      T ub_tmp = lb + limit * st;
      pr->u.p.lb = lb + init * st;
      // adjust upper bound to "ub" if needed, so that MS lastprivate will match
      // it exactly
      if (st > 0) {
        pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
      } else {
        pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
      }
    }
    if (pr->flags.ordered) {
      pr->u.p.ordered_lower = init;
      pr->u.p.ordered_upper = limit;
    }
    break;
  } // case
  case kmp_sch_static_balanced_chunked: {
    // similar to balanced, but chunk adjusted to multiple of simd width
    T nth = nproc;
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
                   " -> falling-through to static_greedy\n",
                   gtid));
    schedule = kmp_sch_static_greedy;
    if (nth > 1)
      pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
    else
      pr->u.p.parm1 = tc;
    break;
  } // case
  case kmp_sch_guided_simd:
  case kmp_sch_guided_iterative_chunked: {
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
         " case\n",
         gtid));

    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
        goto dynamic_init;
      } else {
        // when remaining iters become less than parm2 - switch to dynamic
        pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
        *(double *)&pr->u.p.parm3 =
            guided_flt_param / (double)nproc; // may occupy parm3 and parm4
      }
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
      KD_TRACE(
          100,
          ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
           gtid));
      pr->u.p.parm1 = tc;
    } // if
  } // case
  break;
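  // Illustrative note (added commentary, not part of the original code): for
  // the iterative guided schedules each grab performed later in
  // __kmp_dispatch_next_algorithm is roughly
  // remaining * (guided_flt_param / nproc) iterations, and once fewer than
  // parm2 = guided_int_param * nproc * (chunk + 1) iterations remain the
  // dispatcher falls back to plain dynamic chunks of the requested size.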
  case kmp_sch_guided_analytical_chunked: {
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
                   gtid));

    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
        goto dynamic_init;
      } else {
        /* commonly used term: (2 nproc - 1)/(2 nproc) */
        DBL x;

#if KMP_USE_X87CONTROL
        /* Linux* OS already has 64-bit computation by default for long double,
           and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
           Windows* OS on IA-32 architecture, we need to set precision to
           64-bit instead of the default 53-bit. Even though long double
           doesn't work on Windows* OS on Intel(R) 64, the resulting lack of
           precision is not expected to impact the correctness of the
           algorithm, but this has not been mathematically proven. */
        // save original FPCW and set precision to 64-bit, as
        // Windows* OS on IA-32 architecture defaults to 53-bit
        unsigned int oldFpcw = _control87(0, 0);
        _control87(_PC_64, _MCW_PC); // 0,0x30000
#endif
        /* value used for comparison in solver for cross-over point */
        long double target = ((long double)chunk * 2 + 1) * nproc / tc;

        /* crossover point--chunk indexes equal to or greater than
           this point switch to dynamic-style scheduling */
        UT cross;

        /* commonly used term: (2 nproc - 1)/(2 nproc) */
        x = 1.0 - 0.5 / (double)nproc;

#ifdef KMP_DEBUG
        { // test natural alignment
          struct _test_a {
            char a;
            union {
              char b;
              DBL d;
            };
          } t;
          ptrdiff_t natural_alignment =
              (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
          //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
          // long)natural_alignment );
          KMP_DEBUG_ASSERT(
              (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
        }
#endif // KMP_DEBUG

        /* save the term in thread private dispatch structure */
        *(DBL *)&pr->u.p.parm3 = x;

        /* solve for the crossover point to the nearest integer i for which C_i
           <= chunk */
        {
          UT left, right, mid;
          long double p;

          /* estimate initial upper and lower bound */

          /* doesn't matter what value right is as long as it is positive, but
             it affects performance of the solver */
          right = 229;
          p = __kmp_pow<UT>(x, right);
          if (p > target) {
            do {
              p *= p;
              right <<= 1;
            } while (p > target && right < (1 << 27));
            /* lower bound is previous (failed) estimate of upper bound */
            left = right >> 1;
          } else {
            left = 0;
          }

          /* bisection root-finding method */
          while (left + 1 < right) {
            mid = (left + right) / 2;
            if (__kmp_pow<UT>(x, mid) > target) {
              left = mid;
            } else {
              right = mid;
            }
          } // while
          cross = right;
        }
        /* assert sanity of computed crossover point */
        KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
                   __kmp_pow<UT>(x, cross) <= target);

        /* save the crossover point in thread private dispatch structure */
        pr->u.p.parm2 = cross;

// C75803
#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
#else
#define GUIDED_ANALYTICAL_WORKAROUND (x)
#endif
        /* dynamic-style scheduling offset */
        pr->u.p.count = tc -
                        __kmp_dispatch_guided_remaining(
                            tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
                        cross * chunk;
#if KMP_USE_X87CONTROL
        // restore FPCW
        _control87(oldFpcw, _MCW_PC);
#endif
      } // if
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
      pr->u.p.parm1 = tc;
    } // if
  } // case
  break;
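  // Illustrative note (added commentary, not part of the original code): the
  // bisection above finds the unique cross with
  // __kmp_pow(x, cross - 1) > target >= __kmp_pow(x, cross) (see the
  // assertion), i.e. the first chunk index at which the geometrically
  // shrinking guided chunk drops to the requested chunk size; chunk indexes
  // >= cross are then served dynamic-style, which is why pr->u.p.count is
  // pre-biased above.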
  case kmp_sch_static_greedy:
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
         gtid));
    pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
    break;
  case kmp_sch_static_chunked:
  case kmp_sch_dynamic_chunked:
  dynamic_init:
    if (pr->u.p.parm1 <= 0)
      pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
    else if (pr->u.p.parm1 > tc)
      pr->u.p.parm1 = tc;
    // Store the total number of chunks to prevent integer overflow during
    // bounds calculations in the get next chunk routine.
    pr->u.p.parm2 = (tc / pr->u.p.parm1) + (tc % pr->u.p.parm1 ? 1 : 0);
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
                   gtid));
    break;
  case kmp_sch_trapezoidal: {
    /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */

    T parm1, parm2, parm3, parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
              gtid));

    parm1 = chunk;

    /* F : size of the first cycle */
    parm2 = (tc / (2 * nproc));

    if (parm2 < 1) {
      parm2 = 1;
    }

    /* L : size of the last cycle.  Make sure the last cycle is not larger
       than the first cycle. */
    if (parm1 < 1) {
      parm1 = 1;
    } else if (parm1 > parm2) {
      parm1 = parm2;
    }

    /* N : number of cycles */
    parm3 = (parm2 + parm1);
    parm3 = (2 * tc + parm3 - 1) / parm3;

    if (parm3 < 2) {
      parm3 = 2;
    }

    /* sigma : decreasing incr of the trapezoid */
    parm4 = (parm3 - 1);
    parm4 = (parm2 - parm1) / parm4;

    // pointless check, because parm4 >= 0 always
    // if ( parm4 < 0 ) {
    //   parm4 = 0;
    //}

    pr->u.p.parm1 = parm1;
    pr->u.p.parm2 = parm2;
    pr->u.p.parm3 = parm3;
    pr->u.p.parm4 = parm4;
  } // case
  break;

  default: {
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                KMP_HNT(GetNewerLibrary), // Hint
                __kmp_msg_null // Variadic argument list terminator
    );
  } break;
  } // switch
  pr->schedule = schedule;
}

#if KMP_USE_HIER_SCHED
template <typename T>
inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
                                             typename traits_t<T>::signed_t st);
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
                                            kmp_int32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_int32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
                                             kmp_uint32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
                                            kmp_int64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_int64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
                                             kmp_uint64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}

// free all the hierarchy scheduling memory associated with the team
void
__kmp_dispatch_free_hierarchies(kmp_team_t *team) { 757 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2; 758 for (int i = 0; i < num_disp_buff; ++i) { 759 // type does not matter here so use kmp_int32 760 auto sh = 761 reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>( 762 &team->t.t_disp_buffer[i]); 763 if (sh->hier) { 764 sh->hier->deallocate(); 765 __kmp_free(sh->hier); 766 } 767 } 768 } 769 #endif 770 771 // UT - unsigned flavor of T, ST - signed flavor of T, 772 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8 773 template <typename T> 774 static void 775 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb, 776 T ub, typename traits_t<T>::signed_t st, 777 typename traits_t<T>::signed_t chunk, int push_ws) { 778 typedef typename traits_t<T>::unsigned_t UT; 779 780 int active; 781 kmp_info_t *th; 782 kmp_team_t *team; 783 kmp_uint32 my_buffer_index; 784 dispatch_private_info_template<T> *pr; 785 dispatch_shared_info_template<T> volatile *sh; 786 787 KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) == 788 sizeof(dispatch_private_info)); 789 KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) == 790 sizeof(dispatch_shared_info)); 791 __kmp_assert_valid_gtid(gtid); 792 793 if (!TCR_4(__kmp_init_parallel)) 794 __kmp_parallel_initialize(); 795 796 __kmp_resume_if_soft_paused(); 797 798 #if INCLUDE_SSC_MARKS 799 SSC_MARK_DISPATCH_INIT(); 800 #endif 801 #ifdef KMP_DEBUG 802 typedef typename traits_t<T>::signed_t ST; 803 { 804 char *buff; 805 // create format specifiers before the debug output 806 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d " 807 "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n", 808 traits_t<ST>::spec, traits_t<T>::spec, 809 traits_t<T>::spec, traits_t<ST>::spec); 810 KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st)); 811 __kmp_str_free(&buff); 812 } 813 #endif 814 /* setup data */ 815 th = __kmp_threads[gtid]; 816 team = th->th.th_team; 817 active = !team->t.t_serialized; 818 th->th.th_ident = loc; 819 820 // Any half-decent optimizer will remove this test when the blocks are empty 821 // since the macros expand to nothing 822 // when statistics are disabled. 823 if (schedule == __kmp_static) { 824 KMP_COUNT_BLOCK(OMP_LOOP_STATIC); 825 } else { 826 KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC); 827 } 828 829 #if KMP_USE_HIER_SCHED 830 // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable 831 // Hierarchical scheduling does not work with ordered, so if ordered is 832 // detected, then revert back to threaded scheduling. 833 bool ordered; 834 enum sched_type my_sched = schedule; 835 my_buffer_index = th->th.th_dispatch->th_disp_index; 836 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 837 &th->th.th_dispatch 838 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 839 my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched); 840 if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper)) 841 my_sched = 842 (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower)); 843 ordered = (kmp_ord_lower & my_sched); 844 if (pr->flags.use_hier) { 845 if (ordered) { 846 KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. 
" 847 "Disabling hierarchical scheduling.\n", 848 gtid)); 849 pr->flags.use_hier = FALSE; 850 } 851 } 852 if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) { 853 // Don't use hierarchical for ordered parallel loops and don't 854 // use the runtime hierarchy if one was specified in the program 855 if (!ordered && !pr->flags.use_hier) 856 __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st); 857 } 858 #endif // KMP_USE_HIER_SCHED 859 860 #if USE_ITT_BUILD 861 kmp_uint64 cur_chunk = chunk; 862 int itt_need_metadata_reporting = 863 __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 864 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && 865 team->t.t_active_level == 1; 866 #endif 867 if (!active) { 868 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 869 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 870 } else { 871 KMP_DEBUG_ASSERT(th->th.th_dispatch == 872 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 873 874 my_buffer_index = th->th.th_dispatch->th_disp_index++; 875 876 /* What happens when number of threads changes, need to resize buffer? */ 877 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 878 &th->th.th_dispatch 879 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 880 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 881 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 882 KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid, 883 my_buffer_index)); 884 } 885 886 __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st, 887 #if USE_ITT_BUILD 888 &cur_chunk, 889 #endif 890 chunk, (T)th->th.th_team_nproc, 891 (T)th->th.th_info.ds.ds_tid); 892 if (active) { 893 if (pr->flags.ordered == 0) { 894 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error; 895 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error; 896 } else { 897 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>; 898 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>; 899 } 900 } 901 902 if (active) { 903 /* The name of this buffer should be my_buffer_index when it's free to use 904 * it */ 905 906 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d " 907 "sh->buffer_index:%d\n", 908 gtid, my_buffer_index, sh->buffer_index)); 909 __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index, 910 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL)); 911 // Note: KMP_WAIT() cannot be used there: buffer index and 912 // my_buffer_index are *always* 32-bit integers. 913 KMP_MB(); /* is this necessary? 
*/ 914 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d " 915 "sh->buffer_index:%d\n", 916 gtid, my_buffer_index, sh->buffer_index)); 917 918 th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr; 919 th->th.th_dispatch->th_dispatch_sh_current = 920 CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh); 921 #if USE_ITT_BUILD 922 if (pr->flags.ordered) { 923 __kmp_itt_ordered_init(gtid); 924 } 925 // Report loop metadata 926 if (itt_need_metadata_reporting) { 927 // Only report metadata by primary thread of active team at level 1 928 kmp_uint64 schedtype = 0; 929 switch (schedule) { 930 case kmp_sch_static_chunked: 931 case kmp_sch_static_balanced: // Chunk is calculated in the switch above 932 break; 933 case kmp_sch_static_greedy: 934 cur_chunk = pr->u.p.parm1; 935 break; 936 case kmp_sch_dynamic_chunked: 937 schedtype = 1; 938 break; 939 case kmp_sch_guided_iterative_chunked: 940 case kmp_sch_guided_analytical_chunked: 941 case kmp_sch_guided_simd: 942 schedtype = 2; 943 break; 944 default: 945 // Should we put this case under "static"? 946 // case kmp_sch_static_steal: 947 schedtype = 3; 948 break; 949 } 950 __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk); 951 } 952 #if KMP_USE_HIER_SCHED 953 if (pr->flags.use_hier) { 954 pr->u.p.count = 0; 955 pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0; 956 } 957 #endif // KMP_USER_HIER_SCHED 958 #endif /* USE_ITT_BUILD */ 959 } 960 961 #ifdef KMP_DEBUG 962 { 963 char *buff; 964 // create format specifiers before the debug output 965 buff = __kmp_str_format( 966 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s " 967 "lb:%%%s ub:%%%s" 968 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" 969 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n", 970 traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec, 971 traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec, 972 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec, 973 traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec); 974 KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb, 975 pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count, 976 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1, 977 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4)); 978 __kmp_str_free(&buff); 979 } 980 #endif 981 #if (KMP_STATIC_STEAL_ENABLED) 982 // It cannot be guaranteed that after execution of a loop with some other 983 // schedule kind all the parm3 variables will contain the same value. Even if 984 // all parm3 will be the same, it still exists a bad case like using 0 and 1 985 // rather than program life-time increment. So the dedicated variable is 986 // required. The 'static_steal_counter' is used. 987 if (pr->schedule == kmp_sch_static_steal) { 988 // Other threads will inspect this variable when searching for a victim. 989 // This is a flag showing that other threads may steal from this thread 990 // since then. 
991 volatile T *p = &pr->u.p.static_steal_counter; 992 *p = *p + 1; 993 } 994 #endif // ( KMP_STATIC_STEAL_ENABLED ) 995 996 #if OMPT_SUPPORT && OMPT_OPTIONAL 997 if (ompt_enabled.ompt_callback_work) { 998 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 999 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 1000 ompt_callbacks.ompt_callback(ompt_callback_work)( 1001 ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data), 1002 &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid)); 1003 } 1004 #endif 1005 KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic); 1006 } 1007 1008 /* For ordered loops, either __kmp_dispatch_finish() should be called after 1009 * every iteration, or __kmp_dispatch_finish_chunk() should be called after 1010 * every chunk of iterations. If the ordered section(s) were not executed 1011 * for this iteration (or every iteration in this chunk), we need to set the 1012 * ordered iteration counters so that the next thread can proceed. */ 1013 template <typename UT> 1014 static void __kmp_dispatch_finish(int gtid, ident_t *loc) { 1015 typedef typename traits_t<UT>::signed_t ST; 1016 __kmp_assert_valid_gtid(gtid); 1017 kmp_info_t *th = __kmp_threads[gtid]; 1018 1019 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid)); 1020 if (!th->th.th_team->t.t_serialized) { 1021 1022 dispatch_private_info_template<UT> *pr = 1023 reinterpret_cast<dispatch_private_info_template<UT> *>( 1024 th->th.th_dispatch->th_dispatch_pr_current); 1025 dispatch_shared_info_template<UT> volatile *sh = 1026 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 1027 th->th.th_dispatch->th_dispatch_sh_current); 1028 KMP_DEBUG_ASSERT(pr); 1029 KMP_DEBUG_ASSERT(sh); 1030 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1031 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1032 1033 if (pr->ordered_bumped) { 1034 KD_TRACE( 1035 1000, 1036 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1037 gtid)); 1038 pr->ordered_bumped = 0; 1039 } else { 1040 UT lower = pr->u.p.ordered_lower; 1041 1042 #ifdef KMP_DEBUG 1043 { 1044 char *buff; 1045 // create format specifiers before the debug output 1046 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: " 1047 "ordered_iteration:%%%s lower:%%%s\n", 1048 traits_t<UT>::spec, traits_t<UT>::spec); 1049 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 1050 __kmp_str_free(&buff); 1051 } 1052 #endif 1053 1054 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower, 1055 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 1056 KMP_MB(); /* is this necessary? 
*/ 1057 #ifdef KMP_DEBUG 1058 { 1059 char *buff; 1060 // create format specifiers before the debug output 1061 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: " 1062 "ordered_iteration:%%%s lower:%%%s\n", 1063 traits_t<UT>::spec, traits_t<UT>::spec); 1064 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 1065 __kmp_str_free(&buff); 1066 } 1067 #endif 1068 1069 test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration); 1070 } // if 1071 } // if 1072 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid)); 1073 } 1074 1075 #ifdef KMP_GOMP_COMPAT 1076 1077 template <typename UT> 1078 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) { 1079 typedef typename traits_t<UT>::signed_t ST; 1080 __kmp_assert_valid_gtid(gtid); 1081 kmp_info_t *th = __kmp_threads[gtid]; 1082 1083 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid)); 1084 if (!th->th.th_team->t.t_serialized) { 1085 // int cid; 1086 dispatch_private_info_template<UT> *pr = 1087 reinterpret_cast<dispatch_private_info_template<UT> *>( 1088 th->th.th_dispatch->th_dispatch_pr_current); 1089 dispatch_shared_info_template<UT> volatile *sh = 1090 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 1091 th->th.th_dispatch->th_dispatch_sh_current); 1092 KMP_DEBUG_ASSERT(pr); 1093 KMP_DEBUG_ASSERT(sh); 1094 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1095 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1096 1097 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) { 1098 UT lower = pr->u.p.ordered_lower; 1099 UT upper = pr->u.p.ordered_upper; 1100 UT inc = upper - lower + 1; 1101 1102 if (pr->ordered_bumped == inc) { 1103 KD_TRACE( 1104 1000, 1105 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1106 gtid)); 1107 pr->ordered_bumped = 0; 1108 } else { 1109 inc -= pr->ordered_bumped; 1110 1111 #ifdef KMP_DEBUG 1112 { 1113 char *buff; 1114 // create format specifiers before the debug output 1115 buff = __kmp_str_format( 1116 "__kmp_dispatch_finish_chunk: T#%%d before wait: " 1117 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n", 1118 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec); 1119 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper)); 1120 __kmp_str_free(&buff); 1121 } 1122 #endif 1123 1124 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower, 1125 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 1126 1127 KMP_MB(); /* is this necessary? */ 1128 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting " 1129 "ordered_bumped to zero\n", 1130 gtid)); 1131 pr->ordered_bumped = 0; 1132 //!!!!! TODO check if the inc should be unsigned, or signed??? 
1133 #ifdef KMP_DEBUG 1134 { 1135 char *buff; 1136 // create format specifiers before the debug output 1137 buff = __kmp_str_format( 1138 "__kmp_dispatch_finish_chunk: T#%%d after wait: " 1139 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n", 1140 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec, 1141 traits_t<UT>::spec); 1142 KD_TRACE(1000, 1143 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper)); 1144 __kmp_str_free(&buff); 1145 } 1146 #endif 1147 1148 test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc); 1149 } 1150 // } 1151 } 1152 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid)); 1153 } 1154 1155 #endif /* KMP_GOMP_COMPAT */ 1156 1157 template <typename T> 1158 int __kmp_dispatch_next_algorithm(int gtid, 1159 dispatch_private_info_template<T> *pr, 1160 dispatch_shared_info_template<T> volatile *sh, 1161 kmp_int32 *p_last, T *p_lb, T *p_ub, 1162 typename traits_t<T>::signed_t *p_st, T nproc, 1163 T tid) { 1164 typedef typename traits_t<T>::unsigned_t UT; 1165 typedef typename traits_t<T>::signed_t ST; 1166 typedef typename traits_t<T>::floating_t DBL; 1167 int status = 0; 1168 bool last = false; 1169 T start; 1170 ST incr; 1171 UT limit, trip, init; 1172 kmp_info_t *th = __kmp_threads[gtid]; 1173 kmp_team_t *team = th->th.th_team; 1174 1175 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1176 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1177 KMP_DEBUG_ASSERT(pr); 1178 KMP_DEBUG_ASSERT(sh); 1179 KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc); 1180 #ifdef KMP_DEBUG 1181 { 1182 char *buff; 1183 // create format specifiers before the debug output 1184 buff = 1185 __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p " 1186 "sh:%%p nproc:%%%s tid:%%%s\n", 1187 traits_t<T>::spec, traits_t<T>::spec); 1188 KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid)); 1189 __kmp_str_free(&buff); 1190 } 1191 #endif 1192 1193 // zero trip count 1194 if (pr->u.p.tc == 0) { 1195 KD_TRACE(10, 1196 ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is " 1197 "zero status:%d\n", 1198 gtid, status)); 1199 return 0; 1200 } 1201 1202 switch (pr->schedule) { 1203 #if (KMP_STATIC_STEAL_ENABLED) 1204 case kmp_sch_static_steal: { 1205 T chunk = pr->u.p.parm1; 1206 1207 KD_TRACE(100, 1208 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n", 1209 gtid)); 1210 1211 trip = pr->u.p.tc - 1; 1212 1213 if (traits_t<T>::type_size > 4) { 1214 // use lock for 8-byte and CAS for 4-byte induction 1215 // variable. 
TODO (optional): check and use 16-byte CAS 1216 kmp_lock_t *lck = pr->u.p.th_steal_lock; 1217 KMP_DEBUG_ASSERT(lck != NULL); 1218 if (pr->u.p.count < (UT)pr->u.p.ub) { 1219 __kmp_acquire_lock(lck, gtid); 1220 // try to get own chunk of iterations 1221 init = (pr->u.p.count)++; 1222 status = (init < (UT)pr->u.p.ub); 1223 __kmp_release_lock(lck, gtid); 1224 } else { 1225 status = 0; // no own chunks 1226 } 1227 if (!status) { // try to steal 1228 kmp_info_t **other_threads = team->t.t_threads; 1229 T while_limit = pr->u.p.parm3; 1230 T while_index = 0; 1231 T id = pr->u.p.static_steal_counter; // loop id 1232 int idx = (th->th.th_dispatch->th_disp_index - 1) % 1233 __kmp_dispatch_num_buffers; // current loop index 1234 // note: victim thread can potentially execute another loop 1235 // TODO: algorithm of searching for a victim 1236 // should be cleaned up and measured 1237 while ((!status) && (while_limit != ++while_index)) { 1238 dispatch_private_info_template<T> *victim; 1239 T remaining; 1240 T victimIdx = pr->u.p.parm4; 1241 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1; 1242 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1243 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); 1244 KMP_DEBUG_ASSERT(victim); 1245 while ((victim == pr || id != victim->u.p.static_steal_counter) && 1246 oldVictimIdx != victimIdx) { 1247 victimIdx = (victimIdx + 1) % nproc; 1248 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1249 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); 1250 KMP_DEBUG_ASSERT(victim); 1251 } 1252 if (victim == pr || id != victim->u.p.static_steal_counter) { 1253 continue; // try once more (nproc attempts in total) 1254 // no victim is ready yet to participate in stealing 1255 // because no victim passed kmp_init_dispatch yet 1256 } 1257 if (victim->u.p.count + 2 > (UT)victim->u.p.ub) { 1258 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid 1259 continue; // not enough chunks to steal, goto next victim 1260 } 1261 1262 lck = victim->u.p.th_steal_lock; 1263 KMP_ASSERT(lck != NULL); 1264 __kmp_acquire_lock(lck, gtid); 1265 limit = victim->u.p.ub; // keep initial ub 1266 if (victim->u.p.count >= limit || 1267 (remaining = limit - victim->u.p.count) < 2) { 1268 __kmp_release_lock(lck, gtid); 1269 pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim 1270 continue; // not enough chunks to steal 1271 } 1272 // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or 1273 // by 1 1274 if (remaining > 3) { 1275 // steal 1/4 of remaining 1276 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2); 1277 init = (victim->u.p.ub -= (remaining >> 2)); 1278 } else { 1279 // steal 1 chunk of 2 or 3 remaining 1280 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1); 1281 init = (victim->u.p.ub -= 1); 1282 } 1283 __kmp_release_lock(lck, gtid); 1284 1285 KMP_DEBUG_ASSERT(init + 1 <= limit); 1286 pr->u.p.parm4 = victimIdx; // remember victim to steal from 1287 status = 1; 1288 while_index = 0; 1289 // now update own count and ub with stolen range but init chunk 1290 __kmp_acquire_lock(pr->u.p.th_steal_lock, gtid); 1291 pr->u.p.count = init + 1; 1292 pr->u.p.ub = limit; 1293 __kmp_release_lock(pr->u.p.th_steal_lock, gtid); 1294 } // while (search for victim) 1295 } // if (try to find victim and steal) 1296 } else { 1297 // 4-byte induction variable, use 8-byte CAS for pair (count, ub) 1298 typedef union { 1299 struct { 1300 UT count; 1301 T ub; 1302 } p; 1303 kmp_int64 b; 1304 } union_i4; 1305 // All 
operations on 'count' or 'ub' must be combined atomically 1306 // together. 1307 { 1308 union_i4 vold, vnew; 1309 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 1310 vnew = vold; 1311 vnew.p.count++; 1312 while (!KMP_COMPARE_AND_STORE_ACQ64( 1313 (volatile kmp_int64 *)&pr->u.p.count, 1314 *VOLATILE_CAST(kmp_int64 *) & vold.b, 1315 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 1316 KMP_CPU_PAUSE(); 1317 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 1318 vnew = vold; 1319 vnew.p.count++; 1320 } 1321 vnew = vold; 1322 init = vnew.p.count; 1323 status = (init < (UT)vnew.p.ub); 1324 } 1325 1326 if (!status) { 1327 kmp_info_t **other_threads = team->t.t_threads; 1328 T while_limit = pr->u.p.parm3; 1329 T while_index = 0; 1330 T id = pr->u.p.static_steal_counter; // loop id 1331 int idx = (th->th.th_dispatch->th_disp_index - 1) % 1332 __kmp_dispatch_num_buffers; // current loop index 1333 // note: victim thread can potentially execute another loop 1334 // TODO: algorithm of searching for a victim 1335 // should be cleaned up and measured 1336 while ((!status) && (while_limit != ++while_index)) { 1337 dispatch_private_info_template<T> *victim; 1338 union_i4 vold, vnew; 1339 T remaining; 1340 T victimIdx = pr->u.p.parm4; 1341 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1; 1342 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1343 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); 1344 KMP_DEBUG_ASSERT(victim); 1345 while ((victim == pr || id != victim->u.p.static_steal_counter) && 1346 oldVictimIdx != victimIdx) { 1347 victimIdx = (victimIdx + 1) % nproc; 1348 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1349 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); 1350 KMP_DEBUG_ASSERT(victim); 1351 } 1352 if (victim == pr || id != victim->u.p.static_steal_counter) { 1353 continue; // try once more (nproc attempts in total) 1354 // no victim is ready yet to participate in stealing 1355 // because no victim passed kmp_init_dispatch yet 1356 } 1357 pr->u.p.parm4 = victimIdx; // new victim found 1358 while (1) { // CAS loop if victim has enough chunks to steal 1359 vold.b = *(volatile kmp_int64 *)(&victim->u.p.count); 1360 vnew = vold; 1361 1362 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1363 if (vnew.p.count >= (UT)vnew.p.ub || 1364 (remaining = vnew.p.ub - vnew.p.count) < 2) { 1365 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id 1366 break; // not enough chunks to steal, goto next victim 1367 } 1368 if (remaining > 3) { 1369 // try to steal 1/4 of remaining 1370 vnew.p.ub -= remaining >> 2; 1371 } else { 1372 vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining 1373 } 1374 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1375 // TODO: Should this be acquire or release? 
1376 if (KMP_COMPARE_AND_STORE_ACQ64( 1377 (volatile kmp_int64 *)&victim->u.p.count, 1378 *VOLATILE_CAST(kmp_int64 *) & vold.b, 1379 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 1380 // stealing succeeded 1381 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1382 vold.p.ub - vnew.p.ub); 1383 status = 1; 1384 while_index = 0; 1385 // now update own count and ub 1386 init = vnew.p.ub; 1387 vold.p.count = init + 1; 1388 #if KMP_ARCH_X86 1389 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b); 1390 #else 1391 *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b; 1392 #endif 1393 break; 1394 } // if (check CAS result) 1395 KMP_CPU_PAUSE(); // CAS failed, repeatedly attempt 1396 } // while (try to steal from particular victim) 1397 } // while (search for victim) 1398 } // if (try to find victim and steal) 1399 } // if (4-byte induction variable) 1400 if (!status) { 1401 *p_lb = 0; 1402 *p_ub = 0; 1403 if (p_st != NULL) 1404 *p_st = 0; 1405 } else { 1406 start = pr->u.p.parm2; 1407 init *= chunk; 1408 limit = chunk + init - 1; 1409 incr = pr->u.p.st; 1410 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1); 1411 1412 KMP_DEBUG_ASSERT(init <= trip); 1413 if ((last = (limit >= trip)) != 0) 1414 limit = trip; 1415 if (p_st != NULL) 1416 *p_st = incr; 1417 1418 if (incr == 1) { 1419 *p_lb = start + init; 1420 *p_ub = start + limit; 1421 } else { 1422 *p_lb = start + init * incr; 1423 *p_ub = start + limit * incr; 1424 } 1425 1426 if (pr->flags.ordered) { 1427 pr->u.p.ordered_lower = init; 1428 pr->u.p.ordered_upper = limit; 1429 } // if 1430 } // if 1431 break; 1432 } // case 1433 #endif // ( KMP_STATIC_STEAL_ENABLED ) 1434 case kmp_sch_static_balanced: { 1435 KD_TRACE( 1436 10, 1437 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n", 1438 gtid)); 1439 /* check if thread has any iteration to do */ 1440 if ((status = !pr->u.p.count) != 0) { 1441 pr->u.p.count = 1; 1442 *p_lb = pr->u.p.lb; 1443 *p_ub = pr->u.p.ub; 1444 last = (pr->u.p.parm1 != 0); 1445 if (p_st != NULL) 1446 *p_st = pr->u.p.st; 1447 } else { /* no iterations to do */ 1448 pr->u.p.lb = pr->u.p.ub + pr->u.p.st; 1449 } 1450 } // case 1451 break; 1452 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was 1453 merged here */ 1454 case kmp_sch_static_chunked: { 1455 T parm1; 1456 1457 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d " 1458 "kmp_sch_static_[affinity|chunked] case\n", 1459 gtid)); 1460 parm1 = pr->u.p.parm1; 1461 1462 trip = pr->u.p.tc - 1; 1463 init = parm1 * (pr->u.p.count + tid); 1464 1465 if ((status = (init <= trip)) != 0) { 1466 start = pr->u.p.lb; 1467 incr = pr->u.p.st; 1468 limit = parm1 + init - 1; 1469 1470 if ((last = (limit >= trip)) != 0) 1471 limit = trip; 1472 1473 if (p_st != NULL) 1474 *p_st = incr; 1475 1476 pr->u.p.count += nproc; 1477 1478 if (incr == 1) { 1479 *p_lb = start + init; 1480 *p_ub = start + limit; 1481 } else { 1482 *p_lb = start + init * incr; 1483 *p_ub = start + limit * incr; 1484 } 1485 1486 if (pr->flags.ordered) { 1487 pr->u.p.ordered_lower = init; 1488 pr->u.p.ordered_upper = limit; 1489 } // if 1490 } // if 1491 } // case 1492 break; 1493 1494 case kmp_sch_dynamic_chunked: { 1495 UT chunk_number; 1496 UT chunk_size = pr->u.p.parm1; 1497 UT nchunks = pr->u.p.parm2; 1498 1499 KD_TRACE( 1500 100, 1501 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n", 1502 gtid)); 1503 1504 chunk_number = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 1505 status = (chunk_number < nchunks); 1506 if (!status) { 1507 *p_lb = 
0; 1508 *p_ub = 0; 1509 if (p_st != NULL) 1510 *p_st = 0; 1511 } else { 1512 init = chunk_size * chunk_number; 1513 trip = pr->u.p.tc - 1; 1514 start = pr->u.p.lb; 1515 incr = pr->u.p.st; 1516 1517 if ((last = (trip - init < (UT)chunk_size))) 1518 limit = trip; 1519 else 1520 limit = chunk_size + init - 1; 1521 1522 if (p_st != NULL) 1523 *p_st = incr; 1524 1525 if (incr == 1) { 1526 *p_lb = start + init; 1527 *p_ub = start + limit; 1528 } else { 1529 *p_lb = start + init * incr; 1530 *p_ub = start + limit * incr; 1531 } 1532 1533 if (pr->flags.ordered) { 1534 pr->u.p.ordered_lower = init; 1535 pr->u.p.ordered_upper = limit; 1536 } // if 1537 } // if 1538 } // case 1539 break; 1540 1541 case kmp_sch_guided_iterative_chunked: { 1542 T chunkspec = pr->u.p.parm1; 1543 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked " 1544 "iterative case\n", 1545 gtid)); 1546 trip = pr->u.p.tc; 1547 // Start atomic part of calculations 1548 while (1) { 1549 ST remaining; // signed, because can be < 0 1550 init = sh->u.s.iteration; // shared value 1551 remaining = trip - init; 1552 if (remaining <= 0) { // AC: need to compare with 0 first 1553 // nothing to do, don't try atomic op 1554 status = 0; 1555 break; 1556 } 1557 if ((T)remaining < 1558 pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default 1559 // use dynamic-style schedule 1560 // atomically increment iterations, get old value 1561 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1562 (ST)chunkspec); 1563 remaining = trip - init; 1564 if (remaining <= 0) { 1565 status = 0; // all iterations got by other threads 1566 } else { 1567 // got some iterations to work on 1568 status = 1; 1569 if ((T)remaining > chunkspec) { 1570 limit = init + chunkspec - 1; 1571 } else { 1572 last = true; // the last chunk 1573 limit = init + remaining - 1; 1574 } // if 1575 } // if 1576 break; 1577 } // if 1578 limit = init + (UT)((double)remaining * 1579 *(double *)&pr->u.p.parm3); // divide by K*nproc 1580 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1581 (ST)init, (ST)limit)) { 1582 // CAS was successful, chunk obtained 1583 status = 1; 1584 --limit; 1585 break; 1586 } // if 1587 } // while 1588 if (status != 0) { 1589 start = pr->u.p.lb; 1590 incr = pr->u.p.st; 1591 if (p_st != NULL) 1592 *p_st = incr; 1593 *p_lb = start + init * incr; 1594 *p_ub = start + limit * incr; 1595 if (pr->flags.ordered) { 1596 pr->u.p.ordered_lower = init; 1597 pr->u.p.ordered_upper = limit; 1598 } // if 1599 } else { 1600 *p_lb = 0; 1601 *p_ub = 0; 1602 if (p_st != NULL) 1603 *p_st = 0; 1604 } // if 1605 } // case 1606 break; 1607 1608 case kmp_sch_guided_simd: { 1609 // same as iterative but curr-chunk adjusted to be multiple of given 1610 // chunk 1611 T chunk = pr->u.p.parm1; 1612 KD_TRACE(100, 1613 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n", 1614 gtid)); 1615 trip = pr->u.p.tc; 1616 // Start atomic part of calculations 1617 while (1) { 1618 ST remaining; // signed, because can be < 0 1619 init = sh->u.s.iteration; // shared value 1620 remaining = trip - init; 1621 if (remaining <= 0) { // AC: need to compare with 0 first 1622 status = 0; // nothing to do, don't try atomic op 1623 break; 1624 } 1625 KMP_DEBUG_ASSERT(init % chunk == 0); 1626 // compare with K*nproc*(chunk+1), K=2 by default 1627 if ((T)remaining < pr->u.p.parm2) { 1628 // use dynamic-style schedule 1629 // atomically increment iterations, get old value 1630 init = test_then_add<ST>(RCAST(volatile ST *, 
&sh->u.s.iteration), 1631 (ST)chunk); 1632 remaining = trip - init; 1633 if (remaining <= 0) { 1634 status = 0; // all iterations got by other threads 1635 } else { 1636 // got some iterations to work on 1637 status = 1; 1638 if ((T)remaining > chunk) { 1639 limit = init + chunk - 1; 1640 } else { 1641 last = true; // the last chunk 1642 limit = init + remaining - 1; 1643 } // if 1644 } // if 1645 break; 1646 } // if 1647 // divide by K*nproc 1648 UT span; 1649 __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3), 1650 &span); 1651 UT rem = span % chunk; 1652 if (rem) // adjust so that span%chunk == 0 1653 span += chunk - rem; 1654 limit = init + span; 1655 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1656 (ST)init, (ST)limit)) { 1657 // CAS was successful, chunk obtained 1658 status = 1; 1659 --limit; 1660 break; 1661 } // if 1662 } // while 1663 if (status != 0) { 1664 start = pr->u.p.lb; 1665 incr = pr->u.p.st; 1666 if (p_st != NULL) 1667 *p_st = incr; 1668 *p_lb = start + init * incr; 1669 *p_ub = start + limit * incr; 1670 if (pr->flags.ordered) { 1671 pr->u.p.ordered_lower = init; 1672 pr->u.p.ordered_upper = limit; 1673 } // if 1674 } else { 1675 *p_lb = 0; 1676 *p_ub = 0; 1677 if (p_st != NULL) 1678 *p_st = 0; 1679 } // if 1680 } // case 1681 break; 1682 1683 case kmp_sch_guided_analytical_chunked: { 1684 T chunkspec = pr->u.p.parm1; 1685 UT chunkIdx; 1686 #if KMP_USE_X87CONTROL 1687 /* for storing original FPCW value for Windows* OS on 1688 IA-32 architecture 8-byte version */ 1689 unsigned int oldFpcw; 1690 unsigned int fpcwSet = 0; 1691 #endif 1692 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d " 1693 "kmp_sch_guided_analytical_chunked case\n", 1694 gtid)); 1695 1696 trip = pr->u.p.tc; 1697 1698 KMP_DEBUG_ASSERT(nproc > 1); 1699 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip); 1700 1701 while (1) { /* this while loop is a safeguard against unexpected zero 1702 chunk sizes */ 1703 chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 1704 if (chunkIdx >= (UT)pr->u.p.parm2) { 1705 --trip; 1706 /* use dynamic-style scheduling */ 1707 init = chunkIdx * chunkspec + pr->u.p.count; 1708 /* need to verify init > 0 in case of overflow in the above 1709 * calculation */ 1710 if ((status = (init > 0 && init <= trip)) != 0) { 1711 limit = init + chunkspec - 1; 1712 1713 if ((last = (limit >= trip)) != 0) 1714 limit = trip; 1715 } 1716 break; 1717 } else { 1718 /* use exponential-style scheduling */ 1719 /* The following check is to workaround the lack of long double precision on 1720 Windows* OS. 1721 This check works around the possible effect that init != 0 for chunkIdx == 0. 
1722 */ 1723 #if KMP_USE_X87CONTROL 1724 /* If we haven't already done so, save original 1725 FPCW and set precision to 64-bit, as Windows* OS 1726 on IA-32 architecture defaults to 53-bit */ 1727 if (!fpcwSet) { 1728 oldFpcw = _control87(0, 0); 1729 _control87(_PC_64, _MCW_PC); 1730 fpcwSet = 0x30000; 1731 } 1732 #endif 1733 if (chunkIdx) { 1734 init = __kmp_dispatch_guided_remaining<T>( 1735 trip, *(DBL *)&pr->u.p.parm3, chunkIdx); 1736 KMP_DEBUG_ASSERT(init); 1737 init = trip - init; 1738 } else 1739 init = 0; 1740 limit = trip - __kmp_dispatch_guided_remaining<T>( 1741 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1); 1742 KMP_ASSERT(init <= limit); 1743 if (init < limit) { 1744 KMP_DEBUG_ASSERT(limit <= trip); 1745 --limit; 1746 status = 1; 1747 break; 1748 } // if 1749 } // if 1750 } // while (1) 1751 #if KMP_USE_X87CONTROL 1752 /* restore FPCW if necessary 1753 AC: check fpcwSet flag first because oldFpcw can be uninitialized here 1754 */ 1755 if (fpcwSet && (oldFpcw & fpcwSet)) 1756 _control87(oldFpcw, _MCW_PC); 1757 #endif 1758 if (status != 0) { 1759 start = pr->u.p.lb; 1760 incr = pr->u.p.st; 1761 if (p_st != NULL) 1762 *p_st = incr; 1763 *p_lb = start + init * incr; 1764 *p_ub = start + limit * incr; 1765 if (pr->flags.ordered) { 1766 pr->u.p.ordered_lower = init; 1767 pr->u.p.ordered_upper = limit; 1768 } 1769 } else { 1770 *p_lb = 0; 1771 *p_ub = 0; 1772 if (p_st != NULL) 1773 *p_st = 0; 1774 } 1775 } // case 1776 break; 1777 1778 case kmp_sch_trapezoidal: { 1779 UT index; 1780 T parm2 = pr->u.p.parm2; 1781 T parm3 = pr->u.p.parm3; 1782 T parm4 = pr->u.p.parm4; 1783 KD_TRACE(100, 1784 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n", 1785 gtid)); 1786 1787 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration); 1788 1789 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2; 1790 trip = pr->u.p.tc - 1; 1791 1792 if ((status = ((T)index < parm3 && init <= trip)) == 0) { 1793 *p_lb = 0; 1794 *p_ub = 0; 1795 if (p_st != NULL) 1796 *p_st = 0; 1797 } else { 1798 start = pr->u.p.lb; 1799 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1; 1800 incr = pr->u.p.st; 1801 1802 if ((last = (limit >= trip)) != 0) 1803 limit = trip; 1804 1805 if (p_st != NULL) 1806 *p_st = incr; 1807 1808 if (incr == 1) { 1809 *p_lb = start + init; 1810 *p_ub = start + limit; 1811 } else { 1812 *p_lb = start + init * incr; 1813 *p_ub = start + limit * incr; 1814 } 1815 1816 if (pr->flags.ordered) { 1817 pr->u.p.ordered_lower = init; 1818 pr->u.p.ordered_upper = limit; 1819 } // if 1820 } // if 1821 } // case 1822 break; 1823 default: { 1824 status = 0; // to avoid complaints on uninitialized variable use 1825 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 1826 KMP_HNT(GetNewerLibrary), // Hint 1827 __kmp_msg_null // Variadic argument list terminator 1828 ); 1829 } break; 1830 } // switch 1831 if (p_last) 1832 *p_last = last; 1833 #ifdef KMP_DEBUG 1834 if (pr->flags.ordered) { 1835 char *buff; 1836 // create format specifiers before the debug output 1837 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d " 1838 "ordered_lower:%%%s ordered_upper:%%%s\n", 1839 traits_t<UT>::spec, traits_t<UT>::spec); 1840 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper)); 1841 __kmp_str_free(&buff); 1842 } 1843 { 1844 char *buff; 1845 // create format specifiers before the debug output 1846 buff = __kmp_str_format( 1847 "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d " 1848 "p_lb:%%%s p_ub:%%%s p_st:%%%s\n", 1849 
traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 1850 KMP_DEBUG_ASSERT(p_last); 1851 KMP_DEBUG_ASSERT(p_st); 1852 KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st)); 1853 __kmp_str_free(&buff); 1854 } 1855 #endif 1856 return status; 1857 } 1858 1859 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more 1860 work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini() 1861 is not called. */ 1862 #if OMPT_SUPPORT && OMPT_OPTIONAL 1863 #define OMPT_LOOP_END \ 1864 if (status == 0) { \ 1865 if (ompt_enabled.ompt_callback_work) { \ 1866 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \ 1867 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \ 1868 ompt_callbacks.ompt_callback(ompt_callback_work)( \ 1869 ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \ 1870 &(task_info->task_data), 0, codeptr); \ 1871 } \ 1872 } 1873 // TODO: implement count 1874 #else 1875 #define OMPT_LOOP_END // no-op 1876 #endif 1877 1878 #if KMP_STATS_ENABLED 1879 #define KMP_STATS_LOOP_END \ 1880 { \ 1881 kmp_int64 u, l, t, i; \ 1882 l = (kmp_int64)(*p_lb); \ 1883 u = (kmp_int64)(*p_ub); \ 1884 i = (kmp_int64)(pr->u.p.st); \ 1885 if (status == 0) { \ 1886 t = 0; \ 1887 KMP_POP_PARTITIONED_TIMER(); \ 1888 } else if (i == 1) { \ 1889 if (u >= l) \ 1890 t = u - l + 1; \ 1891 else \ 1892 t = 0; \ 1893 } else if (i < 0) { \ 1894 if (l >= u) \ 1895 t = (l - u) / (-i) + 1; \ 1896 else \ 1897 t = 0; \ 1898 } else { \ 1899 if (u >= l) \ 1900 t = (u - l) / i + 1; \ 1901 else \ 1902 t = 0; \ 1903 } \ 1904 KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \ 1905 } 1906 #else 1907 #define KMP_STATS_LOOP_END /* Nothing */ 1908 #endif 1909 1910 template <typename T> 1911 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, 1912 T *p_lb, T *p_ub, 1913 typename traits_t<T>::signed_t *p_st 1914 #if OMPT_SUPPORT && OMPT_OPTIONAL 1915 , 1916 void *codeptr 1917 #endif 1918 ) { 1919 1920 typedef typename traits_t<T>::unsigned_t UT; 1921 typedef typename traits_t<T>::signed_t ST; 1922 // This is potentially slightly misleading, schedule(runtime) will appear here 1923 // even if the actual runtime schedule is static. (Which points out a 1924 // disadvantage of schedule(runtime): even when static scheduling is used it 1925 // costs more than a compile time choice to use static scheduling would.) 
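  /* For illustration (user-level code, not part of the runtime): a loop such as

         #pragma omp parallel for schedule(runtime)
         for (int i = 0; i < n; ++i)
           body(i);   // n and body() are placeholders

     always enters this dynamic dispatch path, with the actual schedule taken
     from OMP_SCHEDULE (e.g. OMP_SCHEDULE="static,16" or "guided") at run time,
     whereas schedule(static) is typically lowered straight to
     __kmpc_for_static_init_* and never reaches __kmp_dispatch_next. */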
1926 KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling); 1927 1928 int status; 1929 dispatch_private_info_template<T> *pr; 1930 __kmp_assert_valid_gtid(gtid); 1931 kmp_info_t *th = __kmp_threads[gtid]; 1932 kmp_team_t *team = th->th.th_team; 1933 1934 KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL 1935 KD_TRACE( 1936 1000, 1937 ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n", 1938 gtid, p_lb, p_ub, p_st, p_last)); 1939 1940 if (team->t.t_serialized) { 1941 /* NOTE: serialize this dispatch because we are not at the active level */ 1942 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 1943 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 1944 KMP_DEBUG_ASSERT(pr); 1945 1946 if ((status = (pr->u.p.tc != 0)) == 0) { 1947 *p_lb = 0; 1948 *p_ub = 0; 1949 // if ( p_last != NULL ) 1950 // *p_last = 0; 1951 if (p_st != NULL) 1952 *p_st = 0; 1953 if (__kmp_env_consistency_check) { 1954 if (pr->pushed_ws != ct_none) { 1955 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 1956 } 1957 } 1958 } else if (pr->flags.nomerge) { 1959 kmp_int32 last; 1960 T start; 1961 UT limit, trip, init; 1962 ST incr; 1963 T chunk = pr->u.p.parm1; 1964 1965 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", 1966 gtid)); 1967 1968 init = chunk * pr->u.p.count++; 1969 trip = pr->u.p.tc - 1; 1970 1971 if ((status = (init <= trip)) == 0) { 1972 *p_lb = 0; 1973 *p_ub = 0; 1974 // if ( p_last != NULL ) 1975 // *p_last = 0; 1976 if (p_st != NULL) 1977 *p_st = 0; 1978 if (__kmp_env_consistency_check) { 1979 if (pr->pushed_ws != ct_none) { 1980 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 1981 } 1982 } 1983 } else { 1984 start = pr->u.p.lb; 1985 limit = chunk + init - 1; 1986 incr = pr->u.p.st; 1987 1988 if ((last = (limit >= trip)) != 0) { 1989 limit = trip; 1990 #if KMP_OS_WINDOWS 1991 pr->u.p.last_upper = pr->u.p.ub; 1992 #endif /* KMP_OS_WINDOWS */ 1993 } 1994 if (p_last != NULL) 1995 *p_last = last; 1996 if (p_st != NULL) 1997 *p_st = incr; 1998 if (incr == 1) { 1999 *p_lb = start + init; 2000 *p_ub = start + limit; 2001 } else { 2002 *p_lb = start + init * incr; 2003 *p_ub = start + limit * incr; 2004 } 2005 2006 if (pr->flags.ordered) { 2007 pr->u.p.ordered_lower = init; 2008 pr->u.p.ordered_upper = limit; 2009 #ifdef KMP_DEBUG 2010 { 2011 char *buff; 2012 // create format specifiers before the debug output 2013 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 2014 "ordered_lower:%%%s ordered_upper:%%%s\n", 2015 traits_t<UT>::spec, traits_t<UT>::spec); 2016 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 2017 pr->u.p.ordered_upper)); 2018 __kmp_str_free(&buff); 2019 } 2020 #endif 2021 } // if 2022 } // if 2023 } else { 2024 pr->u.p.tc = 0; 2025 *p_lb = pr->u.p.lb; 2026 *p_ub = pr->u.p.ub; 2027 #if KMP_OS_WINDOWS 2028 pr->u.p.last_upper = *p_ub; 2029 #endif /* KMP_OS_WINDOWS */ 2030 if (p_last != NULL) 2031 *p_last = TRUE; 2032 if (p_st != NULL) 2033 *p_st = pr->u.p.st; 2034 } // if 2035 #ifdef KMP_DEBUG 2036 { 2037 char *buff; 2038 // create format specifiers before the debug output 2039 buff = __kmp_str_format( 2040 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " 2041 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", 2042 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 2043 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, 2044 (p_last ? 
*p_last : 0), status)); 2045 __kmp_str_free(&buff); 2046 } 2047 #endif 2048 #if INCLUDE_SSC_MARKS 2049 SSC_MARK_DISPATCH_NEXT(); 2050 #endif 2051 OMPT_LOOP_END; 2052 KMP_STATS_LOOP_END; 2053 return status; 2054 } else { 2055 kmp_int32 last = 0; 2056 dispatch_shared_info_template<T> volatile *sh; 2057 2058 KMP_DEBUG_ASSERT(th->th.th_dispatch == 2059 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 2060 2061 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 2062 th->th.th_dispatch->th_dispatch_pr_current); 2063 KMP_DEBUG_ASSERT(pr); 2064 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 2065 th->th.th_dispatch->th_dispatch_sh_current); 2066 KMP_DEBUG_ASSERT(sh); 2067 2068 #if KMP_USE_HIER_SCHED 2069 if (pr->flags.use_hier) 2070 status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st); 2071 else 2072 #endif // KMP_USE_HIER_SCHED 2073 status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub, 2074 p_st, th->th.th_team_nproc, 2075 th->th.th_info.ds.ds_tid); 2076 // status == 0: no more iterations to execute 2077 if (status == 0) { 2078 UT num_done; 2079 2080 num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done); 2081 #ifdef KMP_DEBUG 2082 { 2083 char *buff; 2084 // create format specifiers before the debug output 2085 buff = __kmp_str_format( 2086 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", 2087 traits_t<UT>::spec); 2088 KD_TRACE(10, (buff, gtid, sh->u.s.num_done)); 2089 __kmp_str_free(&buff); 2090 } 2091 #endif 2092 2093 #if KMP_USE_HIER_SCHED 2094 pr->flags.use_hier = FALSE; 2095 #endif 2096 if ((ST)num_done == th->th.th_team_nproc - 1) { 2097 #if (KMP_STATIC_STEAL_ENABLED) 2098 if (pr->schedule == kmp_sch_static_steal && 2099 traits_t<T>::type_size > 4) { 2100 int i; 2101 int idx = (th->th.th_dispatch->th_disp_index - 1) % 2102 __kmp_dispatch_num_buffers; // current loop index 2103 kmp_info_t **other_threads = team->t.t_threads; 2104 // loop complete, safe to destroy locks used for stealing 2105 for (i = 0; i < th->th.th_team_nproc; ++i) { 2106 dispatch_private_info_template<T> *buf = 2107 reinterpret_cast<dispatch_private_info_template<T> *>( 2108 &other_threads[i]->th.th_dispatch->th_disp_buffer[idx]); 2109 kmp_lock_t *lck = buf->u.p.th_steal_lock; 2110 KMP_ASSERT(lck != NULL); 2111 __kmp_destroy_lock(lck); 2112 __kmp_free(lck); 2113 buf->u.p.th_steal_lock = NULL; 2114 } 2115 } 2116 #endif 2117 /* NOTE: release this buffer to be reused */ 2118 2119 KMP_MB(); /* Flush all pending memory write invalidates. */ 2120 2121 sh->u.s.num_done = 0; 2122 sh->u.s.iteration = 0; 2123 2124 /* TODO replace with general release procedure? */ 2125 if (pr->flags.ordered) { 2126 sh->u.s.ordered_iteration = 0; 2127 } 2128 2129 KMP_MB(); /* Flush all pending memory write invalidates. */ 2130 2131 sh->buffer_index += __kmp_dispatch_num_buffers; 2132 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", 2133 gtid, sh->buffer_index)); 2134 2135 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 2136 2137 } // if 2138 if (__kmp_env_consistency_check) { 2139 if (pr->pushed_ws != ct_none) { 2140 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 2141 } 2142 } 2143 2144 th->th.th_dispatch->th_deo_fcn = NULL; 2145 th->th.th_dispatch->th_dxo_fcn = NULL; 2146 th->th.th_dispatch->th_dispatch_sh_current = NULL; 2147 th->th.th_dispatch->th_dispatch_pr_current = NULL; 2148 } // if (status == 0) 2149 #if KMP_OS_WINDOWS 2150 else if (last) { 2151 pr->u.p.last_upper = pr->u.p.ub; 2152 } 2153 #endif /* KMP_OS_WINDOWS */ 2154 if (p_last != NULL && status != 0) 2155 *p_last = last; 2156 } // if 2157 2158 #ifdef KMP_DEBUG 2159 { 2160 char *buff; 2161 // create format specifiers before the debug output 2162 buff = __kmp_str_format( 2163 "__kmp_dispatch_next: T#%%d normal case: " 2164 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n", 2165 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 2166 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, 2167 (p_last ? *p_last : 0), status)); 2168 __kmp_str_free(&buff); 2169 } 2170 #endif 2171 #if INCLUDE_SSC_MARKS 2172 SSC_MARK_DISPATCH_NEXT(); 2173 #endif 2174 OMPT_LOOP_END; 2175 KMP_STATS_LOOP_END; 2176 return status; 2177 } 2178 2179 template <typename T> 2180 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid, 2181 kmp_int32 *plastiter, T *plower, T *pupper, 2182 typename traits_t<T>::signed_t incr) { 2183 typedef typename traits_t<T>::unsigned_t UT; 2184 kmp_uint32 team_id; 2185 kmp_uint32 nteams; 2186 UT trip_count; 2187 kmp_team_t *team; 2188 kmp_info_t *th; 2189 2190 KMP_DEBUG_ASSERT(plastiter && plower && pupper); 2191 KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid)); 2192 #ifdef KMP_DEBUG 2193 typedef typename traits_t<T>::signed_t ST; 2194 { 2195 char *buff; 2196 // create format specifiers before the debug output 2197 buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d " 2198 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n", 2199 traits_t<T>::spec, traits_t<T>::spec, 2200 traits_t<ST>::spec, traits_t<T>::spec); 2201 KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr)); 2202 __kmp_str_free(&buff); 2203 } 2204 #endif 2205 2206 if (__kmp_env_consistency_check) { 2207 if (incr == 0) { 2208 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, 2209 loc); 2210 } 2211 if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) { 2212 // The loop is illegal. 
2213 // Some zero-trip loops maintained by compiler, e.g.: 2214 // for(i=10;i<0;++i) // lower >= upper - run-time check 2215 // for(i=0;i>10;--i) // lower <= upper - run-time check 2216 // for(i=0;i>10;++i) // incr > 0 - compile-time check 2217 // for(i=10;i<0;--i) // incr < 0 - compile-time check 2218 // Compiler does not check the following illegal loops: 2219 // for(i=0;i<10;i+=incr) // where incr<0 2220 // for(i=10;i>0;i-=incr) // where incr<0 2221 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc); 2222 } 2223 } 2224 __kmp_assert_valid_gtid(gtid); 2225 th = __kmp_threads[gtid]; 2226 team = th->th.th_team; 2227 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct 2228 nteams = th->th.th_teams_size.nteams; 2229 team_id = team->t.t_master_tid; 2230 KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc); 2231 2232 // compute global trip count 2233 if (incr == 1) { 2234 trip_count = *pupper - *plower + 1; 2235 } else if (incr == -1) { 2236 trip_count = *plower - *pupper + 1; 2237 } else if (incr > 0) { 2238 // upper-lower can exceed the limit of signed type 2239 trip_count = (UT)(*pupper - *plower) / incr + 1; 2240 } else { 2241 trip_count = (UT)(*plower - *pupper) / (-incr) + 1; 2242 } 2243 2244 if (trip_count <= nteams) { 2245 KMP_DEBUG_ASSERT( 2246 __kmp_static == kmp_sch_static_greedy || 2247 __kmp_static == 2248 kmp_sch_static_balanced); // Unknown static scheduling type. 2249 // only some teams get single iteration, others get nothing 2250 if (team_id < trip_count) { 2251 *pupper = *plower = *plower + team_id * incr; 2252 } else { 2253 *plower = *pupper + incr; // zero-trip loop 2254 } 2255 if (plastiter != NULL) 2256 *plastiter = (team_id == trip_count - 1); 2257 } else { 2258 if (__kmp_static == kmp_sch_static_balanced) { 2259 UT chunk = trip_count / nteams; 2260 UT extras = trip_count % nteams; 2261 *plower += 2262 incr * (team_id * chunk + (team_id < extras ? team_id : extras)); 2263 *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr); 2264 if (plastiter != NULL) 2265 *plastiter = (team_id == nteams - 1); 2266 } else { 2267 T chunk_inc_count = 2268 (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr; 2269 T upper = *pupper; 2270 KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy); 2271 // Unknown static scheduling type. 2272 *plower += team_id * chunk_inc_count; 2273 *pupper = *plower + chunk_inc_count - incr; 2274 // Check/correct bounds if needed 2275 if (incr > 0) { 2276 if (*pupper < *plower) 2277 *pupper = traits_t<T>::max_value; 2278 if (plastiter != NULL) 2279 *plastiter = *plower <= upper && *pupper > upper - incr; 2280 if (*pupper > upper) 2281 *pupper = upper; // tracker C73258 2282 } else { 2283 if (*pupper > *plower) 2284 *pupper = traits_t<T>::min_value; 2285 if (plastiter != NULL) 2286 *plastiter = *plower >= upper && *pupper < upper - incr; 2287 if (*pupper < upper) 2288 *pupper = upper; // tracker C73258 2289 } 2290 } 2291 } 2292 } 2293 2294 //----------------------------------------------------------------------------- 2295 // Dispatch routines 2296 // Transfer call to template< type T > 2297 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule, 2298 // T lb, T ub, ST st, ST chunk ) 2299 extern "C" { 2300 2301 /*! 
2302 @ingroup WORK_SHARING 2303 @{ 2304 @param loc Source location 2305 @param gtid Global thread id 2306 @param schedule Schedule type 2307 @param lb Lower bound 2308 @param ub Upper bound 2309 @param st Step (or increment if you prefer) 2310 @param chunk The chunk size to block with 2311 2312 This function prepares the runtime to start a dynamically scheduled for loop, 2313 saving the loop arguments. 2314 These functions are all identical apart from the types of the arguments. 2315 */ 2316 2317 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2318 enum sched_type schedule, kmp_int32 lb, 2319 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) { 2320 KMP_DEBUG_ASSERT(__kmp_init_serial); 2321 #if OMPT_SUPPORT && OMPT_OPTIONAL 2322 OMPT_STORE_RETURN_ADDRESS(gtid); 2323 #endif 2324 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2325 } 2326 /*! 2327 See @ref __kmpc_dispatch_init_4 2328 */ 2329 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2330 enum sched_type schedule, kmp_uint32 lb, 2331 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) { 2332 KMP_DEBUG_ASSERT(__kmp_init_serial); 2333 #if OMPT_SUPPORT && OMPT_OPTIONAL 2334 OMPT_STORE_RETURN_ADDRESS(gtid); 2335 #endif 2336 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2337 } 2338 2339 /*! 2340 See @ref __kmpc_dispatch_init_4 2341 */ 2342 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2343 enum sched_type schedule, kmp_int64 lb, 2344 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) { 2345 KMP_DEBUG_ASSERT(__kmp_init_serial); 2346 #if OMPT_SUPPORT && OMPT_OPTIONAL 2347 OMPT_STORE_RETURN_ADDRESS(gtid); 2348 #endif 2349 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2350 } 2351 2352 /*! 2353 See @ref __kmpc_dispatch_init_4 2354 */ 2355 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2356 enum sched_type schedule, kmp_uint64 lb, 2357 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) { 2358 KMP_DEBUG_ASSERT(__kmp_init_serial); 2359 #if OMPT_SUPPORT && OMPT_OPTIONAL 2360 OMPT_STORE_RETURN_ADDRESS(gtid); 2361 #endif 2362 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2363 } 2364 2365 /*! 2366 See @ref __kmpc_dispatch_init_4 2367 2368 Difference from __kmpc_dispatch_init set of functions is these functions 2369 are called for composite distribute parallel for construct. Thus before 2370 regular iterations dispatching we need to calc per-team iteration space. 2371 2372 These functions are all identical apart from the types of the arguments. 
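For example (illustrative numbers only): with lb=0, ub=9, st=1 (a trip count of
10) and 4 teams under the balanced static split, the per-team sub-ranges come
out as [0,2], [3,5], [6,7] and [8,9]; each team then dispatches its own
sub-range according to the requested schedule. A hypothetical per-team call,
roughly what a compiler would emit for such a loop, looks like:

@code
  kmp_int32 last = 0;
  // loc and gtid are the usual source-location and global-thread-id arguments
  __kmpc_dist_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, &last,
                              0, 9, 1, 2); // lb, ub, st, chunk
@endcode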
2373 */ 2374 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2375 enum sched_type schedule, kmp_int32 *p_last, 2376 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, 2377 kmp_int32 chunk) { 2378 KMP_DEBUG_ASSERT(__kmp_init_serial); 2379 #if OMPT_SUPPORT && OMPT_OPTIONAL 2380 OMPT_STORE_RETURN_ADDRESS(gtid); 2381 #endif 2382 __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st); 2383 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2384 } 2385 2386 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2387 enum sched_type schedule, kmp_int32 *p_last, 2388 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, 2389 kmp_int32 chunk) { 2390 KMP_DEBUG_ASSERT(__kmp_init_serial); 2391 #if OMPT_SUPPORT && OMPT_OPTIONAL 2392 OMPT_STORE_RETURN_ADDRESS(gtid); 2393 #endif 2394 __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st); 2395 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2396 } 2397 2398 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2399 enum sched_type schedule, kmp_int32 *p_last, 2400 kmp_int64 lb, kmp_int64 ub, kmp_int64 st, 2401 kmp_int64 chunk) { 2402 KMP_DEBUG_ASSERT(__kmp_init_serial); 2403 #if OMPT_SUPPORT && OMPT_OPTIONAL 2404 OMPT_STORE_RETURN_ADDRESS(gtid); 2405 #endif 2406 __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st); 2407 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2408 } 2409 2410 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2411 enum sched_type schedule, kmp_int32 *p_last, 2412 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, 2413 kmp_int64 chunk) { 2414 KMP_DEBUG_ASSERT(__kmp_init_serial); 2415 #if OMPT_SUPPORT && OMPT_OPTIONAL 2416 OMPT_STORE_RETURN_ADDRESS(gtid); 2417 #endif 2418 __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st); 2419 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2420 } 2421 2422 /*! 2423 @param loc Source code location 2424 @param gtid Global thread id 2425 @param p_last Pointer to a flag set to one if this is the last chunk or zero 2426 otherwise 2427 @param p_lb Pointer to the lower bound for the next chunk of work 2428 @param p_ub Pointer to the upper bound for the next chunk of work 2429 @param p_st Pointer to the stride for the next chunk of work 2430 @return one if there is work to be done, zero otherwise 2431 2432 Get the next dynamically allocated chunk of work for this thread. 2433 If there is no more work, then the lb,ub and stride need not be modified. 2434 */ 2435 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2436 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) { 2437 #if OMPT_SUPPORT && OMPT_OPTIONAL 2438 OMPT_STORE_RETURN_ADDRESS(gtid); 2439 #endif 2440 return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st 2441 #if OMPT_SUPPORT && OMPT_OPTIONAL 2442 , 2443 OMPT_LOAD_RETURN_ADDRESS(gtid) 2444 #endif 2445 ); 2446 } 2447 2448 /*! 2449 See @ref __kmpc_dispatch_next_4 2450 */ 2451 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2452 kmp_uint32 *p_lb, kmp_uint32 *p_ub, 2453 kmp_int32 *p_st) { 2454 #if OMPT_SUPPORT && OMPT_OPTIONAL 2455 OMPT_STORE_RETURN_ADDRESS(gtid); 2456 #endif 2457 return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st 2458 #if OMPT_SUPPORT && OMPT_OPTIONAL 2459 , 2460 OMPT_LOAD_RETURN_ADDRESS(gtid) 2461 #endif 2462 ); 2463 } 2464 2465 /*! 
2466 See @ref __kmpc_dispatch_next_4 2467 */ 2468 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2469 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) { 2470 #if OMPT_SUPPORT && OMPT_OPTIONAL 2471 OMPT_STORE_RETURN_ADDRESS(gtid); 2472 #endif 2473 return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st 2474 #if OMPT_SUPPORT && OMPT_OPTIONAL 2475 , 2476 OMPT_LOAD_RETURN_ADDRESS(gtid) 2477 #endif 2478 ); 2479 } 2480 2481 /*! 2482 See @ref __kmpc_dispatch_next_4 2483 */ 2484 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2485 kmp_uint64 *p_lb, kmp_uint64 *p_ub, 2486 kmp_int64 *p_st) { 2487 #if OMPT_SUPPORT && OMPT_OPTIONAL 2488 OMPT_STORE_RETURN_ADDRESS(gtid); 2489 #endif 2490 return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st 2491 #if OMPT_SUPPORT && OMPT_OPTIONAL 2492 , 2493 OMPT_LOAD_RETURN_ADDRESS(gtid) 2494 #endif 2495 ); 2496 } 2497 2498 /*! 2499 @param loc Source code location 2500 @param gtid Global thread id 2501 2502 Mark the end of a dynamic loop. 2503 */ 2504 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) { 2505 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2506 } 2507 2508 /*! 2509 See @ref __kmpc_dispatch_fini_4 2510 */ 2511 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) { 2512 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2513 } 2514 2515 /*! 2516 See @ref __kmpc_dispatch_fini_4 2517 */ 2518 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) { 2519 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2520 } 2521 2522 /*! 2523 See @ref __kmpc_dispatch_fini_4 2524 */ 2525 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) { 2526 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2527 } 2528 /*! @} */ 2529 2530 //----------------------------------------------------------------------------- 2531 // Non-template routines from kmp_dispatch.cpp used in other sources 2532 2533 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) { 2534 return value == checker; 2535 } 2536 2537 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) { 2538 return value != checker; 2539 } 2540 2541 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) { 2542 return value < checker; 2543 } 2544 2545 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) { 2546 return value >= checker; 2547 } 2548 2549 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) { 2550 return value <= checker; 2551 } 2552 2553 kmp_uint32 2554 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker, 2555 kmp_uint32 (*pred)(kmp_uint32, kmp_uint32), 2556 void *obj // Higher-level synchronization object, or NULL. 2557 ) { 2558 // note: we may not belong to a team at this point 2559 volatile kmp_uint32 *spin = spinner; 2560 kmp_uint32 check = checker; 2561 kmp_uint32 spins; 2562 kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred; 2563 kmp_uint32 r; 2564 2565 KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin)); 2566 KMP_INIT_YIELD(spins); 2567 // main wait spin loop 2568 while (!f(r = TCR_4(*spin), check)) { 2569 KMP_FSYNC_SPIN_PREPARE(obj); 2570 /* GEH - remove this since it was accidentally introduced when kmp_wait was 2571 split. 
It causes problems with infinite recursion because of exit lock */ 2572 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 2573 __kmp_abort_thread(); */ 2574 KMP_YIELD_OVERSUB_ELSE_SPIN(spins); 2575 } 2576 KMP_FSYNC_SPIN_ACQUIRED(obj); 2577 return r; 2578 } 2579 2580 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker, 2581 kmp_uint32 (*pred)(void *, kmp_uint32), 2582 void *obj // Higher-level synchronization object, or NULL. 2583 ) { 2584 // note: we may not belong to a team at this point 2585 void *spin = spinner; 2586 kmp_uint32 check = checker; 2587 kmp_uint32 spins; 2588 kmp_uint32 (*f)(void *, kmp_uint32) = pred; 2589 2590 KMP_FSYNC_SPIN_INIT(obj, spin); 2591 KMP_INIT_YIELD(spins); 2592 // main wait spin loop 2593 while (!f(spin, check)) { 2594 KMP_FSYNC_SPIN_PREPARE(obj); 2595 /* if we have waited a bit, or are noversubscribed, yield */ 2596 /* pause is in the following code */ 2597 KMP_YIELD_OVERSUB_ELSE_SPIN(spins); 2598 } 2599 KMP_FSYNC_SPIN_ACQUIRED(obj); 2600 } 2601 2602 } // extern "C" 2603 2604 #ifdef KMP_GOMP_COMPAT 2605 2606 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2607 enum sched_type schedule, kmp_int32 lb, 2608 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk, 2609 int push_ws) { 2610 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, 2611 push_ws); 2612 } 2613 2614 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2615 enum sched_type schedule, kmp_uint32 lb, 2616 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk, 2617 int push_ws) { 2618 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, 2619 push_ws); 2620 } 2621 2622 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2623 enum sched_type schedule, kmp_int64 lb, 2624 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk, 2625 int push_ws) { 2626 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, 2627 push_ws); 2628 } 2629 2630 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2631 enum sched_type schedule, kmp_uint64 lb, 2632 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk, 2633 int push_ws) { 2634 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, 2635 push_ws); 2636 } 2637 2638 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) { 2639 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc); 2640 } 2641 2642 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) { 2643 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc); 2644 } 2645 2646 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) { 2647 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc); 2648 } 2649 2650 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) { 2651 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc); 2652 } 2653 2654 #endif /* KMP_GOMP_COMPAT */ 2655 2656 /* ------------------------------------------------------------------------ */ 2657
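
/* A minimal sketch of how a compiler-generated caller is expected to drive the
   dynamic dispatch interface above for something like
       #pragma omp for schedule(dynamic, 4)
   Illustrative only: loc and gtid stand for the ident_t and global thread id
   the compiler already has; n and body() are placeholders.

     kmp_int32 last = 0, lb, ub, st;
     __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked,
                            0, n - 1, 1, 4); // lb, ub, st, chunk
     while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
       for (kmp_int32 i = lb; i <= ub; i += st)
         body(i);
     }

   When __kmpc_dispatch_next_4 returns 0 it has already performed the
   end-of-loop bookkeeping for this thread, so no explicit fini call is needed
   for an unordered loop; ordered loops additionally go through
   __kmpc_dispatch_fini_4 (see __kmp_dispatch_finish). */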