/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

/* Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
 *       it may change values between parallel regions. __kmp_max_nth
 *       is the largest value __kmp_nth may take, 1 is the smallest.
 */

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_USE_X87CONTROL
#include <float.h>
#endif
#include "kmp_lock.h"
#include "kmp_dispatch.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(gtid_ref);

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_root->r.r_active &&
        (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }
}

void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
      __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
    }
  }
}

// Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
static inline int __kmp_get_monotonicity(enum sched_type schedule,
                                         bool use_hier = false) {
  // Pick up the nonmonotonic/monotonic bits from the scheduling type
  int monotonicity;
  // default to monotonic
  monotonicity = SCHEDULE_MONOTONIC;
  if (SCHEDULE_HAS_NONMONOTONIC(schedule))
    monotonicity = SCHEDULE_NONMONOTONIC;
  else if (SCHEDULE_HAS_MONOTONIC(schedule))
    monotonicity = SCHEDULE_MONOTONIC;
  return monotonicity;
}

// Initialize a dispatch_private_info_template<T> buffer for a particular
// type of schedule,chunk. The loop description is found in lb (lower bound),
// ub (upper bound), and st (stride). nproc is the number of threads relevant
// to the scheduling (often the number of threads in a team, but not always if
// hierarchical scheduling is used). tid is the id of the thread calling
// the function within the group of nproc threads. It will have a value
// between 0 and nproc - 1. This is often just the thread id within a team, but
// is not necessarily the case when using hierarchical scheduling.
// loc is the source file location of the corresponding loop
// gtid is the global thread id
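// Illustrative example (not from the original sources): for a canonical loop
// 'for (i = 0; i < n; i += 4)' the compiler would typically pass lb = 0,
// ub = n - 1 (inclusive upper bound), st = 4, and the chunk from the schedule
// clause, so the trip count computed below evaluates to ceil(n / 4).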
template <typename T>
void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                   dispatch_private_info_template<T> *pr,
                                   enum sched_type schedule, T lb, T ub,
                                   typename traits_t<T>::signed_t st,
#if USE_ITT_BUILD
                                   kmp_uint64 *cur_chunk,
#endif
                                   typename traits_t<T>::signed_t chunk,
                                   T nproc, T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::floating_t DBL;

  int active;
  T tc;
  kmp_info_t *th;
  kmp_team_t *team;
  int monotonicity;
  bool use_hier;

#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
                            "pr:%%p lb:%%%s ub:%%%s st:%%%s "
                            "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<ST>::spec,
                            traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;

#if USE_ITT_BUILD
  int itt_need_metadata_reporting =
      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
      team->t.t_active_level == 1;
#endif

#if KMP_USE_HIER_SCHED
  use_hier = pr->flags.use_hier;
#else
  use_hier = false;
#endif

  /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
  monotonicity = __kmp_get_monotonicity(schedule, use_hier);
  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

  /* Pick up the nomerge/ordered bits from the scheduling type */
  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
    pr->flags.nomerge = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
  } else {
    pr->flags.nomerge = FALSE;
  }
  pr->type_size = traits_t<T>::type_size; // remember the size of variables
  if (kmp_ord_lower & schedule) {
    pr->flags.ordered = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
  } else {
    pr->flags.ordered = FALSE;
  }
  // Ordered overrides nonmonotonic
  if (pr->flags.ordered) {
    monotonicity = SCHEDULE_MONOTONIC;
  }

  if (schedule == kmp_sch_static) {
    schedule = __kmp_static;
  } else {
    if (schedule == kmp_sch_runtime) {
      // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
      // not specified)
      schedule = team->t.t_sched.r_sched_type;
      monotonicity = __kmp_get_monotonicity(schedule, use_hier);
      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
      // Detail the schedule if needed (global controls are differentiated
      // appropriately)
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      } else if (schedule == kmp_sch_static) {
        schedule = __kmp_static;
      }
      // Use the chunk size specified by OMP_SCHEDULE (or default if not
      // specified)
      chunk = team->t.t_sched.chunk;
#if USE_ITT_BUILD
      if (cur_chunk)
        *cur_chunk = chunk;
#endif
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff =
            __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
                             "schedule:%%d chunk:%%%s\n",
                             traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    } else {
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      }
      if (chunk <= 0) {
        chunk = KMP_DEFAULT_CHUNK;
      }
    }

    if (schedule == kmp_sch_auto) {
      // mapping and differentiation: in the __kmp_do_serial_initialize()
      schedule = __kmp_auto;
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
            "schedule:%%d chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }
#if KMP_STATIC_STEAL_ENABLED
    // map nonmonotonic:dynamic to static steal
    if (schedule == kmp_sch_dynamic_chunked) {
      if (monotonicity == SCHEDULE_NONMONOTONIC)
        schedule = kmp_sch_static_steal;
    }
#endif
    /* guided analytical not safe for too many threads */
    if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
      schedule = kmp_sch_guided_iterative_chunked;
      KMP_WARNING(DispatchManyThreads);
    }
    if (schedule == kmp_sch_runtime_simd) {
      // compiler provides simd_width in the chunk parameter
      schedule = team->t.t_sched.r_sched_type;
      monotonicity = __kmp_get_monotonicity(schedule, use_hier);
      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
      // Detail the schedule if needed (global controls are differentiated
      // appropriately)
      if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
          schedule == __kmp_static) {
        schedule = kmp_sch_static_balanced_chunked;
      } else {
        if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
          schedule = kmp_sch_guided_simd;
        }
        chunk = team->t.t_sched.chunk * chunk;
      }
#if USE_ITT_BUILD
      if (cur_chunk)
        *cur_chunk = chunk;
#endif
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
            " chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }
    pr->u.p.parm1 = chunk;
  }
  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
              "unknown scheduling type");

  pr->u.p.count = 0;

  if (__kmp_env_consistency_check) {
    if (st == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
                            (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
    }
  }
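  // Illustrative trip-count examples (assumed values, not from the source):
  // (lb=0, ub=9, st=1) gives tc = 10; (lb=0, ub=9, st=3) gives
  // tc = (9-0)/3 + 1 = 4; (lb=10, ub=1, st=-2) gives tc = (10-1)/2 + 1 = 5.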
  // compute trip count
  if (st == 1) { // most common case
    if (ub >= lb) {
      tc = ub - lb + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  } else if (st < 0) {
    if (lb >= ub) {
      // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(lb - ub) / (-st) + 1;
    } else { // lb < ub
      tc = 0; // zero-trip
    }
  } else { // st > 0
    if (ub >= lb) {
      // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(ub - lb) / st + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  }

#if KMP_STATS_ENABLED
  if (KMP_MASTER_GTID(gtid)) {
    KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
  }
#endif

  pr->u.p.lb = lb;
  pr->u.p.ub = ub;
  pr->u.p.st = st;
  pr->u.p.tc = tc;

#if KMP_OS_WINDOWS
  pr->u.p.last_upper = ub + st;
#endif /* KMP_OS_WINDOWS */

  /* NOTE: only the active parallel region(s) has active ordered sections */

  if (active) {
    if (pr->flags.ordered) {
      pr->ordered_bumped = 0;
      pr->u.p.ordered_lower = 1;
      pr->u.p.ordered_upper = 0;
    }
  }

  switch (schedule) {
#if (KMP_STATIC_STEAL_ENABLED)
  case kmp_sch_static_steal: {
    T ntc, init;

    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    ntc = (tc % chunk ? 1 : 0) + tc / chunk;
    if (nproc > 1 && ntc >= nproc) {
      KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
      T id = tid;
      T small_chunk, extras;

      small_chunk = ntc / nproc;
      extras = ntc % nproc;

      init = id * small_chunk + (id < extras ? id : extras);
      pr->u.p.count = init;
      pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);

      pr->u.p.parm2 = lb;
      // parm3 is the number of times to attempt stealing which is
      // proportional to the number of chunks per thread up until
      // the maximum value of nproc.
      pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc);
      pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
      pr->u.p.st = st;
      if (traits_t<T>::type_size > 4) {
        // AC: TODO: check if 16-byte CAS available and use it to
        // improve performance (probably wait for explicit request
        // before spending time on this).
        // For now use dynamically allocated per-thread lock,
        // free memory in __kmp_dispatch_next when status==0.
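        // Worked example (illustrative): tc = 100, chunk = 10 gives ntc = 10;
        // with nproc = 4 each thread starts with small_chunk = 2 chunks plus
        // one extra for the first extras = 2 threads, i.e. (count, ub) ranges
        // [0,3), [3,6), [6,8), [8,10); the lock below guards this pair.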
        KMP_DEBUG_ASSERT(pr->u.p.th_steal_lock == NULL);
        pr->u.p.th_steal_lock =
            (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
        __kmp_init_lock(pr->u.p.th_steal_lock);
      }
      break;
    } else {
      /* too few chunks: switching to kmp_sch_dynamic_chunked */
      schedule = kmp_sch_dynamic_chunked;
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
                     "kmp_sch_dynamic_chunked\n",
                     gtid));
      if (pr->u.p.parm1 <= 0)
        pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
      break;
    } // if
  } // case
#endif
  case kmp_sch_static_balanced: {
    T init, limit;

    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));

    if (nproc > 1) {
      T id = tid;

      if (tc < nproc) {
        if (id < tc) {
          init = id;
          limit = id;
          pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
        } else {
          pr->u.p.count = 1; /* means no more chunks to execute */
          pr->u.p.parm1 = FALSE;
          break;
        }
      } else {
        T small_chunk = tc / nproc;
        T extras = tc % nproc;
        init = id * small_chunk + (id < extras ? id : extras);
        limit = init + small_chunk - (id < extras ? 0 : 1);
        pr->u.p.parm1 = (id == nproc - 1);
      }
    } else {
      if (tc > 0) {
        init = 0;
        limit = tc - 1;
        pr->u.p.parm1 = TRUE;
      } else {
        // zero trip count
        pr->u.p.count = 1; /* means no more chunks to execute */
        pr->u.p.parm1 = FALSE;
        break;
      }
    }
#if USE_ITT_BUILD
    // Calculate chunk for metadata report
    if (itt_need_metadata_reporting)
      if (cur_chunk)
        *cur_chunk = limit - init + 1;
#endif
    if (st == 1) {
      pr->u.p.lb = lb + init;
      pr->u.p.ub = lb + limit;
    } else {
      // calculated upper bound, "ub" is user-defined upper bound
      T ub_tmp = lb + limit * st;
      pr->u.p.lb = lb + init * st;
      // adjust upper bound to "ub" if needed, so that MS lastprivate will match
      // it exactly
      if (st > 0) {
        pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
      } else {
        pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
      }
    }
    if (pr->flags.ordered) {
      pr->u.p.ordered_lower = init;
      pr->u.p.ordered_upper = limit;
    }
    break;
  } // case
  case kmp_sch_static_balanced_chunked: {
    // similar to balanced, but chunk adjusted to multiple of simd width
    T nth = nproc;
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
                   " -> falling-through to static_greedy\n",
                   gtid));
    schedule = kmp_sch_static_greedy;
    if (nth > 1)
      pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
    else
      pr->u.p.parm1 = tc;
    break;
  } // case
  case kmp_sch_guided_simd:
  case kmp_sch_guided_iterative_chunked: {
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
         " case\n",
         gtid));

    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
      } else {
        // when remaining iters become less than parm2 - switch to dynamic
        pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
        *(double *)&pr->u.p.parm3 =
            guided_flt_param / nproc; // may occupy parm3 and parm4
      }
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
      KD_TRACE(
          100,
          ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
           gtid));
      pr->u.p.parm1 = tc;
    } // if
  } // case
  break;
  case kmp_sch_guided_analytical_chunked: {
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
                   gtid));

    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
      } else {
        /* commonly used term: (2 nproc - 1)/(2 nproc) */
        DBL x;

#if KMP_USE_X87CONTROL
        /* Linux* OS already has 64-bit computation by default for long double,
           and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
           Windows* OS on IA-32 architecture, we need to set precision to 64-bit
           instead of the default 53-bit. Even though long double doesn't work
           on Windows* OS on Intel(R) 64, the resulting lack of precision is not
           expected to impact the correctness of the algorithm, but this has not
           been mathematically proven.
        */
        // save original FPCW and set precision to 64-bit, as
        // Windows* OS on IA-32 architecture defaults to 53-bit
        unsigned int oldFpcw = _control87(0, 0);
        _control87(_PC_64, _MCW_PC); // 0,0x30000
#endif
        /* value used for comparison in solver for cross-over point */
        long double target = ((long double)chunk * 2 + 1) * nproc / tc;

        /* crossover point--chunk indexes equal to or greater than
           this point switch to dynamic-style scheduling */
        UT cross;

        /* commonly used term: (2 nproc - 1)/(2 nproc) */
        x = (long double)1.0 - (long double)0.5 / nproc;

#ifdef KMP_DEBUG
        { // test natural alignment
          struct _test_a {
            char a;
            union {
              char b;
              DBL d;
            };
          } t;
          ptrdiff_t natural_alignment =
              (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
          //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
          // long)natural_alignment );
          KMP_DEBUG_ASSERT(
              (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
        }
#endif // KMP_DEBUG

        /* save the term in thread private dispatch structure */
        *(DBL *)&pr->u.p.parm3 = x;

        /* solve for the crossover point to the nearest integer i for which C_i
           <= chunk */
        {
          UT left, right, mid;
          long double p;

          /* estimate initial upper and lower bound */

          /* doesn't matter what value right is as long as it is positive, but
             it affects performance of the solver */
          right = 229;
          p = __kmp_pow<UT>(x, right);
          if (p > target) {
            do {
              p *= p;
              right <<= 1;
            } while (p > target && right < (1 << 27));
            /* lower bound is previous (failed) estimate of upper bound */
            left = right >> 1;
          } else {
            left = 0;
          }

          /* bisection root-finding method */
          while (left + 1 < right) {
            mid = (left + right) / 2;
            if (__kmp_pow<UT>(x, mid) > target) {
              left = mid;
            } else {
              right = mid;
            }
          } // while
          cross = right;
        }
        /* assert sanity of computed crossover point */
        KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
                   __kmp_pow<UT>(x, cross) <= target);

        /* save the crossover point in thread private dispatch structure */
        pr->u.p.parm2 = cross;

        // C75803
#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
#else
#define GUIDED_ANALYTICAL_WORKAROUND (x)
#endif
        /* dynamic-style scheduling offset */
        pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
                                 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
                        cross * chunk;
#if KMP_USE_X87CONTROL
        // restore FPCW
        _control87(oldFpcw, _MCW_PC);
#endif
      } // if
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
      pr->u.p.parm1 = tc;
    } // if
  } // case
  break;
  case kmp_sch_static_greedy:
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
         gtid));
    pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
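    // e.g. (illustrative) tc = 100, nproc = 8 -> parm1 = ceil(100/8) = 13
    // iterations handed to each thread in one block.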
    break;
  case kmp_sch_static_chunked:
  case kmp_sch_dynamic_chunked:
    if (pr->u.p.parm1 <= 0) {
      pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
    }
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
                   gtid));
    break;
  case kmp_sch_trapezoidal: {
    /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */

    T parm1, parm2, parm3, parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
              gtid));

    parm1 = chunk;

    /* F : size of the first cycle */
    parm2 = (tc / (2 * nproc));

    if (parm2 < 1) {
      parm2 = 1;
    }

    /* L : size of the last cycle. Make sure the last cycle is not larger
       than the first cycle. */
    if (parm1 < 1) {
      parm1 = 1;
    } else if (parm1 > parm2) {
      parm1 = parm2;
    }

    /* N : number of cycles */
    parm3 = (parm2 + parm1);
    parm3 = (2 * tc + parm3 - 1) / parm3;

    if (parm3 < 2) {
      parm3 = 2;
    }

    /* sigma : decreasing incr of the trapezoid */
    parm4 = (parm3 - 1);
    parm4 = (parm2 - parm1) / parm4;

    // pointless check, because parm4 >= 0 always
    // if ( parm4 < 0 ) {
    //    parm4 = 0;
    //}

    pr->u.p.parm1 = parm1;
    pr->u.p.parm2 = parm2;
    pr->u.p.parm3 = parm3;
    pr->u.p.parm4 = parm4;
  } // case
  break;

  default: {
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                KMP_HNT(GetNewerLibrary), // Hint
                __kmp_msg_null // Variadic argument list terminator
    );
  } break;
  } // switch
  pr->schedule = schedule;
}

#if KMP_USE_HIER_SCHED
template <typename T>
inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
                                             typename traits_t<T>::signed_t st);
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
                                            kmp_int32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_int32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
                                             kmp_uint32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
                                            kmp_int64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_int64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
                                             kmp_uint64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}

// free all the hierarchy scheduling memory associated with the team
void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
  int num_disp_buff =
      team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
  for (int i = 0; i < num_disp_buff; ++i) {
    // type does not matter here so use kmp_int32
    auto sh =
        reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
            &team->t.t_disp_buffer[i]);
    if (sh->hier) {
      sh->hier->deallocate();
      __kmp_free(sh->hier);
    }
  }
}
#endif

// UT - unsigned flavor of T, ST - signed flavor of T,
// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
template <typename T>
static void
__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
                    T ub, typename traits_t<T>::signed_t st,
                    typename traits_t<T>::signed_t chunk, int push_ws) {
  typedef typename traits_t<T>::unsigned_t UT;

  int active;
  kmp_info_t *th;
  kmp_team_t *team;
  kmp_uint32 my_buffer_index;
  dispatch_private_info_template<T> *pr;
  dispatch_shared_info_template<T> volatile *sh;

  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
                   sizeof(dispatch_private_info));
  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
                   sizeof(dispatch_shared_info));
  __kmp_assert_valid_gtid(gtid);

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_INIT();
#endif
#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
                            "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
                            traits_t<ST>::spec, traits_t<T>::spec,
                            traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;
  th->th.th_ident = loc;

  // Any half-decent optimizer will remove this test when the blocks are empty
  // since the macros expand to nothing
  // when statistics are disabled.
  if (schedule == __kmp_static) {
    KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
  } else {
    KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
  }

#if KMP_USE_HIER_SCHED
  // Initialize the scheduling hierarchy if requested in the OMP_SCHEDULE
  // environment variable.
  // Hierarchical scheduling does not work with ordered, so if ordered is
  // detected, then revert back to threaded scheduling.
  bool ordered;
  enum sched_type my_sched = schedule;
  my_buffer_index = th->th.th_dispatch->th_disp_index;
  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
      &th->th.th_dispatch
           ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
  if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
    my_sched =
        (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
  ordered = (kmp_ord_lower & my_sched);
  if (pr->flags.use_hier) {
    if (ordered) {
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
                     "Disabling hierarchical scheduling.\n",
                     gtid));
      pr->flags.use_hier = FALSE;
    }
  }
  if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
    // Don't use hierarchical for ordered parallel loops and don't
    // use the runtime hierarchy if one was specified in the program
    if (!ordered && !pr->flags.use_hier)
      __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
  }
#endif // KMP_USE_HIER_SCHED

#if USE_ITT_BUILD
  kmp_uint64 cur_chunk = chunk;
  int itt_need_metadata_reporting =
      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
      team->t.t_active_level == 1;
#endif
  if (!active) {
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
  } else {
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    my_buffer_index = th->th.th_dispatch->th_disp_index++;

    /* What happens when number of threads changes, need to resize buffer? */
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        &th->th.th_dispatch
             ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
                  my_buffer_index));
  }

  __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
#if USE_ITT_BUILD
                                &cur_chunk,
#endif
                                chunk, (T)th->th.th_team_nproc,
                                (T)th->th.th_info.ds.ds_tid);
  if (active) {
    if (pr->flags.ordered == 0) {
      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
    } else {
      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
    }
  }

  if (active) {
    /* The name of this buffer should be my_buffer_index when it's free to use
     * it */

    KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));
    __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
                           __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    // Note: KMP_WAIT() cannot be used there: buffer index and
    // my_buffer_index are *always* 32-bit integers.
    KMP_MB(); /* is this necessary? */
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));

    th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
    th->th.th_dispatch->th_dispatch_sh_current =
        CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
#if USE_ITT_BUILD
    if (pr->flags.ordered) {
      __kmp_itt_ordered_init(gtid);
    }
    // Report loop metadata
    if (itt_need_metadata_reporting) {
      // Only report metadata by master of active team at level 1
      kmp_uint64 schedtype = 0;
      switch (schedule) {
      case kmp_sch_static_chunked:
      case kmp_sch_static_balanced: // Chunk is calculated in the switch above
        break;
      case kmp_sch_static_greedy:
        cur_chunk = pr->u.p.parm1;
        break;
      case kmp_sch_dynamic_chunked:
        schedtype = 1;
        break;
      case kmp_sch_guided_iterative_chunked:
      case kmp_sch_guided_analytical_chunked:
      case kmp_sch_guided_simd:
        schedtype = 2;
        break;
      default:
        // Should we put this case under "static"?
        // case kmp_sch_static_steal:
        schedtype = 3;
        break;
      }
      __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
    }
#if KMP_USE_HIER_SCHED
    if (pr->flags.use_hier) {
      pr->u.p.count = 0;
      pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
    }
#endif // KMP_USE_HIER_SCHED
#endif /* USE_ITT_BUILD */
  }

#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
        "lb:%%%s ub:%%%s"
        " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
        " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
        traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
        traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
        traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
        traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
                  pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
                  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
                  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
    __kmp_str_free(&buff);
  }
#endif
#if (KMP_STATIC_STEAL_ENABLED)
  // It cannot be guaranteed that after execution of a loop with some other
  // schedule kind all the parm3 variables will contain the same value. Even if
  // all parm3 values were the same, a bad case could still exist, such as
  // using 0 and 1 rather than a program life-time increment. So a dedicated
  // variable is required. The 'static_steal_counter' is used.
  if (pr->schedule == kmp_sch_static_steal) {
    // Other threads will inspect this variable when searching for a victim.
    // This is a flag showing that other threads may steal from this thread
    // since then.
    volatile T *p = &pr->u.p.static_steal_counter;
    *p = *p + 1;
  }
#endif // ( KMP_STATIC_STEAL_ENABLED )

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
        &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
  }
#endif
  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
}

/* For ordered loops, either __kmp_dispatch_finish() should be called after
 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
 * every chunk of iterations. If the ordered section(s) were not executed
 * for this iteration (or every iteration in this chunk), we need to set the
 * ordered iteration counters so that the next thread can proceed. */
template <typename UT>
static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {

    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    if (pr->ordered_bumped) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      UT lower = pr->u.p.ordered_lower;

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
      KMP_MB(); /* is this necessary? */
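      // Descriptive note: at this point sh->u.s.ordered_iteration has reached
      // 'lower', i.e. all preceding iterations have signalled; the increment
      // below releases the thread waiting on the next ordered iteration.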
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
    } // if
  } // if
  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
}

#ifdef KMP_GOMP_COMPAT

template <typename UT>
static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {
    // int cid;
    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
    UT lower = pr->u.p.ordered_lower;
    UT upper = pr->u.p.ordered_upper;
    UT inc = upper - lower + 1;

    if (pr->ordered_bumped == inc) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      inc -= pr->ordered_bumped;

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d before wait: "
            "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));

      KMP_MB(); /* is this necessary? */
      KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
                      "ordered_bumped to zero\n",
                      gtid));
      pr->ordered_bumped = 0;
//!!!!! TODO check if the inc should be unsigned, or signed???
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d after wait: "
            "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
            traits_t<UT>::spec);
        KD_TRACE(1000,
                 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
    }
    // }
  }
  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
}

#endif /* KMP_GOMP_COMPAT */

template <typename T>
int __kmp_dispatch_next_algorithm(int gtid,
                                  dispatch_private_info_template<T> *pr,
                                  dispatch_shared_info_template<T> volatile *sh,
                                  kmp_int32 *p_last, T *p_lb, T *p_ub,
                                  typename traits_t<T>::signed_t *p_st, T nproc,
                                  T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::floating_t DBL;
  int status = 0;
  kmp_int32 last = 0;
  T start;
  ST incr;
  UT limit, trip, init;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
  KMP_DEBUG_ASSERT(pr);
  KMP_DEBUG_ASSERT(sh);
  KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff =
        __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
                         "sh:%%p nproc:%%%s tid:%%%s\n",
                         traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
    __kmp_str_free(&buff);
  }
#endif

  // zero trip count
  if (pr->u.p.tc == 0) {
    KD_TRACE(10,
             ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
              "zero status:%d\n",
              gtid, status));
    return 0;
  }

  switch (pr->schedule) {
#if (KMP_STATIC_STEAL_ENABLED)
  case kmp_sch_static_steal: {
    T chunk = pr->u.p.parm1;

    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    trip = pr->u.p.tc - 1;

    if (traits_t<T>::type_size > 4) {
      // use lock for 8-byte and CAS for 4-byte induction
      // variable. TODO (optional): check and use 16-byte CAS
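      // Descriptive note: under th_steal_lock, u.p.count is the next chunk
      // index this thread will claim and u.p.ub is one past its last owned
      // chunk; the owner advances count while thieves lower ub.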
      kmp_lock_t *lck = pr->u.p.th_steal_lock;
      KMP_DEBUG_ASSERT(lck != NULL);
      if (pr->u.p.count < (UT)pr->u.p.ub) {
        __kmp_acquire_lock(lck, gtid);
        // try to get own chunk of iterations
        init = (pr->u.p.count)++;
        status = (init < (UT)pr->u.p.ub);
        __kmp_release_lock(lck, gtid);
      } else {
        status = 0; // no own chunks
      }
      if (!status) { // try to steal
        kmp_info_t **other_threads = team->t.t_threads;
        int while_limit = pr->u.p.parm3;
        int while_index = 0;
        T id = pr->u.p.static_steal_counter; // loop id
        int idx = (th->th.th_dispatch->th_disp_index - 1) %
                  __kmp_dispatch_num_buffers; // current loop index
        // note: victim thread can potentially execute another loop
        // TODO: algorithm of searching for a victim
        // should be cleaned up and measured
        while ((!status) && (while_limit != ++while_index)) {
          dispatch_private_info_template<T> *victim;
          T remaining;
          T victimIdx = pr->u.p.parm4;
          T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
          victim = reinterpret_cast<dispatch_private_info_template<T> *>(
              &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
          KMP_DEBUG_ASSERT(victim);
          while ((victim == pr || id != victim->u.p.static_steal_counter) &&
                 oldVictimIdx != victimIdx) {
            victimIdx = (victimIdx + 1) % nproc;
            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
                &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
            KMP_DEBUG_ASSERT(victim);
          }
          if (victim == pr || id != victim->u.p.static_steal_counter) {
            continue; // try once more (nproc attempts in total)
            // no victim is ready yet to participate in stealing
            // because no victim passed kmp_init_dispatch yet
          }
          if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
            pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
            continue; // not enough chunks to steal, goto next victim
          }

          lck = victim->u.p.th_steal_lock;
          KMP_ASSERT(lck != NULL);
          __kmp_acquire_lock(lck, gtid);
          limit = victim->u.p.ub; // keep initial ub
          if (victim->u.p.count >= limit ||
              (remaining = limit - victim->u.p.count) < 2) {
            __kmp_release_lock(lck, gtid);
            pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
            continue; // not enough chunks to steal
          }
          // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or
          // by 1
          if (remaining > 3) {
            // steal 1/4 of remaining
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
            init = (victim->u.p.ub -= (remaining >> 2));
          } else {
            // steal 1 chunk of 2 or 3 remaining
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
            init = (victim->u.p.ub -= 1);
          }
          __kmp_release_lock(lck, gtid);

          KMP_DEBUG_ASSERT(init + 1 <= limit);
          pr->u.p.parm4 = victimIdx; // remember victim to steal from
          status = 1;
          while_index = 0;
          // now update own count and ub with stolen range but init chunk
          __kmp_acquire_lock(pr->u.p.th_steal_lock, gtid);
          pr->u.p.count = init + 1;
          pr->u.p.ub = limit;
          __kmp_release_lock(pr->u.p.th_steal_lock, gtid);
        } // while (search for victim)
      } // if (try to find victim and steal)
    } else {
      // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
      typedef union {
        struct {
          UT count;
          T ub;
        } p;
        kmp_int64 b;
      } union_i4;
      // All operations on 'count' or 'ub' must be combined atomically
      // together.
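      // Illustrative note: for T == kmp_int32 the (count, ub) pair occupies
      // one 8-byte word, so the single 64-bit CAS below updates both fields
      // consistently.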
      {
        union_i4 vold, vnew;
        vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
        vnew = vold;
        vnew.p.count++;
        while (!KMP_COMPARE_AND_STORE_ACQ64(
            (volatile kmp_int64 *)&pr->u.p.count,
            *VOLATILE_CAST(kmp_int64 *) & vold.b,
            *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
          KMP_CPU_PAUSE();
          vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
          vnew = vold;
          vnew.p.count++;
        }
        vnew = vold;
        init = vnew.p.count;
        status = (init < (UT)vnew.p.ub);
      }

      if (!status) {
        kmp_info_t **other_threads = team->t.t_threads;
        int while_limit = pr->u.p.parm3;
        int while_index = 0;
        T id = pr->u.p.static_steal_counter; // loop id
        int idx = (th->th.th_dispatch->th_disp_index - 1) %
                  __kmp_dispatch_num_buffers; // current loop index
        // note: victim thread can potentially execute another loop
        // TODO: algorithm of searching for a victim
        // should be cleaned up and measured
        while ((!status) && (while_limit != ++while_index)) {
          dispatch_private_info_template<T> *victim;
          union_i4 vold, vnew;
          kmp_int32 remaining;
          T victimIdx = pr->u.p.parm4;
          T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
          victim = reinterpret_cast<dispatch_private_info_template<T> *>(
              &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
          KMP_DEBUG_ASSERT(victim);
          while ((victim == pr || id != victim->u.p.static_steal_counter) &&
                 oldVictimIdx != victimIdx) {
            victimIdx = (victimIdx + 1) % nproc;
            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
                &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
            KMP_DEBUG_ASSERT(victim);
          }
          if (victim == pr || id != victim->u.p.static_steal_counter) {
            continue; // try once more (nproc attempts in total)
            // no victim is ready yet to participate in stealing
            // because no victim passed kmp_init_dispatch yet
          }
          pr->u.p.parm4 = victimIdx; // new victim found
          while (1) { // CAS loop if victim has enough chunks to steal
            vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
            vnew = vold;

            KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
            if (vnew.p.count >= (UT)vnew.p.ub ||
                (remaining = vnew.p.ub - vnew.p.count) < 2) {
              pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
              break; // not enough chunks to steal, goto next victim
            }
            if (remaining > 3) {
              vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
            } else {
              vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
            }
            KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
            // TODO: Should this be acquire or release?
            if (KMP_COMPARE_AND_STORE_ACQ64(
                    (volatile kmp_int64 *)&victim->u.p.count,
                    *VOLATILE_CAST(kmp_int64 *) & vold.b,
                    *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
              // stealing succeeded
              KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
                                        vold.p.ub - vnew.p.ub);
              status = 1;
              while_index = 0;
              // now update own count and ub
              init = vnew.p.ub;
              vold.p.count = init + 1;
#if KMP_ARCH_X86
              KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
#else
              *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
#endif
              break;
            } // if (check CAS result)
            KMP_CPU_PAUSE(); // CAS failed, repeatedly attempt
          } // while (try to steal from particular victim)
        } // while (search for victim)
      } // if (try to find victim and steal)
    } // if (4-byte induction variable)
    if (!status) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.parm2;
      init *= chunk;
      limit = chunk + init - 1;
      incr = pr->u.p.st;
      KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);

      KMP_DEBUG_ASSERT(init <= trip);
      if ((last = (limit >= trip)) != 0)
        limit = trip;
      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
    break;
  } // case
#endif // ( KMP_STATIC_STEAL_ENABLED )
  case kmp_sch_static_balanced: {
    KD_TRACE(
        10,
        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));
    /* check if thread has any iteration to do */
    if ((status = !pr->u.p.count) != 0) {
      pr->u.p.count = 1;
      *p_lb = pr->u.p.lb;
      *p_ub = pr->u.p.ub;
      last = pr->u.p.parm1;
      if (p_st != NULL)
        *p_st = pr->u.p.st;
    } else { /* no iterations to do */
      pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
    }
  } // case
  break;
  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
                                 merged here */
  case kmp_sch_static_chunked: {
    T parm1;

    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_static_[affinity|chunked] case\n",
                   gtid));
    parm1 = pr->u.p.parm1;

    trip = pr->u.p.tc - 1;
    init = parm1 * (pr->u.p.count + tid);

    if ((status = (init <= trip)) != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      limit = parm1 + init - 1;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      pr->u.p.count += nproc;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
  } // case
  break;

  case kmp_sch_dynamic_chunked: {
    T chunk = pr->u.p.parm1;

    KD_TRACE(
        100,
        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
         gtid));

    init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
    trip = pr->u.p.tc - 1;

    if ((status = (init <= trip)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.lb;
      limit = chunk + init - 1;
      incr = pr->u.p.st;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
  } // case
  break;

  case kmp_sch_guided_iterative_chunked: {
    T chunkspec = pr->u.p.parm1;
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
                   "iterative case\n",
                   gtid));
    trip = pr->u.p.tc;
    // Start atomic part of calculations
    while (1) {
      ST remaining; // signed, because can be < 0
      init = sh->u.s.iteration; // shared value
      remaining = trip - init;
      if (remaining <= 0) { // AC: need to compare with 0 first
        // nothing to do, don't try atomic op
        status = 0;
        break;
      }
      if ((T)remaining <
          pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
        // use dynamic-style schedule
        // atomically increment iterations, get old value
        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                 (ST)chunkspec);
        remaining = trip - init;
        if (remaining <= 0) {
          status = 0; // all iterations got by other threads
        } else {
          // got some iterations to work on
          status = 1;
          if ((T)remaining > chunkspec) {
            limit = init + chunkspec - 1;
          } else {
            last = 1; // the last chunk
            limit = init + remaining - 1;
          } // if
        } // if
        break;
      } // if
      limit = init +
              (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                               (ST)init, (ST)limit)) {
        // CAS was successful, chunk obtained
        status = 1;
        --limit;
        break;
      } // if
    } // while
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } // if
  } // case
  break;

  case kmp_sch_guided_simd: {
    // same as iterative but curr-chunk adjusted to be multiple of given
    // chunk
    T chunk = pr->u.p.parm1;
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
              gtid));
    trip = pr->u.p.tc;
    // Start atomic part of calculations
    while (1) {
      ST remaining; // signed, because can be < 0
      init = sh->u.s.iteration; // shared value
      remaining = trip - init;
      if (remaining <= 0) { // AC: need to compare with 0 first
        status = 0; // nothing to do, don't try atomic op
        break;
      }
      KMP_DEBUG_ASSERT(init % chunk == 0);
      // compare with K*nproc*(chunk+1), K=2 by default
      if ((T)remaining < pr->u.p.parm2) {
        // use dynamic-style schedule
        // atomically increment iterations, get old value
        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                 (ST)chunk);
        remaining = trip - init;
        if (remaining <= 0) {
          status = 0; // all iterations got by other threads
        } else {
          // got some iterations to work on
          status = 1;
          if ((T)remaining > chunk) {
            limit = init + chunk - 1;
          } else {
            last = 1; // the last chunk
            limit = init + remaining - 1;
          } // if
        } // if
        break;
      } // if
      // divide by K*nproc
      UT span = remaining * (*(double *)&pr->u.p.parm3);
      UT rem = span % chunk;
      if (rem) // adjust so that span%chunk == 0
        span += chunk - rem;
      limit = init + span;
      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                               (ST)init, (ST)limit)) {
        // CAS was successful, chunk obtained
        status = 1;
        --limit;
        break;
      } // if
    } // while
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } // if
  } // case
  break;

  case kmp_sch_guided_analytical_chunked: {
    T chunkspec = pr->u.p.parm1;
    UT chunkIdx;
#if KMP_USE_X87CONTROL
    /* for storing original FPCW value for Windows* OS on
       IA-32 architecture 8-byte version */
    unsigned int oldFpcw;
    unsigned int fpcwSet = 0;
#endif
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
                   gtid));

    trip = pr->u.p.tc;

    KMP_DEBUG_ASSERT(nproc > 1);
    KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);

    while (1) { /* this while loop is a safeguard against unexpected zero
                   chunk sizes */
      chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
      if (chunkIdx >= (UT)pr->u.p.parm2) {
        --trip;
        /* use dynamic-style scheduling */
        init = chunkIdx * chunkspec + pr->u.p.count;
        /* need to verify init > 0 in case of overflow in the above
         * calculation */
        if ((status = (init > 0 && init <= trip)) != 0) {
          limit = init + chunkspec - 1;

          if ((last = (limit >= trip)) != 0)
            limit = trip;
        }
        break;
      } else {
        /* use exponential-style scheduling */
        /* The following check is to workaround the lack of long double
           precision on Windows* OS.
           This check works around the possible effect that init != 0 for
           chunkIdx == 0.
         */
#if KMP_USE_X87CONTROL
        /* If we haven't already done so, save original
           FPCW and set precision to 64-bit, as Windows* OS
           on IA-32 architecture defaults to 53-bit */
        if (!fpcwSet) {
          oldFpcw = _control87(0, 0);
          _control87(_PC_64, _MCW_PC);
          fpcwSet = 0x30000;
        }
#endif
        if (chunkIdx) {
          init = __kmp_dispatch_guided_remaining<T>(
              trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
          KMP_DEBUG_ASSERT(init);
          init = trip - init;
        } else
          init = 0;
        limit = trip - __kmp_dispatch_guided_remaining<T>(
                           trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
        KMP_ASSERT(init <= limit);
        if (init < limit) {
          KMP_DEBUG_ASSERT(limit <= trip);
          --limit;
          status = 1;
          break;
        } // if
      } // if
    } // while (1)
#if KMP_USE_X87CONTROL
    /* restore FPCW if necessary
       AC: check fpcwSet flag first because oldFpcw can be uninitialized here
    */
    if (fpcwSet && (oldFpcw & fpcwSet))
      _control87(oldFpcw, _MCW_PC);
#endif
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      }
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    }
  } // case
  break;

  case kmp_sch_trapezoidal: {
    UT index;
    T parm2 = pr->u.p.parm2;
    T parm3 = pr->u.p.parm3;
    T parm4 = pr->u.p.parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
              gtid));

    index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);

    init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
    trip = pr->u.p.tc - 1;

    if ((status = ((T)index < parm3 && init <= trip)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.lb;
      limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
      incr = pr->u.p.st;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
  } // case
  break;
  default: {
    status = 0; // to avoid complaints on uninitialized variable use
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                KMP_HNT(GetNewerLibrary), // Hint
                __kmp_msg_null // Variadic argument list terminator
    );
  } break;
  } // switch
  if (p_last)
    *p_last = last;
#ifdef KMP_DEBUG
  if (pr->flags.ordered) {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
                            "ordered_lower:%%%s ordered_upper:%%%s\n",
                            traits_t<UT>::spec, traits_t<UT>::spec);
    KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
    __kmp_str_free(&buff);
  }
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
        "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 1828 KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st)); 1829 __kmp_str_free(&buff); 1830 } 1831 #endif 1832 return status; 1833 } 1834 1835 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more 1836 work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini() 1837 is not called. */ 1838 #if OMPT_SUPPORT && OMPT_OPTIONAL 1839 #define OMPT_LOOP_END \ 1840 if (status == 0) { \ 1841 if (ompt_enabled.ompt_callback_work) { \ 1842 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \ 1843 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \ 1844 ompt_callbacks.ompt_callback(ompt_callback_work)( \ 1845 ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \ 1846 &(task_info->task_data), 0, codeptr); \ 1847 } \ 1848 } 1849 // TODO: implement count 1850 #else 1851 #define OMPT_LOOP_END // no-op 1852 #endif 1853 1854 #if KMP_STATS_ENABLED 1855 #define KMP_STATS_LOOP_END \ 1856 { \ 1857 kmp_int64 u, l, t, i; \ 1858 l = (kmp_int64)(*p_lb); \ 1859 u = (kmp_int64)(*p_ub); \ 1860 i = (kmp_int64)(pr->u.p.st); \ 1861 if (status == 0) { \ 1862 t = 0; \ 1863 KMP_POP_PARTITIONED_TIMER(); \ 1864 } else if (i == 1) { \ 1865 if (u >= l) \ 1866 t = u - l + 1; \ 1867 else \ 1868 t = 0; \ 1869 } else if (i < 0) { \ 1870 if (l >= u) \ 1871 t = (l - u) / (-i) + 1; \ 1872 else \ 1873 t = 0; \ 1874 } else { \ 1875 if (u >= l) \ 1876 t = (u - l) / i + 1; \ 1877 else \ 1878 t = 0; \ 1879 } \ 1880 KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \ 1881 } 1882 #else 1883 #define KMP_STATS_LOOP_END /* Nothing */ 1884 #endif 1885 1886 template <typename T> 1887 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, 1888 T *p_lb, T *p_ub, 1889 typename traits_t<T>::signed_t *p_st 1890 #if OMPT_SUPPORT && OMPT_OPTIONAL 1891 , 1892 void *codeptr 1893 #endif 1894 ) { 1895 1896 typedef typename traits_t<T>::unsigned_t UT; 1897 typedef typename traits_t<T>::signed_t ST; 1898 // This is potentially slightly misleading, schedule(runtime) will appear here 1899 // even if the actual runtime schedule is static. (Which points out a 1900 // disadvantage of schedule(runtime): even when static scheduling is used it 1901 // costs more than a compile time choice to use static scheduling would.) 
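// Note on the KMP_STATS_LOOP_END accounting above: the iteration count of a
// returned chunk is reconstructed from the bounds and the stride, e.g. with
// *p_lb == 3, *p_ub == 11 and a stride of 4 the chunk is counted as
// (11 - 3) / 4 + 1 == 3 iterations (the numbers here are illustrative only).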
1902 KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling); 1903 1904 int status; 1905 dispatch_private_info_template<T> *pr; 1906 __kmp_assert_valid_gtid(gtid); 1907 kmp_info_t *th = __kmp_threads[gtid]; 1908 kmp_team_t *team = th->th.th_team; 1909 1910 KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL 1911 KD_TRACE( 1912 1000, 1913 ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n", 1914 gtid, p_lb, p_ub, p_st, p_last)); 1915 1916 if (team->t.t_serialized) { 1917 /* NOTE: serialize this dispatch because we are not at the active level */ 1918 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 1919 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 1920 KMP_DEBUG_ASSERT(pr); 1921 1922 if ((status = (pr->u.p.tc != 0)) == 0) { 1923 *p_lb = 0; 1924 *p_ub = 0; 1925 // if ( p_last != NULL ) 1926 // *p_last = 0; 1927 if (p_st != NULL) 1928 *p_st = 0; 1929 if (__kmp_env_consistency_check) { 1930 if (pr->pushed_ws != ct_none) { 1931 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 1932 } 1933 } 1934 } else if (pr->flags.nomerge) { 1935 kmp_int32 last; 1936 T start; 1937 UT limit, trip, init; 1938 ST incr; 1939 T chunk = pr->u.p.parm1; 1940 1941 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", 1942 gtid)); 1943 1944 init = chunk * pr->u.p.count++; 1945 trip = pr->u.p.tc - 1; 1946 1947 if ((status = (init <= trip)) == 0) { 1948 *p_lb = 0; 1949 *p_ub = 0; 1950 // if ( p_last != NULL ) 1951 // *p_last = 0; 1952 if (p_st != NULL) 1953 *p_st = 0; 1954 if (__kmp_env_consistency_check) { 1955 if (pr->pushed_ws != ct_none) { 1956 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 1957 } 1958 } 1959 } else { 1960 start = pr->u.p.lb; 1961 limit = chunk + init - 1; 1962 incr = pr->u.p.st; 1963 1964 if ((last = (limit >= trip)) != 0) { 1965 limit = trip; 1966 #if KMP_OS_WINDOWS 1967 pr->u.p.last_upper = pr->u.p.ub; 1968 #endif /* KMP_OS_WINDOWS */ 1969 } 1970 if (p_last != NULL) 1971 *p_last = last; 1972 if (p_st != NULL) 1973 *p_st = incr; 1974 if (incr == 1) { 1975 *p_lb = start + init; 1976 *p_ub = start + limit; 1977 } else { 1978 *p_lb = start + init * incr; 1979 *p_ub = start + limit * incr; 1980 } 1981 1982 if (pr->flags.ordered) { 1983 pr->u.p.ordered_lower = init; 1984 pr->u.p.ordered_upper = limit; 1985 #ifdef KMP_DEBUG 1986 { 1987 char *buff; 1988 // create format specifiers before the debug output 1989 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 1990 "ordered_lower:%%%s ordered_upper:%%%s\n", 1991 traits_t<UT>::spec, traits_t<UT>::spec); 1992 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 1993 pr->u.p.ordered_upper)); 1994 __kmp_str_free(&buff); 1995 } 1996 #endif 1997 } // if 1998 } // if 1999 } else { 2000 pr->u.p.tc = 0; 2001 *p_lb = pr->u.p.lb; 2002 *p_ub = pr->u.p.ub; 2003 #if KMP_OS_WINDOWS 2004 pr->u.p.last_upper = *p_ub; 2005 #endif /* KMP_OS_WINDOWS */ 2006 if (p_last != NULL) 2007 *p_last = TRUE; 2008 if (p_st != NULL) 2009 *p_st = pr->u.p.st; 2010 } // if 2011 #ifdef KMP_DEBUG 2012 { 2013 char *buff; 2014 // create format specifiers before the debug output 2015 buff = __kmp_str_format( 2016 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " 2017 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", 2018 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 2019 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status)); 2020 __kmp_str_free(&buff); 2021 } 2022 #endif 2023 #if INCLUDE_SSC_MARKS 2024 SSC_MARK_DISPATCH_NEXT(); 2025 #endif 2026 
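// Worked example for the serialized nomerge path above (illustrative values
// only): with tc == 10, chunk == 4, lb == 0 and st == 1, successive calls
// return [0,3], [4,7], [8,9], and the fourth call returns status == 0, at
// which point OMPT_LOOP_END below reports the end of the loop.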
OMPT_LOOP_END; 2027 KMP_STATS_LOOP_END; 2028 return status; 2029 } else { 2030 kmp_int32 last = 0; 2031 dispatch_shared_info_template<T> volatile *sh; 2032 2033 KMP_DEBUG_ASSERT(th->th.th_dispatch == 2034 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 2035 2036 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 2037 th->th.th_dispatch->th_dispatch_pr_current); 2038 KMP_DEBUG_ASSERT(pr); 2039 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 2040 th->th.th_dispatch->th_dispatch_sh_current); 2041 KMP_DEBUG_ASSERT(sh); 2042 2043 #if KMP_USE_HIER_SCHED 2044 if (pr->flags.use_hier) 2045 status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st); 2046 else 2047 #endif // KMP_USE_HIER_SCHED 2048 status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub, 2049 p_st, th->th.th_team_nproc, 2050 th->th.th_info.ds.ds_tid); 2051 // status == 0: no more iterations to execute 2052 if (status == 0) { 2053 UT num_done; 2054 2055 num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done); 2056 #ifdef KMP_DEBUG 2057 { 2058 char *buff; 2059 // create format specifiers before the debug output 2060 buff = __kmp_str_format( 2061 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", 2062 traits_t<UT>::spec); 2063 KD_TRACE(10, (buff, gtid, sh->u.s.num_done)); 2064 __kmp_str_free(&buff); 2065 } 2066 #endif 2067 2068 #if KMP_USE_HIER_SCHED 2069 pr->flags.use_hier = FALSE; 2070 #endif 2071 if ((ST)num_done == th->th.th_team_nproc - 1) { 2072 #if (KMP_STATIC_STEAL_ENABLED) 2073 if (pr->schedule == kmp_sch_static_steal && 2074 traits_t<T>::type_size > 4) { 2075 int i; 2076 int idx = (th->th.th_dispatch->th_disp_index - 1) % 2077 __kmp_dispatch_num_buffers; // current loop index 2078 kmp_info_t **other_threads = team->t.t_threads; 2079 // loop complete, safe to destroy locks used for stealing 2080 for (i = 0; i < th->th.th_team_nproc; ++i) { 2081 dispatch_private_info_template<T> *buf = 2082 reinterpret_cast<dispatch_private_info_template<T> *>( 2083 &other_threads[i]->th.th_dispatch->th_disp_buffer[idx]); 2084 kmp_lock_t *lck = buf->u.p.th_steal_lock; 2085 KMP_ASSERT(lck != NULL); 2086 __kmp_destroy_lock(lck); 2087 __kmp_free(lck); 2088 buf->u.p.th_steal_lock = NULL; 2089 } 2090 } 2091 #endif 2092 /* NOTE: release this buffer to be reused */ 2093 2094 KMP_MB(); /* Flush all pending memory write invalidates. */ 2095 2096 sh->u.s.num_done = 0; 2097 sh->u.s.iteration = 0; 2098 2099 /* TODO replace with general release procedure? */ 2100 if (pr->flags.ordered) { 2101 sh->u.s.ordered_iteration = 0; 2102 } 2103 2104 KMP_MB(); /* Flush all pending memory write invalidates. */ 2105 2106 sh->buffer_index += __kmp_dispatch_num_buffers; 2107 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", 2108 gtid, sh->buffer_index)); 2109 2110 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 2111 2112 } // if 2113 if (__kmp_env_consistency_check) { 2114 if (pr->pushed_ws != ct_none) { 2115 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 2116 } 2117 } 2118 2119 th->th.th_dispatch->th_deo_fcn = NULL; 2120 th->th.th_dispatch->th_dxo_fcn = NULL; 2121 th->th.th_dispatch->th_dispatch_sh_current = NULL; 2122 th->th.th_dispatch->th_dispatch_pr_current = NULL; 2123 } // if (status == 0) 2124 #if KMP_OS_WINDOWS 2125 else if (last) { 2126 pr->u.p.last_upper = pr->u.p.ub; 2127 } 2128 #endif /* KMP_OS_WINDOWS */ 2129 if (p_last != NULL && status != 0) 2130 *p_last = last; 2131 } // if 2132 2133 #ifdef KMP_DEBUG 2134 { 2135 char *buff; 2136 // create format specifiers before the debug output 2137 buff = __kmp_str_format( 2138 "__kmp_dispatch_next: T#%%d normal case: " 2139 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n", 2140 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 2141 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, 2142 (p_last ? *p_last : 0), status)); 2143 __kmp_str_free(&buff); 2144 } 2145 #endif 2146 #if INCLUDE_SSC_MARKS 2147 SSC_MARK_DISPATCH_NEXT(); 2148 #endif 2149 OMPT_LOOP_END; 2150 KMP_STATS_LOOP_END; 2151 return status; 2152 } 2153 2154 template <typename T> 2155 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid, 2156 kmp_int32 *plastiter, T *plower, T *pupper, 2157 typename traits_t<T>::signed_t incr) { 2158 typedef typename traits_t<T>::unsigned_t UT; 2159 kmp_uint32 team_id; 2160 kmp_uint32 nteams; 2161 UT trip_count; 2162 kmp_team_t *team; 2163 kmp_info_t *th; 2164 2165 KMP_DEBUG_ASSERT(plastiter && plower && pupper); 2166 KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid)); 2167 #ifdef KMP_DEBUG 2168 typedef typename traits_t<T>::signed_t ST; 2169 { 2170 char *buff; 2171 // create format specifiers before the debug output 2172 buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d " 2173 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n", 2174 traits_t<T>::spec, traits_t<T>::spec, 2175 traits_t<ST>::spec, traits_t<T>::spec); 2176 KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr)); 2177 __kmp_str_free(&buff); 2178 } 2179 #endif 2180 2181 if (__kmp_env_consistency_check) { 2182 if (incr == 0) { 2183 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, 2184 loc); 2185 } 2186 if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) { 2187 // The loop is illegal. 
2188 // Some zero-trip loops maintained by compiler, e.g.: 2189 // for(i=10;i<0;++i) // lower >= upper - run-time check 2190 // for(i=0;i>10;--i) // lower <= upper - run-time check 2191 // for(i=0;i>10;++i) // incr > 0 - compile-time check 2192 // for(i=10;i<0;--i) // incr < 0 - compile-time check 2193 // Compiler does not check the following illegal loops: 2194 // for(i=0;i<10;i+=incr) // where incr<0 2195 // for(i=10;i>0;i-=incr) // where incr<0 2196 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc); 2197 } 2198 } 2199 __kmp_assert_valid_gtid(gtid); 2200 th = __kmp_threads[gtid]; 2201 team = th->th.th_team; 2202 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct 2203 nteams = th->th.th_teams_size.nteams; 2204 team_id = team->t.t_master_tid; 2205 KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc); 2206 2207 // compute global trip count 2208 if (incr == 1) { 2209 trip_count = *pupper - *plower + 1; 2210 } else if (incr == -1) { 2211 trip_count = *plower - *pupper + 1; 2212 } else if (incr > 0) { 2213 // upper-lower can exceed the limit of signed type 2214 trip_count = (UT)(*pupper - *plower) / incr + 1; 2215 } else { 2216 trip_count = (UT)(*plower - *pupper) / (-incr) + 1; 2217 } 2218 2219 if (trip_count <= nteams) { 2220 KMP_DEBUG_ASSERT( 2221 __kmp_static == kmp_sch_static_greedy || 2222 __kmp_static == 2223 kmp_sch_static_balanced); // Unknown static scheduling type. 2224 // only some teams get single iteration, others get nothing 2225 if (team_id < trip_count) { 2226 *pupper = *plower = *plower + team_id * incr; 2227 } else { 2228 *plower = *pupper + incr; // zero-trip loop 2229 } 2230 if (plastiter != NULL) 2231 *plastiter = (team_id == trip_count - 1); 2232 } else { 2233 if (__kmp_static == kmp_sch_static_balanced) { 2234 UT chunk = trip_count / nteams; 2235 UT extras = trip_count % nteams; 2236 *plower += 2237 incr * (team_id * chunk + (team_id < extras ? team_id : extras)); 2238 *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr); 2239 if (plastiter != NULL) 2240 *plastiter = (team_id == nteams - 1); 2241 } else { 2242 T chunk_inc_count = 2243 (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr; 2244 T upper = *pupper; 2245 KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy); 2246 // Unknown static scheduling type. 2247 *plower += team_id * chunk_inc_count; 2248 *pupper = *plower + chunk_inc_count - incr; 2249 // Check/correct bounds if needed 2250 if (incr > 0) { 2251 if (*pupper < *plower) 2252 *pupper = traits_t<T>::max_value; 2253 if (plastiter != NULL) 2254 *plastiter = *plower <= upper && *pupper > upper - incr; 2255 if (*pupper > upper) 2256 *pupper = upper; // tracker C73258 2257 } else { 2258 if (*pupper > *plower) 2259 *pupper = traits_t<T>::min_value; 2260 if (plastiter != NULL) 2261 *plastiter = *plower >= upper && *pupper < upper - incr; 2262 if (*pupper < upper) 2263 *pupper = upper; // tracker C73258 2264 } 2265 } 2266 } 2267 } 2268 2269 //----------------------------------------------------------------------------- 2270 // Dispatch routines 2271 // Transfer call to template< type T > 2272 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule, 2273 // T lb, T ub, ST st, ST chunk ) 2274 extern "C" { 2275 2276 /*! 
@ingroup WORK_SHARING
@{
@param loc Source location
@param gtid Global thread id
@param schedule Schedule type
@param lb Lower bound
@param ub Upper bound
@param st Step (or increment)
@param chunk The chunk size to block with

This function prepares the runtime to start a dynamically scheduled for loop,
saving the loop arguments.
These functions are all identical apart from the types of the arguments.
*/

void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int32 lb,
                            kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint32 lb,
                             kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int64 lb,
                            kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint64 lb,
                             kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4

These functions differ from the __kmpc_dispatch_init set in that they are
called for the composite distribute parallel for construct, so the per-team
iteration space must be computed before regular iteration dispatching begins.

These functions are all identical apart from the types of the arguments.
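
For illustration only, assuming loc and gtid as documented above and a helper
body() standing in for the loop body (both hypothetical), a composite
distribute parallel for schedule(dynamic, 4) over the iterations 0..999 could
drive these entry points roughly as follows; real compiler-generated code
will differ:

@code
kmp_int32 last = 0, lb = 0, ub = 999, st = 1;
__kmpc_dist_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, &last, lb, ub,
                            st, 4);
while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
  for (kmp_int32 i = lb; i <= ub; i += st)
    body(i);
}
@endcode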
2348 */ 2349 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2350 enum sched_type schedule, kmp_int32 *p_last, 2351 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, 2352 kmp_int32 chunk) { 2353 KMP_DEBUG_ASSERT(__kmp_init_serial); 2354 #if OMPT_SUPPORT && OMPT_OPTIONAL 2355 OMPT_STORE_RETURN_ADDRESS(gtid); 2356 #endif 2357 __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st); 2358 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2359 } 2360 2361 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2362 enum sched_type schedule, kmp_int32 *p_last, 2363 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, 2364 kmp_int32 chunk) { 2365 KMP_DEBUG_ASSERT(__kmp_init_serial); 2366 #if OMPT_SUPPORT && OMPT_OPTIONAL 2367 OMPT_STORE_RETURN_ADDRESS(gtid); 2368 #endif 2369 __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st); 2370 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2371 } 2372 2373 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2374 enum sched_type schedule, kmp_int32 *p_last, 2375 kmp_int64 lb, kmp_int64 ub, kmp_int64 st, 2376 kmp_int64 chunk) { 2377 KMP_DEBUG_ASSERT(__kmp_init_serial); 2378 #if OMPT_SUPPORT && OMPT_OPTIONAL 2379 OMPT_STORE_RETURN_ADDRESS(gtid); 2380 #endif 2381 __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st); 2382 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2383 } 2384 2385 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2386 enum sched_type schedule, kmp_int32 *p_last, 2387 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, 2388 kmp_int64 chunk) { 2389 KMP_DEBUG_ASSERT(__kmp_init_serial); 2390 #if OMPT_SUPPORT && OMPT_OPTIONAL 2391 OMPT_STORE_RETURN_ADDRESS(gtid); 2392 #endif 2393 __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st); 2394 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2395 } 2396 2397 /*! 2398 @param loc Source code location 2399 @param gtid Global thread id 2400 @param p_last Pointer to a flag set to one if this is the last chunk or zero 2401 otherwise 2402 @param p_lb Pointer to the lower bound for the next chunk of work 2403 @param p_ub Pointer to the upper bound for the next chunk of work 2404 @param p_st Pointer to the stride for the next chunk of work 2405 @return one if there is work to be done, zero otherwise 2406 2407 Get the next dynamically allocated chunk of work for this thread. 2408 If there is no more work, then the lb,ub and stride need not be modified. 2409 */ 2410 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2411 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) { 2412 #if OMPT_SUPPORT && OMPT_OPTIONAL 2413 OMPT_STORE_RETURN_ADDRESS(gtid); 2414 #endif 2415 return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st 2416 #if OMPT_SUPPORT && OMPT_OPTIONAL 2417 , 2418 OMPT_LOAD_RETURN_ADDRESS(gtid) 2419 #endif 2420 ); 2421 } 2422 2423 /*! 2424 See @ref __kmpc_dispatch_next_4 2425 */ 2426 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2427 kmp_uint32 *p_lb, kmp_uint32 *p_ub, 2428 kmp_int32 *p_st) { 2429 #if OMPT_SUPPORT && OMPT_OPTIONAL 2430 OMPT_STORE_RETURN_ADDRESS(gtid); 2431 #endif 2432 return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st 2433 #if OMPT_SUPPORT && OMPT_OPTIONAL 2434 , 2435 OMPT_LOAD_RETURN_ADDRESS(gtid) 2436 #endif 2437 ); 2438 } 2439 2440 /*! 
2441 See @ref __kmpc_dispatch_next_4 2442 */ 2443 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2444 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) { 2445 #if OMPT_SUPPORT && OMPT_OPTIONAL 2446 OMPT_STORE_RETURN_ADDRESS(gtid); 2447 #endif 2448 return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st 2449 #if OMPT_SUPPORT && OMPT_OPTIONAL 2450 , 2451 OMPT_LOAD_RETURN_ADDRESS(gtid) 2452 #endif 2453 ); 2454 } 2455 2456 /*! 2457 See @ref __kmpc_dispatch_next_4 2458 */ 2459 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2460 kmp_uint64 *p_lb, kmp_uint64 *p_ub, 2461 kmp_int64 *p_st) { 2462 #if OMPT_SUPPORT && OMPT_OPTIONAL 2463 OMPT_STORE_RETURN_ADDRESS(gtid); 2464 #endif 2465 return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st 2466 #if OMPT_SUPPORT && OMPT_OPTIONAL 2467 , 2468 OMPT_LOAD_RETURN_ADDRESS(gtid) 2469 #endif 2470 ); 2471 } 2472 2473 /*! 2474 @param loc Source code location 2475 @param gtid Global thread id 2476 2477 Mark the end of a dynamic loop. 2478 */ 2479 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) { 2480 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2481 } 2482 2483 /*! 2484 See @ref __kmpc_dispatch_fini_4 2485 */ 2486 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) { 2487 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2488 } 2489 2490 /*! 2491 See @ref __kmpc_dispatch_fini_4 2492 */ 2493 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) { 2494 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2495 } 2496 2497 /*! 2498 See @ref __kmpc_dispatch_fini_4 2499 */ 2500 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) { 2501 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2502 } 2503 /*! @} */ 2504 2505 //----------------------------------------------------------------------------- 2506 // Non-template routines from kmp_dispatch.cpp used in other sources 2507 2508 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) { 2509 return value == checker; 2510 } 2511 2512 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) { 2513 return value != checker; 2514 } 2515 2516 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) { 2517 return value < checker; 2518 } 2519 2520 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) { 2521 return value >= checker; 2522 } 2523 2524 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) { 2525 return value <= checker; 2526 } 2527 2528 kmp_uint32 2529 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker, 2530 kmp_uint32 (*pred)(kmp_uint32, kmp_uint32), 2531 void *obj // Higher-level synchronization object, or NULL. 2532 ) { 2533 // note: we may not belong to a team at this point 2534 volatile kmp_uint32 *spin = spinner; 2535 kmp_uint32 check = checker; 2536 kmp_uint32 spins; 2537 kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred; 2538 kmp_uint32 r; 2539 2540 KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin)); 2541 KMP_INIT_YIELD(spins); 2542 // main wait spin loop 2543 while (!f(r = TCR_4(*spin), check)) { 2544 KMP_FSYNC_SPIN_PREPARE(obj); 2545 /* GEH - remove this since it was accidentally introduced when kmp_wait was 2546 split. 
   It causes problems with infinite recursion because of exit lock */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
        __kmp_abort_thread(); */
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}

void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
                      kmp_uint32 (*pred)(void *, kmp_uint32),
                      void *obj // Higher-level synchronization object, or NULL.
) {
  // note: we may not belong to a team at this point
  void *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(void *, kmp_uint32) = pred;

  KMP_FSYNC_SPIN_INIT(obj, spin);
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* if we have waited a bit, or are oversubscribed, yield */
    /* pause is in the following code */
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
}

} // extern "C"

#ifdef KMP_GOMP_COMPAT

void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int32 lb,
                               kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint32 lb,
                                kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int64 lb,
                               kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint64 lb,
                                kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

#endif /* KMP_GOMP_COMPAT */

/* ------------------------------------------------------------------------ */
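
/* Illustrative sketch only, guarded out of the build: how a caller elsewhere
   in the runtime might combine the predicate helpers above with __kmp_wait_4
   to spin until a shared counter reaches a threshold. The function and
   variable names below are hypothetical. */
#if 0
static kmp_uint32 __kmp_example_wait_until_ge(volatile kmp_uint32 *counter,
                                              kmp_uint32 threshold) {
  // Spin (yielding when oversubscribed) until *counter >= threshold, then
  // return the value that satisfied the predicate.
  return __kmp_wait_4(counter, threshold, __kmp_ge_4, NULL);
}
#endif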