/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

/* Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is constant inside any dispatch loop, but it may change
 * value between parallel regions. __kmp_max_nth is the largest value
 * __kmp_nth may take, 1 is the smallest.
 */

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_USE_X87CONTROL
#include <float.h>
#endif
#include "kmp_lock.h"
#include "kmp_dispatch.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(gtid_ref);

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_root->r.r_active &&
        (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }
}

void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
      __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
    }
  }
}

// Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
                                         bool use_hier = false) {
  // Pick up the nonmonotonic/monotonic bits from the scheduling type
  // TODO: make nonmonotonic when static_steal is fixed
  int monotonicity = SCHEDULE_MONOTONIC;

  // Let the default be monotonic for executables
  // compiled with an OpenMP* 4.5 or earlier compiler
  if (loc->get_openmp_version() < 50)
    monotonicity = SCHEDULE_MONOTONIC;

  if (use_hier)
    monotonicity = SCHEDULE_MONOTONIC;
  else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
    monotonicity = SCHEDULE_NONMONOTONIC;
  else if (SCHEDULE_HAS_MONOTONIC(schedule))
    monotonicity = SCHEDULE_MONOTONIC;

  return monotonicity;
}

// Initialize a dispatch_private_info_template<T> buffer for a particular
// type of schedule and chunk. The loop description is found in lb (lower bound),
// ub (upper bound), and st (stride). nproc is the number of threads relevant
// to the scheduling (often the number of threads in a team, but not always if
// hierarchical scheduling is used). tid is the id of the thread calling
// the function within the group of nproc threads. It will have a value
// between 0 and nproc - 1.
This is often just the thread id within a team, but 100 // is not necessarily the case when using hierarchical scheduling. 101 // loc is the source file location of the corresponding loop 102 // gtid is the global thread id 103 template <typename T> 104 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, 105 dispatch_private_info_template<T> *pr, 106 enum sched_type schedule, T lb, T ub, 107 typename traits_t<T>::signed_t st, 108 #if USE_ITT_BUILD 109 kmp_uint64 *cur_chunk, 110 #endif 111 typename traits_t<T>::signed_t chunk, 112 T nproc, T tid) { 113 typedef typename traits_t<T>::unsigned_t UT; 114 typedef typename traits_t<T>::floating_t DBL; 115 116 int active; 117 T tc; 118 kmp_info_t *th; 119 kmp_team_t *team; 120 int monotonicity; 121 bool use_hier; 122 123 #ifdef KMP_DEBUG 124 typedef typename traits_t<T>::signed_t ST; 125 { 126 char *buff; 127 // create format specifiers before the debug output 128 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called " 129 "pr:%%p lb:%%%s ub:%%%s st:%%%s " 130 "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n", 131 traits_t<T>::spec, traits_t<T>::spec, 132 traits_t<ST>::spec, traits_t<ST>::spec, 133 traits_t<T>::spec, traits_t<T>::spec); 134 KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid)); 135 __kmp_str_free(&buff); 136 } 137 #endif 138 /* setup data */ 139 th = __kmp_threads[gtid]; 140 team = th->th.th_team; 141 active = !team->t.t_serialized; 142 143 #if USE_ITT_BUILD 144 int itt_need_metadata_reporting = 145 __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 146 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && 147 team->t.t_active_level == 1; 148 #endif 149 150 #if KMP_USE_HIER_SCHED 151 use_hier = pr->flags.use_hier; 152 #else 153 use_hier = false; 154 #endif 155 156 /* Pick up the nonmonotonic/monotonic bits from the scheduling type */ 157 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier); 158 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 159 160 /* Pick up the nomerge/ordered bits from the scheduling type */ 161 if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) { 162 pr->flags.nomerge = TRUE; 163 schedule = 164 (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); 165 } else { 166 pr->flags.nomerge = FALSE; 167 } 168 pr->type_size = traits_t<T>::type_size; // remember the size of variables 169 if (kmp_ord_lower & schedule) { 170 pr->flags.ordered = TRUE; 171 schedule = 172 (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); 173 } else { 174 pr->flags.ordered = FALSE; 175 } 176 // Ordered overrides nonmonotonic 177 if (pr->flags.ordered) { 178 monotonicity = SCHEDULE_MONOTONIC; 179 } 180 181 if (schedule == kmp_sch_static) { 182 schedule = __kmp_static; 183 } else { 184 if (schedule == kmp_sch_runtime) { 185 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if 186 // not specified) 187 schedule = team->t.t_sched.r_sched_type; 188 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier); 189 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 190 // Detail the schedule if needed (global controls are differentiated 191 // appropriately) 192 if (schedule == kmp_sch_guided_chunked) { 193 schedule = __kmp_guided; 194 } else if (schedule == kmp_sch_static) { 195 schedule = __kmp_static; 196 } 197 // Use the chunk size specified by OMP_SCHEDULE (or default if not 198 // specified) 199 chunk = team->t.t_sched.chunk; 200 #if USE_ITT_BUILD 201 if (cur_chunk) 202 *cur_chunk = chunk; 203 #endif 204 #ifdef 
KMP_DEBUG 205 { 206 char *buff; 207 // create format specifiers before the debug output 208 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: " 209 "schedule:%%d chunk:%%%s\n", 210 traits_t<ST>::spec); 211 KD_TRACE(10, (buff, gtid, schedule, chunk)); 212 __kmp_str_free(&buff); 213 } 214 #endif 215 } else { 216 if (schedule == kmp_sch_guided_chunked) { 217 schedule = __kmp_guided; 218 } 219 if (chunk <= 0) { 220 chunk = KMP_DEFAULT_CHUNK; 221 } 222 } 223 224 if (schedule == kmp_sch_auto) { 225 // mapping and differentiation: in the __kmp_do_serial_initialize() 226 schedule = __kmp_auto; 227 #ifdef KMP_DEBUG 228 { 229 char *buff; 230 // create format specifiers before the debug output 231 buff = __kmp_str_format( 232 "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: " 233 "schedule:%%d chunk:%%%s\n", 234 traits_t<ST>::spec); 235 KD_TRACE(10, (buff, gtid, schedule, chunk)); 236 __kmp_str_free(&buff); 237 } 238 #endif 239 } 240 #if KMP_STATIC_STEAL_ENABLED 241 // map nonmonotonic:dynamic to static steal 242 if (schedule == kmp_sch_dynamic_chunked) { 243 if (monotonicity == SCHEDULE_NONMONOTONIC) 244 schedule = kmp_sch_static_steal; 245 } 246 #endif 247 /* guided analytical not safe for too many threads */ 248 if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) { 249 schedule = kmp_sch_guided_iterative_chunked; 250 KMP_WARNING(DispatchManyThreads); 251 } 252 if (schedule == kmp_sch_runtime_simd) { 253 // compiler provides simd_width in the chunk parameter 254 schedule = team->t.t_sched.r_sched_type; 255 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier); 256 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 257 // Detail the schedule if needed (global controls are differentiated 258 // appropriately) 259 if (schedule == kmp_sch_static || schedule == kmp_sch_auto || 260 schedule == __kmp_static) { 261 schedule = kmp_sch_static_balanced_chunked; 262 } else { 263 if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) { 264 schedule = kmp_sch_guided_simd; 265 } 266 chunk = team->t.t_sched.chunk * chunk; 267 } 268 #if USE_ITT_BUILD 269 if (cur_chunk) 270 *cur_chunk = chunk; 271 #endif 272 #ifdef KMP_DEBUG 273 { 274 char *buff; 275 // create format specifiers before the debug output 276 buff = __kmp_str_format( 277 "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d" 278 " chunk:%%%s\n", 279 traits_t<ST>::spec); 280 KD_TRACE(10, (buff, gtid, schedule, chunk)); 281 __kmp_str_free(&buff); 282 } 283 #endif 284 } 285 pr->u.p.parm1 = chunk; 286 } 287 KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper), 288 "unknown scheduling type"); 289 290 pr->u.p.count = 0; 291 292 if (__kmp_env_consistency_check) { 293 if (st == 0) { 294 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, 295 (pr->flags.ordered ? 
ct_pdo_ordered : ct_pdo), loc); 296 } 297 } 298 // compute trip count 299 if (st == 1) { // most common case 300 if (ub >= lb) { 301 tc = ub - lb + 1; 302 } else { // ub < lb 303 tc = 0; // zero-trip 304 } 305 } else if (st < 0) { 306 if (lb >= ub) { 307 // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B), 308 // where the division needs to be unsigned regardless of the result type 309 tc = (UT)(lb - ub) / (-st) + 1; 310 } else { // lb < ub 311 tc = 0; // zero-trip 312 } 313 } else { // st > 0 314 if (ub >= lb) { 315 // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B), 316 // where the division needs to be unsigned regardless of the result type 317 tc = (UT)(ub - lb) / st + 1; 318 } else { // ub < lb 319 tc = 0; // zero-trip 320 } 321 } 322 323 #if KMP_STATS_ENABLED 324 if (KMP_MASTER_GTID(gtid)) { 325 KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc); 326 } 327 #endif 328 329 pr->u.p.lb = lb; 330 pr->u.p.ub = ub; 331 pr->u.p.st = st; 332 pr->u.p.tc = tc; 333 334 #if KMP_OS_WINDOWS 335 pr->u.p.last_upper = ub + st; 336 #endif /* KMP_OS_WINDOWS */ 337 338 /* NOTE: only the active parallel region(s) has active ordered sections */ 339 340 if (active) { 341 if (pr->flags.ordered) { 342 pr->ordered_bumped = 0; 343 pr->u.p.ordered_lower = 1; 344 pr->u.p.ordered_upper = 0; 345 } 346 } 347 348 switch (schedule) { 349 #if (KMP_STATIC_STEAL_ENABLED) 350 case kmp_sch_static_steal: { 351 T ntc, init; 352 353 KD_TRACE(100, 354 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n", 355 gtid)); 356 357 ntc = (tc % chunk ? 1 : 0) + tc / chunk; 358 if (nproc > 1 && ntc >= nproc) { 359 KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL); 360 T id = tid; 361 T small_chunk, extras; 362 363 small_chunk = ntc / nproc; 364 extras = ntc % nproc; 365 366 init = id * small_chunk + (id < extras ? id : extras); 367 pr->u.p.count = init; 368 pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0); 369 370 pr->u.p.parm2 = lb; 371 // parm3 is the number of times to attempt stealing which is 372 // proportional to the number of chunks per thread up until 373 // the maximum value of nproc. 374 pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc); 375 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid 376 pr->u.p.st = st; 377 if (traits_t<T>::type_size > 4) { 378 // AC: TODO: check if 16-byte CAS available and use it to 379 // improve performance (probably wait for explicit request 380 // before spending time on this). 381 // For now use dynamically allocated per-thread lock, 382 // free memory in __kmp_dispatch_next when status==0. 
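      // Worked example (editorial illustration, not part of the algorithm):
      // with tc = 100 iterations and chunk = 10, ntc = 10 chunks; for
      // nproc = 4 threads, small_chunk = 2 and extras = 2, so the initial
      // [count, ub) chunk ranges computed above are
      //   tid 0 -> [0, 3), tid 1 -> [3, 6), tid 2 -> [6, 8), tid 3 -> [8, 10),
      // i.e. the first 'extras' threads own one extra chunk each, and
      // parm3 = KMP_MIN(small_chunk + extras, nproc) = 4 steal attempts.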
383 KMP_DEBUG_ASSERT(pr->u.p.th_steal_lock == NULL); 384 pr->u.p.th_steal_lock = 385 (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t)); 386 __kmp_init_lock(pr->u.p.th_steal_lock); 387 } 388 break; 389 } else { 390 /* too few chunks: switching to kmp_sch_dynamic_chunked */ 391 schedule = kmp_sch_dynamic_chunked; 392 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to " 393 "kmp_sch_dynamic_chunked\n", 394 gtid)); 395 if (pr->u.p.parm1 <= 0) 396 pr->u.p.parm1 = KMP_DEFAULT_CHUNK; 397 break; 398 } // if 399 } // case 400 #endif 401 case kmp_sch_static_balanced: { 402 T init, limit; 403 404 KD_TRACE( 405 100, 406 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n", 407 gtid)); 408 409 if (nproc > 1) { 410 T id = tid; 411 412 if (tc < nproc) { 413 if (id < tc) { 414 init = id; 415 limit = id; 416 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ 417 } else { 418 pr->u.p.count = 1; /* means no more chunks to execute */ 419 pr->u.p.parm1 = FALSE; 420 break; 421 } 422 } else { 423 T small_chunk = tc / nproc; 424 T extras = tc % nproc; 425 init = id * small_chunk + (id < extras ? id : extras); 426 limit = init + small_chunk - (id < extras ? 0 : 1); 427 pr->u.p.parm1 = (id == nproc - 1); 428 } 429 } else { 430 if (tc > 0) { 431 init = 0; 432 limit = tc - 1; 433 pr->u.p.parm1 = TRUE; 434 } else { 435 // zero trip count 436 pr->u.p.count = 1; /* means no more chunks to execute */ 437 pr->u.p.parm1 = FALSE; 438 break; 439 } 440 } 441 #if USE_ITT_BUILD 442 // Calculate chunk for metadata report 443 if (itt_need_metadata_reporting) 444 if (cur_chunk) 445 *cur_chunk = limit - init + 1; 446 #endif 447 if (st == 1) { 448 pr->u.p.lb = lb + init; 449 pr->u.p.ub = lb + limit; 450 } else { 451 // calculated upper bound, "ub" is user-defined upper bound 452 T ub_tmp = lb + limit * st; 453 pr->u.p.lb = lb + init * st; 454 // adjust upper bound to "ub" if needed, so that MS lastprivate will match 455 // it exactly 456 if (st > 0) { 457 pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp); 458 } else { 459 pr->u.p.ub = (ub_tmp + st < ub ? 
ub : ub_tmp); 460 } 461 } 462 if (pr->flags.ordered) { 463 pr->u.p.ordered_lower = init; 464 pr->u.p.ordered_upper = limit; 465 } 466 break; 467 } // case 468 case kmp_sch_static_balanced_chunked: { 469 // similar to balanced, but chunk adjusted to multiple of simd width 470 T nth = nproc; 471 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)" 472 " -> falling-through to static_greedy\n", 473 gtid)); 474 schedule = kmp_sch_static_greedy; 475 if (nth > 1) 476 pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1); 477 else 478 pr->u.p.parm1 = tc; 479 break; 480 } // case 481 case kmp_sch_guided_simd: 482 case kmp_sch_guided_iterative_chunked: { 483 KD_TRACE( 484 100, 485 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked" 486 " case\n", 487 gtid)); 488 489 if (nproc > 1) { 490 if ((2L * chunk + 1) * nproc >= tc) { 491 /* chunk size too large, switch to dynamic */ 492 schedule = kmp_sch_dynamic_chunked; 493 } else { 494 // when remaining iters become less than parm2 - switch to dynamic 495 pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1); 496 *(double *)&pr->u.p.parm3 = 497 guided_flt_param / nproc; // may occupy parm3 and parm4 498 } 499 } else { 500 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 501 "kmp_sch_static_greedy\n", 502 gtid)); 503 schedule = kmp_sch_static_greedy; 504 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 505 KD_TRACE( 506 100, 507 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n", 508 gtid)); 509 pr->u.p.parm1 = tc; 510 } // if 511 } // case 512 break; 513 case kmp_sch_guided_analytical_chunked: { 514 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d " 515 "kmp_sch_guided_analytical_chunked case\n", 516 gtid)); 517 518 if (nproc > 1) { 519 if ((2L * chunk + 1) * nproc >= tc) { 520 /* chunk size too large, switch to dynamic */ 521 schedule = kmp_sch_dynamic_chunked; 522 } else { 523 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 524 DBL x; 525 526 #if KMP_USE_X87CONTROL 527 /* Linux* OS already has 64-bit computation by default for long double, 528 and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On 529 Windows* OS on IA-32 architecture, we need to set precision to 64-bit 530 instead of the default 53-bit. Even though long double doesn't work 531 on Windows* OS on Intel(R) 64, the resulting lack of precision is not 532 expected to impact the correctness of the algorithm, but this has not 533 been mathematically proven. 
*/ 534 // save original FPCW and set precision to 64-bit, as 535 // Windows* OS on IA-32 architecture defaults to 53-bit 536 unsigned int oldFpcw = _control87(0, 0); 537 _control87(_PC_64, _MCW_PC); // 0,0x30000 538 #endif 539 /* value used for comparison in solver for cross-over point */ 540 long double target = ((long double)chunk * 2 + 1) * nproc / tc; 541 542 /* crossover point--chunk indexes equal to or greater than 543 this point switch to dynamic-style scheduling */ 544 UT cross; 545 546 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 547 x = (long double)1.0 - (long double)0.5 / nproc; 548 549 #ifdef KMP_DEBUG 550 { // test natural alignment 551 struct _test_a { 552 char a; 553 union { 554 char b; 555 DBL d; 556 }; 557 } t; 558 ptrdiff_t natural_alignment = 559 (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; 560 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long 561 // long)natural_alignment ); 562 KMP_DEBUG_ASSERT( 563 (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0); 564 } 565 #endif // KMP_DEBUG 566 567 /* save the term in thread private dispatch structure */ 568 *(DBL *)&pr->u.p.parm3 = x; 569 570 /* solve for the crossover point to the nearest integer i for which C_i 571 <= chunk */ 572 { 573 UT left, right, mid; 574 long double p; 575 576 /* estimate initial upper and lower bound */ 577 578 /* doesn't matter what value right is as long as it is positive, but 579 it affects performance of the solver */ 580 right = 229; 581 p = __kmp_pow<UT>(x, right); 582 if (p > target) { 583 do { 584 p *= p; 585 right <<= 1; 586 } while (p > target && right < (1 << 27)); 587 /* lower bound is previous (failed) estimate of upper bound */ 588 left = right >> 1; 589 } else { 590 left = 0; 591 } 592 593 /* bisection root-finding method */ 594 while (left + 1 < right) { 595 mid = (left + right) / 2; 596 if (__kmp_pow<UT>(x, mid) > target) { 597 left = mid; 598 } else { 599 right = mid; 600 } 601 } // while 602 cross = right; 603 } 604 /* assert sanity of computed crossover point */ 605 KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target && 606 __kmp_pow<UT>(x, cross) <= target); 607 608 /* save the crossover point in thread private dispatch structure */ 609 pr->u.p.parm2 = cross; 610 611 // C75803 612 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8)) 613 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3) 614 #else 615 #define GUIDED_ANALYTICAL_WORKAROUND (x) 616 #endif 617 /* dynamic-style scheduling offset */ 618 pr->u.p.count = tc - __kmp_dispatch_guided_remaining( 619 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - 620 cross * chunk; 621 #if KMP_USE_X87CONTROL 622 // restore FPCW 623 _control87(oldFpcw, _MCW_PC); 624 #endif 625 } // if 626 } else { 627 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 628 "kmp_sch_static_greedy\n", 629 gtid)); 630 schedule = kmp_sch_static_greedy; 631 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 632 pr->u.p.parm1 = tc; 633 } // if 634 } // case 635 break; 636 case kmp_sch_static_greedy: 637 KD_TRACE( 638 100, 639 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n", 640 gtid)); 641 pr->u.p.parm1 = (nproc > 1) ? 
(tc + nproc - 1) / nproc : tc; 642 break; 643 case kmp_sch_static_chunked: 644 case kmp_sch_dynamic_chunked: 645 if (pr->u.p.parm1 <= 0) { 646 pr->u.p.parm1 = KMP_DEFAULT_CHUNK; 647 } 648 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d " 649 "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", 650 gtid)); 651 break; 652 case kmp_sch_trapezoidal: { 653 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ 654 655 T parm1, parm2, parm3, parm4; 656 KD_TRACE(100, 657 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n", 658 gtid)); 659 660 parm1 = chunk; 661 662 /* F : size of the first cycle */ 663 parm2 = (tc / (2 * nproc)); 664 665 if (parm2 < 1) { 666 parm2 = 1; 667 } 668 669 /* L : size of the last cycle. Make sure the last cycle is not larger 670 than the first cycle. */ 671 if (parm1 < 1) { 672 parm1 = 1; 673 } else if (parm1 > parm2) { 674 parm1 = parm2; 675 } 676 677 /* N : number of cycles */ 678 parm3 = (parm2 + parm1); 679 parm3 = (2 * tc + parm3 - 1) / parm3; 680 681 if (parm3 < 2) { 682 parm3 = 2; 683 } 684 685 /* sigma : decreasing incr of the trapezoid */ 686 parm4 = (parm3 - 1); 687 parm4 = (parm2 - parm1) / parm4; 688 689 // pointless check, because parm4 >= 0 always 690 // if ( parm4 < 0 ) { 691 // parm4 = 0; 692 //} 693 694 pr->u.p.parm1 = parm1; 695 pr->u.p.parm2 = parm2; 696 pr->u.p.parm3 = parm3; 697 pr->u.p.parm4 = parm4; 698 } // case 699 break; 700 701 default: { 702 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 703 KMP_HNT(GetNewerLibrary), // Hint 704 __kmp_msg_null // Variadic argument list terminator 705 ); 706 } break; 707 } // switch 708 pr->schedule = schedule; 709 } 710 711 #if KMP_USE_HIER_SCHED 712 template <typename T> 713 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub, 714 typename traits_t<T>::signed_t st); 715 template <> 716 inline void 717 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb, 718 kmp_int32 ub, kmp_int32 st) { 719 __kmp_dispatch_init_hierarchy<kmp_int32>( 720 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 721 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st); 722 } 723 template <> 724 inline void 725 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb, 726 kmp_uint32 ub, kmp_int32 st) { 727 __kmp_dispatch_init_hierarchy<kmp_uint32>( 728 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 729 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st); 730 } 731 template <> 732 inline void 733 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb, 734 kmp_int64 ub, kmp_int64 st) { 735 __kmp_dispatch_init_hierarchy<kmp_int64>( 736 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 737 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st); 738 } 739 template <> 740 inline void 741 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb, 742 kmp_uint64 ub, kmp_int64 st) { 743 __kmp_dispatch_init_hierarchy<kmp_uint64>( 744 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 745 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st); 746 } 747 748 // free all the hierarchy scheduling memory associated with the team 749 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) { 750 int num_disp_buff = team->t.t_max_nproc > 1 ? 
__kmp_dispatch_num_buffers : 2; 751 for (int i = 0; i < num_disp_buff; ++i) { 752 // type does not matter here so use kmp_int32 753 auto sh = 754 reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>( 755 &team->t.t_disp_buffer[i]); 756 if (sh->hier) { 757 sh->hier->deallocate(); 758 __kmp_free(sh->hier); 759 } 760 } 761 } 762 #endif 763 764 // UT - unsigned flavor of T, ST - signed flavor of T, 765 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8 766 template <typename T> 767 static void 768 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb, 769 T ub, typename traits_t<T>::signed_t st, 770 typename traits_t<T>::signed_t chunk, int push_ws) { 771 typedef typename traits_t<T>::unsigned_t UT; 772 773 int active; 774 kmp_info_t *th; 775 kmp_team_t *team; 776 kmp_uint32 my_buffer_index; 777 dispatch_private_info_template<T> *pr; 778 dispatch_shared_info_template<T> volatile *sh; 779 780 KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) == 781 sizeof(dispatch_private_info)); 782 KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) == 783 sizeof(dispatch_shared_info)); 784 __kmp_assert_valid_gtid(gtid); 785 786 if (!TCR_4(__kmp_init_parallel)) 787 __kmp_parallel_initialize(); 788 789 __kmp_resume_if_soft_paused(); 790 791 #if INCLUDE_SSC_MARKS 792 SSC_MARK_DISPATCH_INIT(); 793 #endif 794 #ifdef KMP_DEBUG 795 typedef typename traits_t<T>::signed_t ST; 796 { 797 char *buff; 798 // create format specifiers before the debug output 799 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d " 800 "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n", 801 traits_t<ST>::spec, traits_t<T>::spec, 802 traits_t<T>::spec, traits_t<ST>::spec); 803 KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st)); 804 __kmp_str_free(&buff); 805 } 806 #endif 807 /* setup data */ 808 th = __kmp_threads[gtid]; 809 team = th->th.th_team; 810 active = !team->t.t_serialized; 811 th->th.th_ident = loc; 812 813 // Any half-decent optimizer will remove this test when the blocks are empty 814 // since the macros expand to nothing 815 // when statistics are disabled. 816 if (schedule == __kmp_static) { 817 KMP_COUNT_BLOCK(OMP_LOOP_STATIC); 818 } else { 819 KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC); 820 } 821 822 #if KMP_USE_HIER_SCHED 823 // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable 824 // Hierarchical scheduling does not work with ordered, so if ordered is 825 // detected, then revert back to threaded scheduling. 826 bool ordered; 827 enum sched_type my_sched = schedule; 828 my_buffer_index = th->th.th_dispatch->th_disp_index; 829 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 830 &th->th.th_dispatch 831 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 832 my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched); 833 if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper)) 834 my_sched = 835 (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower)); 836 ordered = (kmp_ord_lower & my_sched); 837 if (pr->flags.use_hier) { 838 if (ordered) { 839 KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. 
" 840 "Disabling hierarchical scheduling.\n", 841 gtid)); 842 pr->flags.use_hier = FALSE; 843 } 844 } 845 if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) { 846 // Don't use hierarchical for ordered parallel loops and don't 847 // use the runtime hierarchy if one was specified in the program 848 if (!ordered && !pr->flags.use_hier) 849 __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st); 850 } 851 #endif // KMP_USE_HIER_SCHED 852 853 #if USE_ITT_BUILD 854 kmp_uint64 cur_chunk = chunk; 855 int itt_need_metadata_reporting = 856 __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 857 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && 858 team->t.t_active_level == 1; 859 #endif 860 if (!active) { 861 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 862 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 863 } else { 864 KMP_DEBUG_ASSERT(th->th.th_dispatch == 865 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 866 867 my_buffer_index = th->th.th_dispatch->th_disp_index++; 868 869 /* What happens when number of threads changes, need to resize buffer? */ 870 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 871 &th->th.th_dispatch 872 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 873 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 874 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 875 KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid, 876 my_buffer_index)); 877 } 878 879 __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st, 880 #if USE_ITT_BUILD 881 &cur_chunk, 882 #endif 883 chunk, (T)th->th.th_team_nproc, 884 (T)th->th.th_info.ds.ds_tid); 885 if (active) { 886 if (pr->flags.ordered == 0) { 887 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error; 888 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error; 889 } else { 890 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>; 891 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>; 892 } 893 } 894 895 if (active) { 896 /* The name of this buffer should be my_buffer_index when it's free to use 897 * it */ 898 899 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d " 900 "sh->buffer_index:%d\n", 901 gtid, my_buffer_index, sh->buffer_index)); 902 __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index, 903 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL)); 904 // Note: KMP_WAIT() cannot be used there: buffer index and 905 // my_buffer_index are *always* 32-bit integers. 906 KMP_MB(); /* is this necessary? 
*/
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));

    th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
    th->th.th_dispatch->th_dispatch_sh_current =
        CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
#if USE_ITT_BUILD
    if (pr->flags.ordered) {
      __kmp_itt_ordered_init(gtid);
    }
    // Report loop metadata
    if (itt_need_metadata_reporting) {
      // Only report metadata by master of active team at level 1
      kmp_uint64 schedtype = 0;
      switch (schedule) {
      case kmp_sch_static_chunked:
      case kmp_sch_static_balanced: // Chunk is calculated in the switch above
        break;
      case kmp_sch_static_greedy:
        cur_chunk = pr->u.p.parm1;
        break;
      case kmp_sch_dynamic_chunked:
        schedtype = 1;
        break;
      case kmp_sch_guided_iterative_chunked:
      case kmp_sch_guided_analytical_chunked:
      case kmp_sch_guided_simd:
        schedtype = 2;
        break;
      default:
        // Should we put this case under "static"?
        // case kmp_sch_static_steal:
        schedtype = 3;
        break;
      }
      __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
    }
#if KMP_USE_HIER_SCHED
    if (pr->flags.use_hier) {
      pr->u.p.count = 0;
      pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
    }
#endif // KMP_USE_HIER_SCHED
#endif /* USE_ITT_BUILD */
  }

#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
        "lb:%%%s ub:%%%s"
        " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
        " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
        traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
        traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
        traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
        traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
                  pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
                  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
                  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
    __kmp_str_free(&buff);
  }
#endif
#if (KMP_STATIC_STEAL_ENABLED)
  // After a loop with some other schedule kind has executed, there is no
  // guarantee that all the parm3 variables contain the same value; and even
  // if they did, a problematic case remains (e.g., toggling between 0 and 1
  // instead of a program-lifetime increment). So a dedicated variable is
  // required; 'static_steal_counter' serves that purpose.
  if (pr->schedule == kmp_sch_static_steal) {
    // Other threads will inspect this variable when searching for a victim.
    // Incrementing it signals that, from this point on, other threads may
    // steal from this thread.
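    // For example (illustrative): in __kmp_dispatch_next_algorithm below, a
    // prospective thief keeps its own copy of this counter ("T id =
    // pr->u.p.static_steal_counter") and only treats a candidate buffer as a
    // victim when that buffer's static_steal_counter matches, i.e. when it
    // has been initialized for the same static_steal loop instance.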
984 volatile T *p = &pr->u.p.static_steal_counter; 985 *p = *p + 1; 986 } 987 #endif // ( KMP_STATIC_STEAL_ENABLED ) 988 989 #if OMPT_SUPPORT && OMPT_OPTIONAL 990 if (ompt_enabled.ompt_callback_work) { 991 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 992 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 993 ompt_callbacks.ompt_callback(ompt_callback_work)( 994 ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data), 995 &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid)); 996 } 997 #endif 998 KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic); 999 } 1000 1001 /* For ordered loops, either __kmp_dispatch_finish() should be called after 1002 * every iteration, or __kmp_dispatch_finish_chunk() should be called after 1003 * every chunk of iterations. If the ordered section(s) were not executed 1004 * for this iteration (or every iteration in this chunk), we need to set the 1005 * ordered iteration counters so that the next thread can proceed. */ 1006 template <typename UT> 1007 static void __kmp_dispatch_finish(int gtid, ident_t *loc) { 1008 typedef typename traits_t<UT>::signed_t ST; 1009 __kmp_assert_valid_gtid(gtid); 1010 kmp_info_t *th = __kmp_threads[gtid]; 1011 1012 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid)); 1013 if (!th->th.th_team->t.t_serialized) { 1014 1015 dispatch_private_info_template<UT> *pr = 1016 reinterpret_cast<dispatch_private_info_template<UT> *>( 1017 th->th.th_dispatch->th_dispatch_pr_current); 1018 dispatch_shared_info_template<UT> volatile *sh = 1019 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 1020 th->th.th_dispatch->th_dispatch_sh_current); 1021 KMP_DEBUG_ASSERT(pr); 1022 KMP_DEBUG_ASSERT(sh); 1023 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1024 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1025 1026 if (pr->ordered_bumped) { 1027 KD_TRACE( 1028 1000, 1029 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1030 gtid)); 1031 pr->ordered_bumped = 0; 1032 } else { 1033 UT lower = pr->u.p.ordered_lower; 1034 1035 #ifdef KMP_DEBUG 1036 { 1037 char *buff; 1038 // create format specifiers before the debug output 1039 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: " 1040 "ordered_iteration:%%%s lower:%%%s\n", 1041 traits_t<UT>::spec, traits_t<UT>::spec); 1042 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 1043 __kmp_str_free(&buff); 1044 } 1045 #endif 1046 1047 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower, 1048 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 1049 KMP_MB(); /* is this necessary? 
*/ 1050 #ifdef KMP_DEBUG 1051 { 1052 char *buff; 1053 // create format specifiers before the debug output 1054 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: " 1055 "ordered_iteration:%%%s lower:%%%s\n", 1056 traits_t<UT>::spec, traits_t<UT>::spec); 1057 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 1058 __kmp_str_free(&buff); 1059 } 1060 #endif 1061 1062 test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration); 1063 } // if 1064 } // if 1065 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid)); 1066 } 1067 1068 #ifdef KMP_GOMP_COMPAT 1069 1070 template <typename UT> 1071 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) { 1072 typedef typename traits_t<UT>::signed_t ST; 1073 __kmp_assert_valid_gtid(gtid); 1074 kmp_info_t *th = __kmp_threads[gtid]; 1075 1076 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid)); 1077 if (!th->th.th_team->t.t_serialized) { 1078 // int cid; 1079 dispatch_private_info_template<UT> *pr = 1080 reinterpret_cast<dispatch_private_info_template<UT> *>( 1081 th->th.th_dispatch->th_dispatch_pr_current); 1082 dispatch_shared_info_template<UT> volatile *sh = 1083 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 1084 th->th.th_dispatch->th_dispatch_sh_current); 1085 KMP_DEBUG_ASSERT(pr); 1086 KMP_DEBUG_ASSERT(sh); 1087 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1088 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1089 1090 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) { 1091 UT lower = pr->u.p.ordered_lower; 1092 UT upper = pr->u.p.ordered_upper; 1093 UT inc = upper - lower + 1; 1094 1095 if (pr->ordered_bumped == inc) { 1096 KD_TRACE( 1097 1000, 1098 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1099 gtid)); 1100 pr->ordered_bumped = 0; 1101 } else { 1102 inc -= pr->ordered_bumped; 1103 1104 #ifdef KMP_DEBUG 1105 { 1106 char *buff; 1107 // create format specifiers before the debug output 1108 buff = __kmp_str_format( 1109 "__kmp_dispatch_finish_chunk: T#%%d before wait: " 1110 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n", 1111 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec); 1112 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper)); 1113 __kmp_str_free(&buff); 1114 } 1115 #endif 1116 1117 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower, 1118 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 1119 1120 KMP_MB(); /* is this necessary? */ 1121 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting " 1122 "ordered_bumped to zero\n", 1123 gtid)); 1124 pr->ordered_bumped = 0; 1125 //!!!!! TODO check if the inc should be unsigned, or signed??? 
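      // Worked example (editorial illustration): suppose this thread's chunk
      // covers ordered iterations [lower, upper] = [4, 7], so inc = 4. If the
      // thread executed the ordered construct only for the first of them,
      // ordered_bumped == 1 and inc becomes 3; the wait above ensures
      // sh->u.s.ordered_iteration has reached lower (4), and the test_then_add
      // below bumps it by 3 on behalf of iterations 5..7 so that the thread
      // owning iteration 8 is not blocked.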
1126 #ifdef KMP_DEBUG 1127 { 1128 char *buff; 1129 // create format specifiers before the debug output 1130 buff = __kmp_str_format( 1131 "__kmp_dispatch_finish_chunk: T#%%d after wait: " 1132 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n", 1133 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec, 1134 traits_t<UT>::spec); 1135 KD_TRACE(1000, 1136 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper)); 1137 __kmp_str_free(&buff); 1138 } 1139 #endif 1140 1141 test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc); 1142 } 1143 // } 1144 } 1145 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid)); 1146 } 1147 1148 #endif /* KMP_GOMP_COMPAT */ 1149 1150 template <typename T> 1151 int __kmp_dispatch_next_algorithm(int gtid, 1152 dispatch_private_info_template<T> *pr, 1153 dispatch_shared_info_template<T> volatile *sh, 1154 kmp_int32 *p_last, T *p_lb, T *p_ub, 1155 typename traits_t<T>::signed_t *p_st, T nproc, 1156 T tid) { 1157 typedef typename traits_t<T>::unsigned_t UT; 1158 typedef typename traits_t<T>::signed_t ST; 1159 typedef typename traits_t<T>::floating_t DBL; 1160 int status = 0; 1161 kmp_int32 last = 0; 1162 T start; 1163 ST incr; 1164 UT limit, trip, init; 1165 kmp_info_t *th = __kmp_threads[gtid]; 1166 kmp_team_t *team = th->th.th_team; 1167 1168 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1169 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1170 KMP_DEBUG_ASSERT(pr); 1171 KMP_DEBUG_ASSERT(sh); 1172 KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc); 1173 #ifdef KMP_DEBUG 1174 { 1175 char *buff; 1176 // create format specifiers before the debug output 1177 buff = 1178 __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p " 1179 "sh:%%p nproc:%%%s tid:%%%s\n", 1180 traits_t<T>::spec, traits_t<T>::spec); 1181 KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid)); 1182 __kmp_str_free(&buff); 1183 } 1184 #endif 1185 1186 // zero trip count 1187 if (pr->u.p.tc == 0) { 1188 KD_TRACE(10, 1189 ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is " 1190 "zero status:%d\n", 1191 gtid, status)); 1192 return 0; 1193 } 1194 1195 switch (pr->schedule) { 1196 #if (KMP_STATIC_STEAL_ENABLED) 1197 case kmp_sch_static_steal: { 1198 T chunk = pr->u.p.parm1; 1199 1200 KD_TRACE(100, 1201 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n", 1202 gtid)); 1203 1204 trip = pr->u.p.tc - 1; 1205 1206 if (traits_t<T>::type_size > 4) { 1207 // use lock for 8-byte and CAS for 4-byte induction 1208 // variable. 
TODO (optional): check and use 16-byte CAS 1209 kmp_lock_t *lck = pr->u.p.th_steal_lock; 1210 KMP_DEBUG_ASSERT(lck != NULL); 1211 if (pr->u.p.count < (UT)pr->u.p.ub) { 1212 __kmp_acquire_lock(lck, gtid); 1213 // try to get own chunk of iterations 1214 init = (pr->u.p.count)++; 1215 status = (init < (UT)pr->u.p.ub); 1216 __kmp_release_lock(lck, gtid); 1217 } else { 1218 status = 0; // no own chunks 1219 } 1220 if (!status) { // try to steal 1221 kmp_info_t **other_threads = team->t.t_threads; 1222 int while_limit = pr->u.p.parm3; 1223 int while_index = 0; 1224 T id = pr->u.p.static_steal_counter; // loop id 1225 int idx = (th->th.th_dispatch->th_disp_index - 1) % 1226 __kmp_dispatch_num_buffers; // current loop index 1227 // note: victim thread can potentially execute another loop 1228 // TODO: algorithm of searching for a victim 1229 // should be cleaned up and measured 1230 while ((!status) && (while_limit != ++while_index)) { 1231 dispatch_private_info_template<T> *victim; 1232 T remaining; 1233 T victimIdx = pr->u.p.parm4; 1234 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1; 1235 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1236 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); 1237 KMP_DEBUG_ASSERT(victim); 1238 while ((victim == pr || id != victim->u.p.static_steal_counter) && 1239 oldVictimIdx != victimIdx) { 1240 victimIdx = (victimIdx + 1) % nproc; 1241 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1242 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); 1243 KMP_DEBUG_ASSERT(victim); 1244 } 1245 if (victim == pr || id != victim->u.p.static_steal_counter) { 1246 continue; // try once more (nproc attempts in total) 1247 // no victim is ready yet to participate in stealing 1248 // because no victim passed kmp_init_dispatch yet 1249 } 1250 if (victim->u.p.count + 2 > (UT)victim->u.p.ub) { 1251 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid 1252 continue; // not enough chunks to steal, goto next victim 1253 } 1254 1255 lck = victim->u.p.th_steal_lock; 1256 KMP_ASSERT(lck != NULL); 1257 __kmp_acquire_lock(lck, gtid); 1258 limit = victim->u.p.ub; // keep initial ub 1259 if (victim->u.p.count >= limit || 1260 (remaining = limit - victim->u.p.count) < 2) { 1261 __kmp_release_lock(lck, gtid); 1262 pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim 1263 continue; // not enough chunks to steal 1264 } 1265 // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or 1266 // by 1 1267 if (remaining > 3) { 1268 // steal 1/4 of remaining 1269 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2); 1270 init = (victim->u.p.ub -= (remaining >> 2)); 1271 } else { 1272 // steal 1 chunk of 2 or 3 remaining 1273 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1); 1274 init = (victim->u.p.ub -= 1); 1275 } 1276 __kmp_release_lock(lck, gtid); 1277 1278 KMP_DEBUG_ASSERT(init + 1 <= limit); 1279 pr->u.p.parm4 = victimIdx; // remember victim to steal from 1280 status = 1; 1281 while_index = 0; 1282 // now update own count and ub with stolen range but init chunk 1283 __kmp_acquire_lock(pr->u.p.th_steal_lock, gtid); 1284 pr->u.p.count = init + 1; 1285 pr->u.p.ub = limit; 1286 __kmp_release_lock(pr->u.p.th_steal_lock, gtid); 1287 } // while (search for victim) 1288 } // if (try to find victim and steal) 1289 } else { 1290 // 4-byte induction variable, use 8-byte CAS for pair (count, ub) 1291 typedef union { 1292 struct { 1293 UT count; 1294 T ub; 1295 } p; 1296 kmp_int64 b; 1297 } union_i4; 1298 // 
All operations on 'count' or 'ub' must be combined atomically 1299 // together. 1300 { 1301 union_i4 vold, vnew; 1302 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 1303 vnew = vold; 1304 vnew.p.count++; 1305 while (!KMP_COMPARE_AND_STORE_ACQ64( 1306 (volatile kmp_int64 *)&pr->u.p.count, 1307 *VOLATILE_CAST(kmp_int64 *) & vold.b, 1308 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 1309 KMP_CPU_PAUSE(); 1310 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 1311 vnew = vold; 1312 vnew.p.count++; 1313 } 1314 vnew = vold; 1315 init = vnew.p.count; 1316 status = (init < (UT)vnew.p.ub); 1317 } 1318 1319 if (!status) { 1320 kmp_info_t **other_threads = team->t.t_threads; 1321 int while_limit = pr->u.p.parm3; 1322 int while_index = 0; 1323 T id = pr->u.p.static_steal_counter; // loop id 1324 int idx = (th->th.th_dispatch->th_disp_index - 1) % 1325 __kmp_dispatch_num_buffers; // current loop index 1326 // note: victim thread can potentially execute another loop 1327 // TODO: algorithm of searching for a victim 1328 // should be cleaned up and measured 1329 while ((!status) && (while_limit != ++while_index)) { 1330 dispatch_private_info_template<T> *victim; 1331 union_i4 vold, vnew; 1332 kmp_int32 remaining; 1333 T victimIdx = pr->u.p.parm4; 1334 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1; 1335 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1336 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); 1337 KMP_DEBUG_ASSERT(victim); 1338 while ((victim == pr || id != victim->u.p.static_steal_counter) && 1339 oldVictimIdx != victimIdx) { 1340 victimIdx = (victimIdx + 1) % nproc; 1341 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1342 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); 1343 KMP_DEBUG_ASSERT(victim); 1344 } 1345 if (victim == pr || id != victim->u.p.static_steal_counter) { 1346 continue; // try once more (nproc attempts in total) 1347 // no victim is ready yet to participate in stealing 1348 // because no victim passed kmp_init_dispatch yet 1349 } 1350 pr->u.p.parm4 = victimIdx; // new victim found 1351 while (1) { // CAS loop if victim has enough chunks to steal 1352 vold.b = *(volatile kmp_int64 *)(&victim->u.p.count); 1353 vnew = vold; 1354 1355 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1356 if (vnew.p.count >= (UT)vnew.p.ub || 1357 (remaining = vnew.p.ub - vnew.p.count) < 2) { 1358 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id 1359 break; // not enough chunks to steal, goto next victim 1360 } 1361 if (remaining > 3) { 1362 vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining 1363 } else { 1364 vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining 1365 } 1366 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1367 // TODO: Should this be acquire or release? 
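            // Why a single 64-bit CAS (editorial note, restating the comment
            // above the union): the owner advances p.count while a thief
            // shrinks p.ub, and a chunk is valid only while count < ub.
            // Packing the pair into union_i4::b lets one compare-and-swap
            // validate and publish both fields together; if they were updated
            // separately, an owner claiming chunk 'count' could race with a
            // thief lowering ub below it, and the same chunk could run twice.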
1368 if (KMP_COMPARE_AND_STORE_ACQ64( 1369 (volatile kmp_int64 *)&victim->u.p.count, 1370 *VOLATILE_CAST(kmp_int64 *) & vold.b, 1371 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 1372 // stealing succeeded 1373 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1374 vold.p.ub - vnew.p.ub); 1375 status = 1; 1376 while_index = 0; 1377 // now update own count and ub 1378 init = vnew.p.ub; 1379 vold.p.count = init + 1; 1380 #if KMP_ARCH_X86 1381 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b); 1382 #else 1383 *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b; 1384 #endif 1385 break; 1386 } // if (check CAS result) 1387 KMP_CPU_PAUSE(); // CAS failed, repeatedly attempt 1388 } // while (try to steal from particular victim) 1389 } // while (search for victim) 1390 } // if (try to find victim and steal) 1391 } // if (4-byte induction variable) 1392 if (!status) { 1393 *p_lb = 0; 1394 *p_ub = 0; 1395 if (p_st != NULL) 1396 *p_st = 0; 1397 } else { 1398 start = pr->u.p.parm2; 1399 init *= chunk; 1400 limit = chunk + init - 1; 1401 incr = pr->u.p.st; 1402 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1); 1403 1404 KMP_DEBUG_ASSERT(init <= trip); 1405 if ((last = (limit >= trip)) != 0) 1406 limit = trip; 1407 if (p_st != NULL) 1408 *p_st = incr; 1409 1410 if (incr == 1) { 1411 *p_lb = start + init; 1412 *p_ub = start + limit; 1413 } else { 1414 *p_lb = start + init * incr; 1415 *p_ub = start + limit * incr; 1416 } 1417 1418 if (pr->flags.ordered) { 1419 pr->u.p.ordered_lower = init; 1420 pr->u.p.ordered_upper = limit; 1421 } // if 1422 } // if 1423 break; 1424 } // case 1425 #endif // ( KMP_STATIC_STEAL_ENABLED ) 1426 case kmp_sch_static_balanced: { 1427 KD_TRACE( 1428 10, 1429 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n", 1430 gtid)); 1431 /* check if thread has any iteration to do */ 1432 if ((status = !pr->u.p.count) != 0) { 1433 pr->u.p.count = 1; 1434 *p_lb = pr->u.p.lb; 1435 *p_ub = pr->u.p.ub; 1436 last = pr->u.p.parm1; 1437 if (p_st != NULL) 1438 *p_st = pr->u.p.st; 1439 } else { /* no iterations to do */ 1440 pr->u.p.lb = pr->u.p.ub + pr->u.p.st; 1441 } 1442 } // case 1443 break; 1444 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was 1445 merged here */ 1446 case kmp_sch_static_chunked: { 1447 T parm1; 1448 1449 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d " 1450 "kmp_sch_static_[affinity|chunked] case\n", 1451 gtid)); 1452 parm1 = pr->u.p.parm1; 1453 1454 trip = pr->u.p.tc - 1; 1455 init = parm1 * (pr->u.p.count + tid); 1456 1457 if ((status = (init <= trip)) != 0) { 1458 start = pr->u.p.lb; 1459 incr = pr->u.p.st; 1460 limit = parm1 + init - 1; 1461 1462 if ((last = (limit >= trip)) != 0) 1463 limit = trip; 1464 1465 if (p_st != NULL) 1466 *p_st = incr; 1467 1468 pr->u.p.count += nproc; 1469 1470 if (incr == 1) { 1471 *p_lb = start + init; 1472 *p_ub = start + limit; 1473 } else { 1474 *p_lb = start + init * incr; 1475 *p_ub = start + limit * incr; 1476 } 1477 1478 if (pr->flags.ordered) { 1479 pr->u.p.ordered_lower = init; 1480 pr->u.p.ordered_upper = limit; 1481 } // if 1482 } // if 1483 } // case 1484 break; 1485 1486 case kmp_sch_dynamic_chunked: { 1487 T chunk = pr->u.p.parm1; 1488 1489 KD_TRACE( 1490 100, 1491 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n", 1492 gtid)); 1493 1494 init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 1495 trip = pr->u.p.tc - 1; 1496 1497 if ((status = (init <= trip)) == 0) { 1498 *p_lb = 0; 1499 *p_ub = 0; 1500 if (p_st != NULL) 1501 
*p_st = 0; 1502 } else { 1503 start = pr->u.p.lb; 1504 limit = chunk + init - 1; 1505 incr = pr->u.p.st; 1506 1507 if ((last = (limit >= trip)) != 0) 1508 limit = trip; 1509 1510 if (p_st != NULL) 1511 *p_st = incr; 1512 1513 if (incr == 1) { 1514 *p_lb = start + init; 1515 *p_ub = start + limit; 1516 } else { 1517 *p_lb = start + init * incr; 1518 *p_ub = start + limit * incr; 1519 } 1520 1521 if (pr->flags.ordered) { 1522 pr->u.p.ordered_lower = init; 1523 pr->u.p.ordered_upper = limit; 1524 } // if 1525 } // if 1526 } // case 1527 break; 1528 1529 case kmp_sch_guided_iterative_chunked: { 1530 T chunkspec = pr->u.p.parm1; 1531 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked " 1532 "iterative case\n", 1533 gtid)); 1534 trip = pr->u.p.tc; 1535 // Start atomic part of calculations 1536 while (1) { 1537 ST remaining; // signed, because can be < 0 1538 init = sh->u.s.iteration; // shared value 1539 remaining = trip - init; 1540 if (remaining <= 0) { // AC: need to compare with 0 first 1541 // nothing to do, don't try atomic op 1542 status = 0; 1543 break; 1544 } 1545 if ((T)remaining < 1546 pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default 1547 // use dynamic-style schedule 1548 // atomically increment iterations, get old value 1549 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1550 (ST)chunkspec); 1551 remaining = trip - init; 1552 if (remaining <= 0) { 1553 status = 0; // all iterations got by other threads 1554 } else { 1555 // got some iterations to work on 1556 status = 1; 1557 if ((T)remaining > chunkspec) { 1558 limit = init + chunkspec - 1; 1559 } else { 1560 last = 1; // the last chunk 1561 limit = init + remaining - 1; 1562 } // if 1563 } // if 1564 break; 1565 } // if 1566 limit = init + 1567 (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc 1568 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1569 (ST)init, (ST)limit)) { 1570 // CAS was successful, chunk obtained 1571 status = 1; 1572 --limit; 1573 break; 1574 } // if 1575 } // while 1576 if (status != 0) { 1577 start = pr->u.p.lb; 1578 incr = pr->u.p.st; 1579 if (p_st != NULL) 1580 *p_st = incr; 1581 *p_lb = start + init * incr; 1582 *p_ub = start + limit * incr; 1583 if (pr->flags.ordered) { 1584 pr->u.p.ordered_lower = init; 1585 pr->u.p.ordered_upper = limit; 1586 } // if 1587 } else { 1588 *p_lb = 0; 1589 *p_ub = 0; 1590 if (p_st != NULL) 1591 *p_st = 0; 1592 } // if 1593 } // case 1594 break; 1595 1596 case kmp_sch_guided_simd: { 1597 // same as iterative but curr-chunk adjusted to be multiple of given 1598 // chunk 1599 T chunk = pr->u.p.parm1; 1600 KD_TRACE(100, 1601 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n", 1602 gtid)); 1603 trip = pr->u.p.tc; 1604 // Start atomic part of calculations 1605 while (1) { 1606 ST remaining; // signed, because can be < 0 1607 init = sh->u.s.iteration; // shared value 1608 remaining = trip - init; 1609 if (remaining <= 0) { // AC: need to compare with 0 first 1610 status = 0; // nothing to do, don't try atomic op 1611 break; 1612 } 1613 KMP_DEBUG_ASSERT(init % chunk == 0); 1614 // compare with K*nproc*(chunk+1), K=2 by default 1615 if ((T)remaining < pr->u.p.parm2) { 1616 // use dynamic-style schedule 1617 // atomically increment iterations, get old value 1618 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1619 (ST)chunk); 1620 remaining = trip - init; 1621 if (remaining <= 0) { 1622 status = 0; // all iterations got by other threads 1623 } else { 1624 
// got some iterations to work on 1625 status = 1; 1626 if ((T)remaining > chunk) { 1627 limit = init + chunk - 1; 1628 } else { 1629 last = 1; // the last chunk 1630 limit = init + remaining - 1; 1631 } // if 1632 } // if 1633 break; 1634 } // if 1635 // divide by K*nproc 1636 UT span = remaining * (*(double *)&pr->u.p.parm3); 1637 UT rem = span % chunk; 1638 if (rem) // adjust so that span%chunk == 0 1639 span += chunk - rem; 1640 limit = init + span; 1641 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1642 (ST)init, (ST)limit)) { 1643 // CAS was successful, chunk obtained 1644 status = 1; 1645 --limit; 1646 break; 1647 } // if 1648 } // while 1649 if (status != 0) { 1650 start = pr->u.p.lb; 1651 incr = pr->u.p.st; 1652 if (p_st != NULL) 1653 *p_st = incr; 1654 *p_lb = start + init * incr; 1655 *p_ub = start + limit * incr; 1656 if (pr->flags.ordered) { 1657 pr->u.p.ordered_lower = init; 1658 pr->u.p.ordered_upper = limit; 1659 } // if 1660 } else { 1661 *p_lb = 0; 1662 *p_ub = 0; 1663 if (p_st != NULL) 1664 *p_st = 0; 1665 } // if 1666 } // case 1667 break; 1668 1669 case kmp_sch_guided_analytical_chunked: { 1670 T chunkspec = pr->u.p.parm1; 1671 UT chunkIdx; 1672 #if KMP_USE_X87CONTROL 1673 /* for storing original FPCW value for Windows* OS on 1674 IA-32 architecture 8-byte version */ 1675 unsigned int oldFpcw; 1676 unsigned int fpcwSet = 0; 1677 #endif 1678 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d " 1679 "kmp_sch_guided_analytical_chunked case\n", 1680 gtid)); 1681 1682 trip = pr->u.p.tc; 1683 1684 KMP_DEBUG_ASSERT(nproc > 1); 1685 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip); 1686 1687 while (1) { /* this while loop is a safeguard against unexpected zero 1688 chunk sizes */ 1689 chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 1690 if (chunkIdx >= (UT)pr->u.p.parm2) { 1691 --trip; 1692 /* use dynamic-style scheduling */ 1693 init = chunkIdx * chunkspec + pr->u.p.count; 1694 /* need to verify init > 0 in case of overflow in the above 1695 * calculation */ 1696 if ((status = (init > 0 && init <= trip)) != 0) { 1697 limit = init + chunkspec - 1; 1698 1699 if ((last = (limit >= trip)) != 0) 1700 limit = trip; 1701 } 1702 break; 1703 } else { 1704 /* use exponential-style scheduling */ 1705 /* The following check is to workaround the lack of long double precision on 1706 Windows* OS. 1707 This check works around the possible effect that init != 0 for chunkIdx == 0. 
1708 */ 1709 #if KMP_USE_X87CONTROL 1710 /* If we haven't already done so, save original 1711 FPCW and set precision to 64-bit, as Windows* OS 1712 on IA-32 architecture defaults to 53-bit */ 1713 if (!fpcwSet) { 1714 oldFpcw = _control87(0, 0); 1715 _control87(_PC_64, _MCW_PC); 1716 fpcwSet = 0x30000; 1717 } 1718 #endif 1719 if (chunkIdx) { 1720 init = __kmp_dispatch_guided_remaining<T>( 1721 trip, *(DBL *)&pr->u.p.parm3, chunkIdx); 1722 KMP_DEBUG_ASSERT(init); 1723 init = trip - init; 1724 } else 1725 init = 0; 1726 limit = trip - __kmp_dispatch_guided_remaining<T>( 1727 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1); 1728 KMP_ASSERT(init <= limit); 1729 if (init < limit) { 1730 KMP_DEBUG_ASSERT(limit <= trip); 1731 --limit; 1732 status = 1; 1733 break; 1734 } // if 1735 } // if 1736 } // while (1) 1737 #if KMP_USE_X87CONTROL 1738 /* restore FPCW if necessary 1739 AC: check fpcwSet flag first because oldFpcw can be uninitialized here 1740 */ 1741 if (fpcwSet && (oldFpcw & fpcwSet)) 1742 _control87(oldFpcw, _MCW_PC); 1743 #endif 1744 if (status != 0) { 1745 start = pr->u.p.lb; 1746 incr = pr->u.p.st; 1747 if (p_st != NULL) 1748 *p_st = incr; 1749 *p_lb = start + init * incr; 1750 *p_ub = start + limit * incr; 1751 if (pr->flags.ordered) { 1752 pr->u.p.ordered_lower = init; 1753 pr->u.p.ordered_upper = limit; 1754 } 1755 } else { 1756 *p_lb = 0; 1757 *p_ub = 0; 1758 if (p_st != NULL) 1759 *p_st = 0; 1760 } 1761 } // case 1762 break; 1763 1764 case kmp_sch_trapezoidal: { 1765 UT index; 1766 T parm2 = pr->u.p.parm2; 1767 T parm3 = pr->u.p.parm3; 1768 T parm4 = pr->u.p.parm4; 1769 KD_TRACE(100, 1770 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n", 1771 gtid)); 1772 1773 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration); 1774 1775 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2; 1776 trip = pr->u.p.tc - 1; 1777 1778 if ((status = ((T)index < parm3 && init <= trip)) == 0) { 1779 *p_lb = 0; 1780 *p_ub = 0; 1781 if (p_st != NULL) 1782 *p_st = 0; 1783 } else { 1784 start = pr->u.p.lb; 1785 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1; 1786 incr = pr->u.p.st; 1787 1788 if ((last = (limit >= trip)) != 0) 1789 limit = trip; 1790 1791 if (p_st != NULL) 1792 *p_st = incr; 1793 1794 if (incr == 1) { 1795 *p_lb = start + init; 1796 *p_ub = start + limit; 1797 } else { 1798 *p_lb = start + init * incr; 1799 *p_ub = start + limit * incr; 1800 } 1801 1802 if (pr->flags.ordered) { 1803 pr->u.p.ordered_lower = init; 1804 pr->u.p.ordered_upper = limit; 1805 } // if 1806 } // if 1807 } // case 1808 break; 1809 default: { 1810 status = 0; // to avoid complaints on uninitialized variable use 1811 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 1812 KMP_HNT(GetNewerLibrary), // Hint 1813 __kmp_msg_null // Variadic argument list terminator 1814 ); 1815 } break; 1816 } // switch 1817 if (p_last) 1818 *p_last = last; 1819 #ifdef KMP_DEBUG 1820 if (pr->flags.ordered) { 1821 char *buff; 1822 // create format specifiers before the debug output 1823 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d " 1824 "ordered_lower:%%%s ordered_upper:%%%s\n", 1825 traits_t<UT>::spec, traits_t<UT>::spec); 1826 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper)); 1827 __kmp_str_free(&buff); 1828 } 1829 { 1830 char *buff; 1831 // create format specifiers before the debug output 1832 buff = __kmp_str_format( 1833 "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d " 1834 "p_lb:%%%s p_ub:%%%s p_st:%%%s\n", 1835 
        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
    __kmp_str_free(&buff);
  }
#endif
  return status;
}

/* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
   work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
   is not called. */
#if OMPT_SUPPORT && OMPT_OPTIONAL
#define OMPT_LOOP_END \
  if (status == 0) { \
    if (ompt_enabled.ompt_callback_work) { \
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
      ompt_callbacks.ompt_callback(ompt_callback_work)( \
          ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \
          &(task_info->task_data), 0, codeptr); \
    } \
  }
// TODO: implement count
#else
#define OMPT_LOOP_END // no-op
#endif

#if KMP_STATS_ENABLED
#define KMP_STATS_LOOP_END \
  { \
    kmp_int64 u, l, t, i; \
    l = (kmp_int64)(*p_lb); \
    u = (kmp_int64)(*p_ub); \
    i = (kmp_int64)(pr->u.p.st); \
    if (status == 0) { \
      t = 0; \
      KMP_POP_PARTITIONED_TIMER(); \
    } else if (i == 1) { \
      if (u >= l) \
        t = u - l + 1; \
      else \
        t = 0; \
    } else if (i < 0) { \
      if (l >= u) \
        t = (l - u) / (-i) + 1; \
      else \
        t = 0; \
    } else { \
      if (u >= l) \
        t = (u - l) / i + 1; \
      else \
        t = 0; \
    } \
    KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \
  }
#else
#define KMP_STATS_LOOP_END /* Nothing */
#endif

template <typename T>
static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
                               T *p_lb, T *p_ub,
                               typename traits_t<T>::signed_t *p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                               ,
                               void *codeptr
#endif
                               ) {

  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  // This is potentially slightly misleading: schedule(runtime) will appear
  // here even if the actual runtime schedule is static. (Which points out a
  // disadvantage of schedule(runtime): even when static scheduling is used,
  // it costs more than a compile-time choice to use static scheduling would.)
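  // Illustrative example (an assumption for documentation, not generated
  // code): a loop written as
  //   #pragma omp for schedule(runtime)
  // is lowered to the __kmpc_dispatch_init_*/__kmpc_dispatch_next_* entry
  // points defined later in this file, so it is accounted to this timer even
  // when OMP_SCHEDULE selects a static schedule at run time.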
1910 KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling); 1911 1912 int status; 1913 dispatch_private_info_template<T> *pr; 1914 __kmp_assert_valid_gtid(gtid); 1915 kmp_info_t *th = __kmp_threads[gtid]; 1916 kmp_team_t *team = th->th.th_team; 1917 1918 KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL 1919 KD_TRACE( 1920 1000, 1921 ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n", 1922 gtid, p_lb, p_ub, p_st, p_last)); 1923 1924 if (team->t.t_serialized) { 1925 /* NOTE: serialize this dispatch because we are not at the active level */ 1926 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 1927 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 1928 KMP_DEBUG_ASSERT(pr); 1929 1930 if ((status = (pr->u.p.tc != 0)) == 0) { 1931 *p_lb = 0; 1932 *p_ub = 0; 1933 // if ( p_last != NULL ) 1934 // *p_last = 0; 1935 if (p_st != NULL) 1936 *p_st = 0; 1937 if (__kmp_env_consistency_check) { 1938 if (pr->pushed_ws != ct_none) { 1939 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 1940 } 1941 } 1942 } else if (pr->flags.nomerge) { 1943 kmp_int32 last; 1944 T start; 1945 UT limit, trip, init; 1946 ST incr; 1947 T chunk = pr->u.p.parm1; 1948 1949 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", 1950 gtid)); 1951 1952 init = chunk * pr->u.p.count++; 1953 trip = pr->u.p.tc - 1; 1954 1955 if ((status = (init <= trip)) == 0) { 1956 *p_lb = 0; 1957 *p_ub = 0; 1958 // if ( p_last != NULL ) 1959 // *p_last = 0; 1960 if (p_st != NULL) 1961 *p_st = 0; 1962 if (__kmp_env_consistency_check) { 1963 if (pr->pushed_ws != ct_none) { 1964 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 1965 } 1966 } 1967 } else { 1968 start = pr->u.p.lb; 1969 limit = chunk + init - 1; 1970 incr = pr->u.p.st; 1971 1972 if ((last = (limit >= trip)) != 0) { 1973 limit = trip; 1974 #if KMP_OS_WINDOWS 1975 pr->u.p.last_upper = pr->u.p.ub; 1976 #endif /* KMP_OS_WINDOWS */ 1977 } 1978 if (p_last != NULL) 1979 *p_last = last; 1980 if (p_st != NULL) 1981 *p_st = incr; 1982 if (incr == 1) { 1983 *p_lb = start + init; 1984 *p_ub = start + limit; 1985 } else { 1986 *p_lb = start + init * incr; 1987 *p_ub = start + limit * incr; 1988 } 1989 1990 if (pr->flags.ordered) { 1991 pr->u.p.ordered_lower = init; 1992 pr->u.p.ordered_upper = limit; 1993 #ifdef KMP_DEBUG 1994 { 1995 char *buff; 1996 // create format specifiers before the debug output 1997 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 1998 "ordered_lower:%%%s ordered_upper:%%%s\n", 1999 traits_t<UT>::spec, traits_t<UT>::spec); 2000 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 2001 pr->u.p.ordered_upper)); 2002 __kmp_str_free(&buff); 2003 } 2004 #endif 2005 } // if 2006 } // if 2007 } else { 2008 pr->u.p.tc = 0; 2009 *p_lb = pr->u.p.lb; 2010 *p_ub = pr->u.p.ub; 2011 #if KMP_OS_WINDOWS 2012 pr->u.p.last_upper = *p_ub; 2013 #endif /* KMP_OS_WINDOWS */ 2014 if (p_last != NULL) 2015 *p_last = TRUE; 2016 if (p_st != NULL) 2017 *p_st = pr->u.p.st; 2018 } // if 2019 #ifdef KMP_DEBUG 2020 { 2021 char *buff; 2022 // create format specifiers before the debug output 2023 buff = __kmp_str_format( 2024 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " 2025 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", 2026 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 2027 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, 2028 (p_last ? 
*p_last : 0), status)); 2029 __kmp_str_free(&buff); 2030 } 2031 #endif 2032 #if INCLUDE_SSC_MARKS 2033 SSC_MARK_DISPATCH_NEXT(); 2034 #endif 2035 OMPT_LOOP_END; 2036 KMP_STATS_LOOP_END; 2037 return status; 2038 } else { 2039 kmp_int32 last = 0; 2040 dispatch_shared_info_template<T> volatile *sh; 2041 2042 KMP_DEBUG_ASSERT(th->th.th_dispatch == 2043 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 2044 2045 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 2046 th->th.th_dispatch->th_dispatch_pr_current); 2047 KMP_DEBUG_ASSERT(pr); 2048 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 2049 th->th.th_dispatch->th_dispatch_sh_current); 2050 KMP_DEBUG_ASSERT(sh); 2051 2052 #if KMP_USE_HIER_SCHED 2053 if (pr->flags.use_hier) 2054 status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st); 2055 else 2056 #endif // KMP_USE_HIER_SCHED 2057 status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub, 2058 p_st, th->th.th_team_nproc, 2059 th->th.th_info.ds.ds_tid); 2060 // status == 0: no more iterations to execute 2061 if (status == 0) { 2062 UT num_done; 2063 2064 num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done); 2065 #ifdef KMP_DEBUG 2066 { 2067 char *buff; 2068 // create format specifiers before the debug output 2069 buff = __kmp_str_format( 2070 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", 2071 traits_t<UT>::spec); 2072 KD_TRACE(10, (buff, gtid, sh->u.s.num_done)); 2073 __kmp_str_free(&buff); 2074 } 2075 #endif 2076 2077 #if KMP_USE_HIER_SCHED 2078 pr->flags.use_hier = FALSE; 2079 #endif 2080 if ((ST)num_done == th->th.th_team_nproc - 1) { 2081 #if (KMP_STATIC_STEAL_ENABLED) 2082 if (pr->schedule == kmp_sch_static_steal && 2083 traits_t<T>::type_size > 4) { 2084 int i; 2085 int idx = (th->th.th_dispatch->th_disp_index - 1) % 2086 __kmp_dispatch_num_buffers; // current loop index 2087 kmp_info_t **other_threads = team->t.t_threads; 2088 // loop complete, safe to destroy locks used for stealing 2089 for (i = 0; i < th->th.th_team_nproc; ++i) { 2090 dispatch_private_info_template<T> *buf = 2091 reinterpret_cast<dispatch_private_info_template<T> *>( 2092 &other_threads[i]->th.th_dispatch->th_disp_buffer[idx]); 2093 kmp_lock_t *lck = buf->u.p.th_steal_lock; 2094 KMP_ASSERT(lck != NULL); 2095 __kmp_destroy_lock(lck); 2096 __kmp_free(lck); 2097 buf->u.p.th_steal_lock = NULL; 2098 } 2099 } 2100 #endif 2101 /* NOTE: release this buffer to be reused */ 2102 2103 KMP_MB(); /* Flush all pending memory write invalidates. */ 2104 2105 sh->u.s.num_done = 0; 2106 sh->u.s.iteration = 0; 2107 2108 /* TODO replace with general release procedure? */ 2109 if (pr->flags.ordered) { 2110 sh->u.s.ordered_iteration = 0; 2111 } 2112 2113 KMP_MB(); /* Flush all pending memory write invalidates. */ 2114 2115 sh->buffer_index += __kmp_dispatch_num_buffers; 2116 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", 2117 gtid, sh->buffer_index)); 2118 2119 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 2120 2121 } // if 2122 if (__kmp_env_consistency_check) { 2123 if (pr->pushed_ws != ct_none) { 2124 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 2125 } 2126 } 2127 2128 th->th.th_dispatch->th_deo_fcn = NULL; 2129 th->th.th_dispatch->th_dxo_fcn = NULL; 2130 th->th.th_dispatch->th_dispatch_sh_current = NULL; 2131 th->th.th_dispatch->th_dispatch_pr_current = NULL; 2132 } // if (status == 0) 2133 #if KMP_OS_WINDOWS 2134 else if (last) { 2135 pr->u.p.last_upper = pr->u.p.ub; 2136 } 2137 #endif /* KMP_OS_WINDOWS */ 2138 if (p_last != NULL && status != 0) 2139 *p_last = last; 2140 } // if 2141 2142 #ifdef KMP_DEBUG 2143 { 2144 char *buff; 2145 // create format specifiers before the debug output 2146 buff = __kmp_str_format( 2147 "__kmp_dispatch_next: T#%%d normal case: " 2148 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n", 2149 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 2150 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, 2151 (p_last ? *p_last : 0), status)); 2152 __kmp_str_free(&buff); 2153 } 2154 #endif 2155 #if INCLUDE_SSC_MARKS 2156 SSC_MARK_DISPATCH_NEXT(); 2157 #endif 2158 OMPT_LOOP_END; 2159 KMP_STATS_LOOP_END; 2160 return status; 2161 } 2162 2163 template <typename T> 2164 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid, 2165 kmp_int32 *plastiter, T *plower, T *pupper, 2166 typename traits_t<T>::signed_t incr) { 2167 typedef typename traits_t<T>::unsigned_t UT; 2168 kmp_uint32 team_id; 2169 kmp_uint32 nteams; 2170 UT trip_count; 2171 kmp_team_t *team; 2172 kmp_info_t *th; 2173 2174 KMP_DEBUG_ASSERT(plastiter && plower && pupper); 2175 KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid)); 2176 #ifdef KMP_DEBUG 2177 typedef typename traits_t<T>::signed_t ST; 2178 { 2179 char *buff; 2180 // create format specifiers before the debug output 2181 buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d " 2182 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n", 2183 traits_t<T>::spec, traits_t<T>::spec, 2184 traits_t<ST>::spec, traits_t<T>::spec); 2185 KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr)); 2186 __kmp_str_free(&buff); 2187 } 2188 #endif 2189 2190 if (__kmp_env_consistency_check) { 2191 if (incr == 0) { 2192 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, 2193 loc); 2194 } 2195 if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) { 2196 // The loop is illegal. 
      // Some zero-trip loops are handled by the compiler, e.g.:
      // for(i=10;i<0;++i) // lower >= upper - run-time check
      // for(i=0;i>10;--i) // lower <= upper - run-time check
      // for(i=0;i>10;++i) // incr > 0 - compile-time check
      // for(i=10;i<0;--i) // incr < 0 - compile-time check
      // The compiler does not check the following illegal loops:
      // for(i=0;i<10;i+=incr) // where incr<0
      // for(i=10;i>0;i-=incr) // where incr<0
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
    }
  }
  __kmp_assert_valid_gtid(gtid);
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
  nteams = th->th.th_teams_size.nteams;
  team_id = team->t.t_master_tid;
  KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);

  // compute global trip count
  if (incr == 1) {
    trip_count = *pupper - *plower + 1;
  } else if (incr == -1) {
    trip_count = *plower - *pupper + 1;
  } else if (incr > 0) {
    // upper-lower can exceed the limit of signed type
    trip_count = (UT)(*pupper - *plower) / incr + 1;
  } else {
    trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
  }

  if (trip_count <= nteams) {
    KMP_DEBUG_ASSERT(
        __kmp_static == kmp_sch_static_greedy ||
        __kmp_static ==
            kmp_sch_static_balanced); // Unknown static scheduling type.
    // only some teams get a single iteration; others get nothing
    if (team_id < trip_count) {
      *pupper = *plower = *plower + team_id * incr;
    } else {
      *plower = *pupper + incr; // zero-trip loop
    }
    if (plastiter != NULL)
      *plastiter = (team_id == trip_count - 1);
  } else {
    if (__kmp_static == kmp_sch_static_balanced) {
      UT chunk = trip_count / nteams;
      UT extras = trip_count % nteams;
      *plower +=
          incr * (team_id * chunk + (team_id < extras ? team_id : extras));
      *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
      if (plastiter != NULL)
        *plastiter = (team_id == nteams - 1);
    } else {
      T chunk_inc_count =
          (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
      T upper = *pupper;
      KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
      // Unknown static scheduling type.
      *plower += team_id * chunk_inc_count;
      *pupper = *plower + chunk_inc_count - incr;
      // Check/correct bounds if needed
      if (incr > 0) {
        if (*pupper < *plower)
          *pupper = traits_t<T>::max_value;
        if (plastiter != NULL)
          *plastiter = *plower <= upper && *pupper > upper - incr;
        if (*pupper > upper)
          *pupper = upper; // tracker C73258
      } else {
        if (*pupper > *plower)
          *pupper = traits_t<T>::min_value;
        if (plastiter != NULL)
          *plastiter = *plower >= upper && *pupper < upper - incr;
        if (*pupper < upper)
          *pupper = upper; // tracker C73258
      }
    }
  }
}

//-----------------------------------------------------------------------------
// Dispatch routines
// Transfer call to template< type T >
// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
//                      T lb, T ub, ST st, ST chunk )
extern "C" {

/*!
@ingroup WORK_SHARING
@{
@param loc Source location
@param gtid Global thread id
@param schedule Schedule type
@param lb Lower bound
@param ub Upper bound
@param st Step (or increment if you prefer)
@param chunk The chunk size to block with

This function prepares the runtime to start a dynamically scheduled for loop,
saving the loop arguments.
These functions are all identical apart from the types of the arguments.
*/

void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int32 lb,
                            kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint32 lb,
                             kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int64 lb,
                            kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint64 lb,
                             kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4

These functions differ from the __kmpc_dispatch_init set in that they are
called for the composite distribute parallel for construct. Thus, before
dispatching the regular iterations, the per-team iteration space has to be
calculated.

These functions are all identical apart from the types of the arguments.
2357 */ 2358 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2359 enum sched_type schedule, kmp_int32 *p_last, 2360 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, 2361 kmp_int32 chunk) { 2362 KMP_DEBUG_ASSERT(__kmp_init_serial); 2363 #if OMPT_SUPPORT && OMPT_OPTIONAL 2364 OMPT_STORE_RETURN_ADDRESS(gtid); 2365 #endif 2366 __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st); 2367 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2368 } 2369 2370 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2371 enum sched_type schedule, kmp_int32 *p_last, 2372 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, 2373 kmp_int32 chunk) { 2374 KMP_DEBUG_ASSERT(__kmp_init_serial); 2375 #if OMPT_SUPPORT && OMPT_OPTIONAL 2376 OMPT_STORE_RETURN_ADDRESS(gtid); 2377 #endif 2378 __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st); 2379 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2380 } 2381 2382 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2383 enum sched_type schedule, kmp_int32 *p_last, 2384 kmp_int64 lb, kmp_int64 ub, kmp_int64 st, 2385 kmp_int64 chunk) { 2386 KMP_DEBUG_ASSERT(__kmp_init_serial); 2387 #if OMPT_SUPPORT && OMPT_OPTIONAL 2388 OMPT_STORE_RETURN_ADDRESS(gtid); 2389 #endif 2390 __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st); 2391 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2392 } 2393 2394 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2395 enum sched_type schedule, kmp_int32 *p_last, 2396 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, 2397 kmp_int64 chunk) { 2398 KMP_DEBUG_ASSERT(__kmp_init_serial); 2399 #if OMPT_SUPPORT && OMPT_OPTIONAL 2400 OMPT_STORE_RETURN_ADDRESS(gtid); 2401 #endif 2402 __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st); 2403 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2404 } 2405 2406 /*! 2407 @param loc Source code location 2408 @param gtid Global thread id 2409 @param p_last Pointer to a flag set to one if this is the last chunk or zero 2410 otherwise 2411 @param p_lb Pointer to the lower bound for the next chunk of work 2412 @param p_ub Pointer to the upper bound for the next chunk of work 2413 @param p_st Pointer to the stride for the next chunk of work 2414 @return one if there is work to be done, zero otherwise 2415 2416 Get the next dynamically allocated chunk of work for this thread. 2417 If there is no more work, then the lb,ub and stride need not be modified. 2418 */ 2419 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2420 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) { 2421 #if OMPT_SUPPORT && OMPT_OPTIONAL 2422 OMPT_STORE_RETURN_ADDRESS(gtid); 2423 #endif 2424 return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st 2425 #if OMPT_SUPPORT && OMPT_OPTIONAL 2426 , 2427 OMPT_LOAD_RETURN_ADDRESS(gtid) 2428 #endif 2429 ); 2430 } 2431 2432 /*! 2433 See @ref __kmpc_dispatch_next_4 2434 */ 2435 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2436 kmp_uint32 *p_lb, kmp_uint32 *p_ub, 2437 kmp_int32 *p_st) { 2438 #if OMPT_SUPPORT && OMPT_OPTIONAL 2439 OMPT_STORE_RETURN_ADDRESS(gtid); 2440 #endif 2441 return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st 2442 #if OMPT_SUPPORT && OMPT_OPTIONAL 2443 , 2444 OMPT_LOAD_RETURN_ADDRESS(gtid) 2445 #endif 2446 ); 2447 } 2448 2449 /*! 
2450 See @ref __kmpc_dispatch_next_4 2451 */ 2452 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2453 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) { 2454 #if OMPT_SUPPORT && OMPT_OPTIONAL 2455 OMPT_STORE_RETURN_ADDRESS(gtid); 2456 #endif 2457 return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st 2458 #if OMPT_SUPPORT && OMPT_OPTIONAL 2459 , 2460 OMPT_LOAD_RETURN_ADDRESS(gtid) 2461 #endif 2462 ); 2463 } 2464 2465 /*! 2466 See @ref __kmpc_dispatch_next_4 2467 */ 2468 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2469 kmp_uint64 *p_lb, kmp_uint64 *p_ub, 2470 kmp_int64 *p_st) { 2471 #if OMPT_SUPPORT && OMPT_OPTIONAL 2472 OMPT_STORE_RETURN_ADDRESS(gtid); 2473 #endif 2474 return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st 2475 #if OMPT_SUPPORT && OMPT_OPTIONAL 2476 , 2477 OMPT_LOAD_RETURN_ADDRESS(gtid) 2478 #endif 2479 ); 2480 } 2481 2482 /*! 2483 @param loc Source code location 2484 @param gtid Global thread id 2485 2486 Mark the end of a dynamic loop. 2487 */ 2488 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) { 2489 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2490 } 2491 2492 /*! 2493 See @ref __kmpc_dispatch_fini_4 2494 */ 2495 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) { 2496 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2497 } 2498 2499 /*! 2500 See @ref __kmpc_dispatch_fini_4 2501 */ 2502 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) { 2503 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2504 } 2505 2506 /*! 2507 See @ref __kmpc_dispatch_fini_4 2508 */ 2509 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) { 2510 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2511 } 2512 /*! @} */ 2513 2514 //----------------------------------------------------------------------------- 2515 // Non-template routines from kmp_dispatch.cpp used in other sources 2516 2517 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) { 2518 return value == checker; 2519 } 2520 2521 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) { 2522 return value != checker; 2523 } 2524 2525 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) { 2526 return value < checker; 2527 } 2528 2529 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) { 2530 return value >= checker; 2531 } 2532 2533 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) { 2534 return value <= checker; 2535 } 2536 2537 kmp_uint32 2538 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker, 2539 kmp_uint32 (*pred)(kmp_uint32, kmp_uint32), 2540 void *obj // Higher-level synchronization object, or NULL. 2541 ) { 2542 // note: we may not belong to a team at this point 2543 volatile kmp_uint32 *spin = spinner; 2544 kmp_uint32 check = checker; 2545 kmp_uint32 spins; 2546 kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred; 2547 kmp_uint32 r; 2548 2549 KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin)); 2550 KMP_INIT_YIELD(spins); 2551 // main wait spin loop 2552 while (!f(r = TCR_4(*spin), check)) { 2553 KMP_FSYNC_SPIN_PREPARE(obj); 2554 /* GEH - remove this since it was accidentally introduced when kmp_wait was 2555 split. 
       It causes problems with infinite recursion because of the exit lock */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
        __kmp_abort_thread(); */
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}

void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
                      kmp_uint32 (*pred)(void *, kmp_uint32),
                      void *obj // Higher-level synchronization object, or NULL.
                      ) {
  // note: we may not belong to a team at this point
  void *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(void *, kmp_uint32) = pred;

  KMP_FSYNC_SPIN_INIT(obj, spin);
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* if we have waited a bit, or are oversubscribed, yield */
    /* pause is in the following code */
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
}

} // extern "C"

#ifdef KMP_GOMP_COMPAT

void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int32 lb,
                               kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint32 lb,
                                kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int64 lb,
                               kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint64 lb,
                                kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

#endif /* KMP_GOMP_COMPAT */

/* ------------------------------------------------------------------------ */
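
/* Usage sketch (illustrative only; not part of the runtime and not verbatim
   compiler output): roughly how the dispatch entry points above are driven
   for a loop such as

     #pragma omp for schedule(dynamic, 4)
     for (int i = 0; i < n; ++i)
       body(i);

   assuming hypothetical locals lb, ub, st, last, the caller's global thread
   id gtid, and a suitable ident_t *loc:

     kmp_int32 lb = 0, ub = n - 1, st = 1, last = 0;
     __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, lb, ub, st, 4);
     while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
       for (kmp_int32 i = lb; i <= ub; i += st)
         body(i);
     }

   Each successful __kmpc_dispatch_next_4 call yields one chunk as an
   inclusive [lb, ub] range; a return value of zero means no work remains. */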