1 /* 2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch. 3 */ 4 5 //===----------------------------------------------------------------------===// 6 // 7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 8 // See https://llvm.org/LICENSE.txt for license information. 9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 10 // 11 //===----------------------------------------------------------------------===// 12 13 /* Dynamic scheduling initialization and dispatch. 14 * 15 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however 16 * it may change values between parallel regions. __kmp_max_nth 17 * is the largest value __kmp_nth may take, 1 is the smallest. 18 */ 19 20 #include "kmp.h" 21 #include "kmp_error.h" 22 #include "kmp_i18n.h" 23 #include "kmp_itt.h" 24 #include "kmp_stats.h" 25 #include "kmp_str.h" 26 #if KMP_USE_X87CONTROL 27 #include <float.h> 28 #endif 29 #include "kmp_lock.h" 30 #include "kmp_dispatch.h" 31 #if KMP_USE_HIER_SCHED 32 #include "kmp_dispatch_hier.h" 33 #endif 34 35 #if OMPT_SUPPORT 36 #include "ompt-specific.h" 37 #endif 38 39 /* ------------------------------------------------------------------------ */ 40 /* ------------------------------------------------------------------------ */ 41 42 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 43 kmp_info_t *th; 44 45 KMP_DEBUG_ASSERT(gtid_ref); 46 47 if (__kmp_env_consistency_check) { 48 th = __kmp_threads[*gtid_ref]; 49 if (th->th.th_root->r.r_active && 50 (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) { 51 #if KMP_USE_DYNAMIC_LOCK 52 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0); 53 #else 54 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL); 55 #endif 56 } 57 } 58 } 59 60 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 61 kmp_info_t *th; 62 63 if (__kmp_env_consistency_check) { 64 th = __kmp_threads[*gtid_ref]; 65 if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) { 66 __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref); 67 } 68 } 69 } 70 71 // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC 72 static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule, 73 bool use_hier = false) { 74 // Pick up the nonmonotonic/monotonic bits from the scheduling type 75 // TODO: make nonmonotonic when static_steal is fixed 76 int monotonicity = SCHEDULE_MONOTONIC; 77 78 // Let default be monotonic for executables 79 // compiled with OpenMP* 4.5 or less compilers 80 if (loc->get_openmp_version() < 50) 81 monotonicity = SCHEDULE_MONOTONIC; 82 83 if (use_hier || __kmp_force_monotonic) 84 monotonicity = SCHEDULE_MONOTONIC; 85 else if (SCHEDULE_HAS_NONMONOTONIC(schedule)) 86 monotonicity = SCHEDULE_NONMONOTONIC; 87 else if (SCHEDULE_HAS_MONOTONIC(schedule)) 88 monotonicity = SCHEDULE_MONOTONIC; 89 90 return monotonicity; 91 } 92 93 // Initialize a dispatch_private_info_template<T> buffer for a particular 94 // type of schedule,chunk. The loop description is found in lb (lower bound), 95 // ub (upper bound), and st (stride). nproc is the number of threads relevant 96 // to the scheduling (often the number of threads in a team, but not always if 97 // hierarchical scheduling is used). tid is the id of the thread calling 98 // the function within the group of nproc threads. It will have a value 99 // between 0 and nproc - 1. 
This is often just the thread id within a team, but 100 // is not necessarily the case when using hierarchical scheduling. 101 // loc is the source file location of the corresponding loop 102 // gtid is the global thread id 103 template <typename T> 104 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, 105 dispatch_private_info_template<T> *pr, 106 enum sched_type schedule, T lb, T ub, 107 typename traits_t<T>::signed_t st, 108 #if USE_ITT_BUILD 109 kmp_uint64 *cur_chunk, 110 #endif 111 typename traits_t<T>::signed_t chunk, 112 T nproc, T tid) { 113 typedef typename traits_t<T>::unsigned_t UT; 114 typedef typename traits_t<T>::floating_t DBL; 115 116 int active; 117 T tc; 118 kmp_info_t *th; 119 kmp_team_t *team; 120 int monotonicity; 121 bool use_hier; 122 123 #ifdef KMP_DEBUG 124 typedef typename traits_t<T>::signed_t ST; 125 { 126 char *buff; 127 // create format specifiers before the debug output 128 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called " 129 "pr:%%p lb:%%%s ub:%%%s st:%%%s " 130 "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n", 131 traits_t<T>::spec, traits_t<T>::spec, 132 traits_t<ST>::spec, traits_t<ST>::spec, 133 traits_t<T>::spec, traits_t<T>::spec); 134 KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid)); 135 __kmp_str_free(&buff); 136 } 137 #endif 138 /* setup data */ 139 th = __kmp_threads[gtid]; 140 team = th->th.th_team; 141 active = !team->t.t_serialized; 142 143 #if USE_ITT_BUILD 144 int itt_need_metadata_reporting = 145 __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 146 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && 147 team->t.t_active_level == 1; 148 #endif 149 150 #if KMP_USE_HIER_SCHED 151 use_hier = pr->flags.use_hier; 152 #else 153 use_hier = false; 154 #endif 155 156 /* Pick up the nonmonotonic/monotonic bits from the scheduling type */ 157 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier); 158 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 159 160 /* Pick up the nomerge/ordered bits from the scheduling type */ 161 if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) { 162 pr->flags.nomerge = TRUE; 163 schedule = 164 (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); 165 } else { 166 pr->flags.nomerge = FALSE; 167 } 168 pr->type_size = traits_t<T>::type_size; // remember the size of variables 169 if (kmp_ord_lower & schedule) { 170 pr->flags.ordered = TRUE; 171 schedule = 172 (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); 173 } else { 174 pr->flags.ordered = FALSE; 175 } 176 // Ordered overrides nonmonotonic 177 if (pr->flags.ordered) { 178 monotonicity = SCHEDULE_MONOTONIC; 179 } 180 181 if (schedule == kmp_sch_static) { 182 schedule = __kmp_static; 183 } else { 184 if (schedule == kmp_sch_runtime) { 185 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if 186 // not specified) 187 schedule = team->t.t_sched.r_sched_type; 188 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier); 189 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 190 // Detail the schedule if needed (global controls are differentiated 191 // appropriately) 192 if (schedule == kmp_sch_guided_chunked) { 193 schedule = __kmp_guided; 194 } else if (schedule == kmp_sch_static) { 195 schedule = __kmp_static; 196 } 197 // Use the chunk size specified by OMP_SCHEDULE (or default if not 198 // specified) 199 chunk = team->t.t_sched.chunk; 200 #if USE_ITT_BUILD 201 if (cur_chunk) 202 *cur_chunk = chunk; 203 #endif 204 #ifdef 
KMP_DEBUG 205 { 206 char *buff; 207 // create format specifiers before the debug output 208 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: " 209 "schedule:%%d chunk:%%%s\n", 210 traits_t<ST>::spec); 211 KD_TRACE(10, (buff, gtid, schedule, chunk)); 212 __kmp_str_free(&buff); 213 } 214 #endif 215 } else { 216 if (schedule == kmp_sch_guided_chunked) { 217 schedule = __kmp_guided; 218 } 219 if (chunk <= 0) { 220 chunk = KMP_DEFAULT_CHUNK; 221 } 222 } 223 224 if (schedule == kmp_sch_auto) { 225 // mapping and differentiation: in the __kmp_do_serial_initialize() 226 schedule = __kmp_auto; 227 #ifdef KMP_DEBUG 228 { 229 char *buff; 230 // create format specifiers before the debug output 231 buff = __kmp_str_format( 232 "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: " 233 "schedule:%%d chunk:%%%s\n", 234 traits_t<ST>::spec); 235 KD_TRACE(10, (buff, gtid, schedule, chunk)); 236 __kmp_str_free(&buff); 237 } 238 #endif 239 } 240 #if KMP_STATIC_STEAL_ENABLED 241 // map nonmonotonic:dynamic to static steal 242 if (schedule == kmp_sch_dynamic_chunked) { 243 if (monotonicity == SCHEDULE_NONMONOTONIC) 244 schedule = kmp_sch_static_steal; 245 } 246 #endif 247 /* guided analytical not safe for too many threads */ 248 if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) { 249 schedule = kmp_sch_guided_iterative_chunked; 250 KMP_WARNING(DispatchManyThreads); 251 } 252 if (schedule == kmp_sch_runtime_simd) { 253 // compiler provides simd_width in the chunk parameter 254 schedule = team->t.t_sched.r_sched_type; 255 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier); 256 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 257 // Detail the schedule if needed (global controls are differentiated 258 // appropriately) 259 if (schedule == kmp_sch_static || schedule == kmp_sch_auto || 260 schedule == __kmp_static) { 261 schedule = kmp_sch_static_balanced_chunked; 262 } else { 263 if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) { 264 schedule = kmp_sch_guided_simd; 265 } 266 chunk = team->t.t_sched.chunk * chunk; 267 } 268 #if USE_ITT_BUILD 269 if (cur_chunk) 270 *cur_chunk = chunk; 271 #endif 272 #ifdef KMP_DEBUG 273 { 274 char *buff; 275 // create format specifiers before the debug output 276 buff = __kmp_str_format( 277 "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d" 278 " chunk:%%%s\n", 279 traits_t<ST>::spec); 280 KD_TRACE(10, (buff, gtid, schedule, chunk)); 281 __kmp_str_free(&buff); 282 } 283 #endif 284 } 285 pr->u.p.parm1 = chunk; 286 } 287 KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper), 288 "unknown scheduling type"); 289 290 pr->u.p.count = 0; 291 292 if (__kmp_env_consistency_check) { 293 if (st == 0) { 294 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, 295 (pr->flags.ordered ? 
ct_pdo_ordered : ct_pdo), loc); 296 } 297 } 298 // compute trip count 299 if (st == 1) { // most common case 300 if (ub >= lb) { 301 tc = ub - lb + 1; 302 } else { // ub < lb 303 tc = 0; // zero-trip 304 } 305 } else if (st < 0) { 306 if (lb >= ub) { 307 // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B), 308 // where the division needs to be unsigned regardless of the result type 309 tc = (UT)(lb - ub) / (-st) + 1; 310 } else { // lb < ub 311 tc = 0; // zero-trip 312 } 313 } else { // st > 0 314 if (ub >= lb) { 315 // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B), 316 // where the division needs to be unsigned regardless of the result type 317 tc = (UT)(ub - lb) / st + 1; 318 } else { // ub < lb 319 tc = 0; // zero-trip 320 } 321 } 322 323 #if KMP_STATS_ENABLED 324 if (KMP_MASTER_GTID(gtid)) { 325 KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc); 326 } 327 #endif 328 329 pr->u.p.lb = lb; 330 pr->u.p.ub = ub; 331 pr->u.p.st = st; 332 pr->u.p.tc = tc; 333 334 #if KMP_OS_WINDOWS 335 pr->u.p.last_upper = ub + st; 336 #endif /* KMP_OS_WINDOWS */ 337 338 /* NOTE: only the active parallel region(s) has active ordered sections */ 339 340 if (active) { 341 if (pr->flags.ordered) { 342 pr->ordered_bumped = 0; 343 pr->u.p.ordered_lower = 1; 344 pr->u.p.ordered_upper = 0; 345 } 346 } 347 348 switch (schedule) { 349 #if (KMP_STATIC_STEAL_ENABLED) 350 case kmp_sch_static_steal: { 351 T ntc, init; 352 353 KD_TRACE(100, 354 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n", 355 gtid)); 356 357 ntc = (tc % chunk ? 1 : 0) + tc / chunk; 358 if (nproc > 1 && ntc >= nproc) { 359 KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL); 360 T id = tid; 361 T small_chunk, extras; 362 363 small_chunk = ntc / nproc; 364 extras = ntc % nproc; 365 366 init = id * small_chunk + (id < extras ? id : extras); 367 pr->u.p.count = init; 368 pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0); 369 370 pr->u.p.parm2 = lb; 371 // parm3 is the number of times to attempt stealing which is 372 // proportional to the number of chunks per thread up until 373 // the maximum value of nproc. 374 pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc); 375 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid 376 pr->u.p.st = st; 377 if (traits_t<T>::type_size > 4) { 378 // AC: TODO: check if 16-byte CAS available and use it to 379 // improve performance (probably wait for explicit request 380 // before spending time on this). 381 // For now use dynamically allocated per-thread lock, 382 // free memory in __kmp_dispatch_next when status==0. 
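// A rough illustration of the partitioning above (made-up values, not from
// the source): with tc=100, chunk=10 and nproc=4 there are ntc=10 chunks,
// so small_chunk=2 and extras=2; the initial (count, ub) pairs per thread
// are (0,3), (3,6), (6,8), (8,10), i.e. the first 'extras' threads own one
// extra chunk, and parm3 = min(small_chunk + extras, nproc) = 4 bounds the
// number of steal attempts.
// For 8-byte induction variables the (count, ub) pair is 16 bytes, too wide
// for the single 8-byte CAS used by the 4-byte path in
// __kmp_dispatch_next_algorithm, hence the per-thread lock allocated here.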
383 KMP_DEBUG_ASSERT(pr->u.p.th_steal_lock == NULL); 384 pr->u.p.th_steal_lock = 385 (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t)); 386 __kmp_init_lock(pr->u.p.th_steal_lock); 387 } 388 break; 389 } else { 390 /* too few chunks: switching to kmp_sch_dynamic_chunked */ 391 schedule = kmp_sch_dynamic_chunked; 392 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to " 393 "kmp_sch_dynamic_chunked\n", 394 gtid)); 395 if (pr->u.p.parm1 <= 0) 396 pr->u.p.parm1 = KMP_DEFAULT_CHUNK; 397 break; 398 } // if 399 } // case 400 #endif 401 case kmp_sch_static_balanced: { 402 T init, limit; 403 404 KD_TRACE( 405 100, 406 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n", 407 gtid)); 408 409 if (nproc > 1) { 410 T id = tid; 411 412 if (tc < nproc) { 413 if (id < tc) { 414 init = id; 415 limit = id; 416 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ 417 } else { 418 pr->u.p.count = 1; /* means no more chunks to execute */ 419 pr->u.p.parm1 = FALSE; 420 break; 421 } 422 } else { 423 T small_chunk = tc / nproc; 424 T extras = tc % nproc; 425 init = id * small_chunk + (id < extras ? id : extras); 426 limit = init + small_chunk - (id < extras ? 0 : 1); 427 pr->u.p.parm1 = (id == nproc - 1); 428 } 429 } else { 430 if (tc > 0) { 431 init = 0; 432 limit = tc - 1; 433 pr->u.p.parm1 = TRUE; 434 } else { 435 // zero trip count 436 pr->u.p.count = 1; /* means no more chunks to execute */ 437 pr->u.p.parm1 = FALSE; 438 break; 439 } 440 } 441 #if USE_ITT_BUILD 442 // Calculate chunk for metadata report 443 if (itt_need_metadata_reporting) 444 if (cur_chunk) 445 *cur_chunk = limit - init + 1; 446 #endif 447 if (st == 1) { 448 pr->u.p.lb = lb + init; 449 pr->u.p.ub = lb + limit; 450 } else { 451 // calculated upper bound, "ub" is user-defined upper bound 452 T ub_tmp = lb + limit * st; 453 pr->u.p.lb = lb + init * st; 454 // adjust upper bound to "ub" if needed, so that MS lastprivate will match 455 // it exactly 456 if (st > 0) { 457 pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp); 458 } else { 459 pr->u.p.ub = (ub_tmp + st < ub ? 
ub : ub_tmp); 460 } 461 } 462 if (pr->flags.ordered) { 463 pr->u.p.ordered_lower = init; 464 pr->u.p.ordered_upper = limit; 465 } 466 break; 467 } // case 468 case kmp_sch_static_balanced_chunked: { 469 // similar to balanced, but chunk adjusted to multiple of simd width 470 T nth = nproc; 471 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)" 472 " -> falling-through to static_greedy\n", 473 gtid)); 474 schedule = kmp_sch_static_greedy; 475 if (nth > 1) 476 pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1); 477 else 478 pr->u.p.parm1 = tc; 479 break; 480 } // case 481 case kmp_sch_guided_simd: 482 case kmp_sch_guided_iterative_chunked: { 483 KD_TRACE( 484 100, 485 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked" 486 " case\n", 487 gtid)); 488 489 if (nproc > 1) { 490 if ((2L * chunk + 1) * nproc >= tc) { 491 /* chunk size too large, switch to dynamic */ 492 schedule = kmp_sch_dynamic_chunked; 493 } else { 494 // when remaining iters become less than parm2 - switch to dynamic 495 pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1); 496 *(double *)&pr->u.p.parm3 = 497 guided_flt_param / (double)nproc; // may occupy parm3 and parm4 498 } 499 } else { 500 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 501 "kmp_sch_static_greedy\n", 502 gtid)); 503 schedule = kmp_sch_static_greedy; 504 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 505 KD_TRACE( 506 100, 507 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n", 508 gtid)); 509 pr->u.p.parm1 = tc; 510 } // if 511 } // case 512 break; 513 case kmp_sch_guided_analytical_chunked: { 514 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d " 515 "kmp_sch_guided_analytical_chunked case\n", 516 gtid)); 517 518 if (nproc > 1) { 519 if ((2L * chunk + 1) * nproc >= tc) { 520 /* chunk size too large, switch to dynamic */ 521 schedule = kmp_sch_dynamic_chunked; 522 } else { 523 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 524 DBL x; 525 526 #if KMP_USE_X87CONTROL 527 /* Linux* OS already has 64-bit computation by default for long double, 528 and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On 529 Windows* OS on IA-32 architecture, we need to set precision to 64-bit 530 instead of the default 53-bit. Even though long double doesn't work 531 on Windows* OS on Intel(R) 64, the resulting lack of precision is not 532 expected to impact the correctness of the algorithm, but this has not 533 been mathematically proven. 
*/ 534 // save original FPCW and set precision to 64-bit, as 535 // Windows* OS on IA-32 architecture defaults to 53-bit 536 unsigned int oldFpcw = _control87(0, 0); 537 _control87(_PC_64, _MCW_PC); // 0,0x30000 538 #endif 539 /* value used for comparison in solver for cross-over point */ 540 long double target = ((long double)chunk * 2 + 1) * nproc / tc; 541 542 /* crossover point--chunk indexes equal to or greater than 543 this point switch to dynamic-style scheduling */ 544 UT cross; 545 546 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 547 x = 1.0 - 0.5 / (double)nproc; 548 549 #ifdef KMP_DEBUG 550 { // test natural alignment 551 struct _test_a { 552 char a; 553 union { 554 char b; 555 DBL d; 556 }; 557 } t; 558 ptrdiff_t natural_alignment = 559 (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; 560 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long 561 // long)natural_alignment ); 562 KMP_DEBUG_ASSERT( 563 (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0); 564 } 565 #endif // KMP_DEBUG 566 567 /* save the term in thread private dispatch structure */ 568 *(DBL *)&pr->u.p.parm3 = x; 569 570 /* solve for the crossover point to the nearest integer i for which C_i 571 <= chunk */ 572 { 573 UT left, right, mid; 574 long double p; 575 576 /* estimate initial upper and lower bound */ 577 578 /* doesn't matter what value right is as long as it is positive, but 579 it affects performance of the solver */ 580 right = 229; 581 p = __kmp_pow<UT>(x, right); 582 if (p > target) { 583 do { 584 p *= p; 585 right <<= 1; 586 } while (p > target && right < (1 << 27)); 587 /* lower bound is previous (failed) estimate of upper bound */ 588 left = right >> 1; 589 } else { 590 left = 0; 591 } 592 593 /* bisection root-finding method */ 594 while (left + 1 < right) { 595 mid = (left + right) / 2; 596 if (__kmp_pow<UT>(x, mid) > target) { 597 left = mid; 598 } else { 599 right = mid; 600 } 601 } // while 602 cross = right; 603 } 604 /* assert sanity of computed crossover point */ 605 KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target && 606 __kmp_pow<UT>(x, cross) <= target); 607 608 /* save the crossover point in thread private dispatch structure */ 609 pr->u.p.parm2 = cross; 610 611 // C75803 612 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8)) 613 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3) 614 #else 615 #define GUIDED_ANALYTICAL_WORKAROUND (x) 616 #endif 617 /* dynamic-style scheduling offset */ 618 pr->u.p.count = tc - 619 __kmp_dispatch_guided_remaining( 620 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - 621 cross * chunk; 622 #if KMP_USE_X87CONTROL 623 // restore FPCW 624 _control87(oldFpcw, _MCW_PC); 625 #endif 626 } // if 627 } else { 628 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 629 "kmp_sch_static_greedy\n", 630 gtid)); 631 schedule = kmp_sch_static_greedy; 632 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 633 pr->u.p.parm1 = tc; 634 } // if 635 } // case 636 break; 637 case kmp_sch_static_greedy: 638 KD_TRACE( 639 100, 640 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n", 641 gtid)); 642 pr->u.p.parm1 = (nproc > 1) ? 
(tc + nproc - 1) / nproc : tc; 643 break; 644 case kmp_sch_static_chunked: 645 case kmp_sch_dynamic_chunked: 646 if (pr->u.p.parm1 <= 0) 647 pr->u.p.parm1 = KMP_DEFAULT_CHUNK; 648 else if (pr->u.p.parm1 > tc) 649 pr->u.p.parm1 = tc; 650 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d " 651 "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", 652 gtid)); 653 break; 654 case kmp_sch_trapezoidal: { 655 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ 656 657 T parm1, parm2, parm3, parm4; 658 KD_TRACE(100, 659 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n", 660 gtid)); 661 662 parm1 = chunk; 663 664 /* F : size of the first cycle */ 665 parm2 = (tc / (2 * nproc)); 666 667 if (parm2 < 1) { 668 parm2 = 1; 669 } 670 671 /* L : size of the last cycle. Make sure the last cycle is not larger 672 than the first cycle. */ 673 if (parm1 < 1) { 674 parm1 = 1; 675 } else if (parm1 > parm2) { 676 parm1 = parm2; 677 } 678 679 /* N : number of cycles */ 680 parm3 = (parm2 + parm1); 681 parm3 = (2 * tc + parm3 - 1) / parm3; 682 683 if (parm3 < 2) { 684 parm3 = 2; 685 } 686 687 /* sigma : decreasing incr of the trapezoid */ 688 parm4 = (parm3 - 1); 689 parm4 = (parm2 - parm1) / parm4; 690 691 // pointless check, because parm4 >= 0 always 692 // if ( parm4 < 0 ) { 693 // parm4 = 0; 694 //} 695 696 pr->u.p.parm1 = parm1; 697 pr->u.p.parm2 = parm2; 698 pr->u.p.parm3 = parm3; 699 pr->u.p.parm4 = parm4; 700 } // case 701 break; 702 703 default: { 704 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 705 KMP_HNT(GetNewerLibrary), // Hint 706 __kmp_msg_null // Variadic argument list terminator 707 ); 708 } break; 709 } // switch 710 pr->schedule = schedule; 711 } 712 713 #if KMP_USE_HIER_SCHED 714 template <typename T> 715 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub, 716 typename traits_t<T>::signed_t st); 717 template <> 718 inline void 719 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb, 720 kmp_int32 ub, kmp_int32 st) { 721 __kmp_dispatch_init_hierarchy<kmp_int32>( 722 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 723 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st); 724 } 725 template <> 726 inline void 727 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb, 728 kmp_uint32 ub, kmp_int32 st) { 729 __kmp_dispatch_init_hierarchy<kmp_uint32>( 730 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 731 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st); 732 } 733 template <> 734 inline void 735 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb, 736 kmp_int64 ub, kmp_int64 st) { 737 __kmp_dispatch_init_hierarchy<kmp_int64>( 738 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 739 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st); 740 } 741 template <> 742 inline void 743 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb, 744 kmp_uint64 ub, kmp_int64 st) { 745 __kmp_dispatch_init_hierarchy<kmp_uint64>( 746 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 747 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st); 748 } 749 750 // free all the hierarchy scheduling memory associated with the team 751 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) { 752 int num_disp_buff = team->t.t_max_nproc > 1 ? 
__kmp_dispatch_num_buffers : 2; 753 for (int i = 0; i < num_disp_buff; ++i) { 754 // type does not matter here so use kmp_int32 755 auto sh = 756 reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>( 757 &team->t.t_disp_buffer[i]); 758 if (sh->hier) { 759 sh->hier->deallocate(); 760 __kmp_free(sh->hier); 761 } 762 } 763 } 764 #endif 765 766 // UT - unsigned flavor of T, ST - signed flavor of T, 767 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8 768 template <typename T> 769 static void 770 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb, 771 T ub, typename traits_t<T>::signed_t st, 772 typename traits_t<T>::signed_t chunk, int push_ws) { 773 typedef typename traits_t<T>::unsigned_t UT; 774 775 int active; 776 kmp_info_t *th; 777 kmp_team_t *team; 778 kmp_uint32 my_buffer_index; 779 dispatch_private_info_template<T> *pr; 780 dispatch_shared_info_template<T> volatile *sh; 781 782 KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) == 783 sizeof(dispatch_private_info)); 784 KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) == 785 sizeof(dispatch_shared_info)); 786 __kmp_assert_valid_gtid(gtid); 787 788 if (!TCR_4(__kmp_init_parallel)) 789 __kmp_parallel_initialize(); 790 791 __kmp_resume_if_soft_paused(); 792 793 #if INCLUDE_SSC_MARKS 794 SSC_MARK_DISPATCH_INIT(); 795 #endif 796 #ifdef KMP_DEBUG 797 typedef typename traits_t<T>::signed_t ST; 798 { 799 char *buff; 800 // create format specifiers before the debug output 801 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d " 802 "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n", 803 traits_t<ST>::spec, traits_t<T>::spec, 804 traits_t<T>::spec, traits_t<ST>::spec); 805 KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st)); 806 __kmp_str_free(&buff); 807 } 808 #endif 809 /* setup data */ 810 th = __kmp_threads[gtid]; 811 team = th->th.th_team; 812 active = !team->t.t_serialized; 813 th->th.th_ident = loc; 814 815 // Any half-decent optimizer will remove this test when the blocks are empty 816 // since the macros expand to nothing 817 // when statistics are disabled. 818 if (schedule == __kmp_static) { 819 KMP_COUNT_BLOCK(OMP_LOOP_STATIC); 820 } else { 821 KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC); 822 } 823 824 #if KMP_USE_HIER_SCHED 825 // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable 826 // Hierarchical scheduling does not work with ordered, so if ordered is 827 // detected, then revert back to threaded scheduling. 828 bool ordered; 829 enum sched_type my_sched = schedule; 830 my_buffer_index = th->th.th_dispatch->th_disp_index; 831 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 832 &th->th.th_dispatch 833 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 834 my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched); 835 if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper)) 836 my_sched = 837 (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower)); 838 ordered = (kmp_ord_lower & my_sched); 839 if (pr->flags.use_hier) { 840 if (ordered) { 841 KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. 
" 842 "Disabling hierarchical scheduling.\n", 843 gtid)); 844 pr->flags.use_hier = FALSE; 845 } 846 } 847 if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) { 848 // Don't use hierarchical for ordered parallel loops and don't 849 // use the runtime hierarchy if one was specified in the program 850 if (!ordered && !pr->flags.use_hier) 851 __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st); 852 } 853 #endif // KMP_USE_HIER_SCHED 854 855 #if USE_ITT_BUILD 856 kmp_uint64 cur_chunk = chunk; 857 int itt_need_metadata_reporting = 858 __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 859 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && 860 team->t.t_active_level == 1; 861 #endif 862 if (!active) { 863 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 864 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 865 } else { 866 KMP_DEBUG_ASSERT(th->th.th_dispatch == 867 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 868 869 my_buffer_index = th->th.th_dispatch->th_disp_index++; 870 871 /* What happens when number of threads changes, need to resize buffer? */ 872 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 873 &th->th.th_dispatch 874 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 875 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 876 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 877 KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid, 878 my_buffer_index)); 879 } 880 881 __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st, 882 #if USE_ITT_BUILD 883 &cur_chunk, 884 #endif 885 chunk, (T)th->th.th_team_nproc, 886 (T)th->th.th_info.ds.ds_tid); 887 if (active) { 888 if (pr->flags.ordered == 0) { 889 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error; 890 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error; 891 } else { 892 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>; 893 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>; 894 } 895 } 896 897 if (active) { 898 /* The name of this buffer should be my_buffer_index when it's free to use 899 * it */ 900 901 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d " 902 "sh->buffer_index:%d\n", 903 gtid, my_buffer_index, sh->buffer_index)); 904 __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index, 905 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL)); 906 // Note: KMP_WAIT() cannot be used there: buffer index and 907 // my_buffer_index are *always* 32-bit integers. 908 KMP_MB(); /* is this necessary? 
*/ 909 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d " 910 "sh->buffer_index:%d\n", 911 gtid, my_buffer_index, sh->buffer_index)); 912 913 th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr; 914 th->th.th_dispatch->th_dispatch_sh_current = 915 CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh); 916 #if USE_ITT_BUILD 917 if (pr->flags.ordered) { 918 __kmp_itt_ordered_init(gtid); 919 } 920 // Report loop metadata 921 if (itt_need_metadata_reporting) { 922 // Only report metadata by master of active team at level 1 923 kmp_uint64 schedtype = 0; 924 switch (schedule) { 925 case kmp_sch_static_chunked: 926 case kmp_sch_static_balanced: // Chunk is calculated in the switch above 927 break; 928 case kmp_sch_static_greedy: 929 cur_chunk = pr->u.p.parm1; 930 break; 931 case kmp_sch_dynamic_chunked: 932 schedtype = 1; 933 break; 934 case kmp_sch_guided_iterative_chunked: 935 case kmp_sch_guided_analytical_chunked: 936 case kmp_sch_guided_simd: 937 schedtype = 2; 938 break; 939 default: 940 // Should we put this case under "static"? 941 // case kmp_sch_static_steal: 942 schedtype = 3; 943 break; 944 } 945 __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk); 946 } 947 #if KMP_USE_HIER_SCHED 948 if (pr->flags.use_hier) { 949 pr->u.p.count = 0; 950 pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0; 951 } 952 #endif // KMP_USER_HIER_SCHED 953 #endif /* USE_ITT_BUILD */ 954 } 955 956 #ifdef KMP_DEBUG 957 { 958 char *buff; 959 // create format specifiers before the debug output 960 buff = __kmp_str_format( 961 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s " 962 "lb:%%%s ub:%%%s" 963 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" 964 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n", 965 traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec, 966 traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec, 967 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec, 968 traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec); 969 KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb, 970 pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count, 971 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1, 972 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4)); 973 __kmp_str_free(&buff); 974 } 975 #endif 976 #if (KMP_STATIC_STEAL_ENABLED) 977 // It cannot be guaranteed that after execution of a loop with some other 978 // schedule kind all the parm3 variables will contain the same value. Even if 979 // all parm3 will be the same, it still exists a bad case like using 0 and 1 980 // rather than program life-time increment. So the dedicated variable is 981 // required. The 'static_steal_counter' is used. 982 if (pr->schedule == kmp_sch_static_steal) { 983 // Other threads will inspect this variable when searching for a victim. 984 // This is a flag showing that other threads may steal from this thread 985 // since then. 
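// In __kmp_dispatch_next_algorithm a would-be thief compares its own
// static_steal_counter ("loop id") with the victim's and skips victims whose
// counter does not match, i.e. victims that have not yet initialized this
// loop instance; that is why the counter is bumped here before any stealing
// can begin.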
986 volatile T *p = &pr->u.p.static_steal_counter; 987 *p = *p + 1; 988 } 989 #endif // ( KMP_STATIC_STEAL_ENABLED ) 990 991 #if OMPT_SUPPORT && OMPT_OPTIONAL 992 if (ompt_enabled.ompt_callback_work) { 993 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 994 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 995 ompt_callbacks.ompt_callback(ompt_callback_work)( 996 ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data), 997 &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid)); 998 } 999 #endif 1000 KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic); 1001 } 1002 1003 /* For ordered loops, either __kmp_dispatch_finish() should be called after 1004 * every iteration, or __kmp_dispatch_finish_chunk() should be called after 1005 * every chunk of iterations. If the ordered section(s) were not executed 1006 * for this iteration (or every iteration in this chunk), we need to set the 1007 * ordered iteration counters so that the next thread can proceed. */ 1008 template <typename UT> 1009 static void __kmp_dispatch_finish(int gtid, ident_t *loc) { 1010 typedef typename traits_t<UT>::signed_t ST; 1011 __kmp_assert_valid_gtid(gtid); 1012 kmp_info_t *th = __kmp_threads[gtid]; 1013 1014 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid)); 1015 if (!th->th.th_team->t.t_serialized) { 1016 1017 dispatch_private_info_template<UT> *pr = 1018 reinterpret_cast<dispatch_private_info_template<UT> *>( 1019 th->th.th_dispatch->th_dispatch_pr_current); 1020 dispatch_shared_info_template<UT> volatile *sh = 1021 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 1022 th->th.th_dispatch->th_dispatch_sh_current); 1023 KMP_DEBUG_ASSERT(pr); 1024 KMP_DEBUG_ASSERT(sh); 1025 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1026 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1027 1028 if (pr->ordered_bumped) { 1029 KD_TRACE( 1030 1000, 1031 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1032 gtid)); 1033 pr->ordered_bumped = 0; 1034 } else { 1035 UT lower = pr->u.p.ordered_lower; 1036 1037 #ifdef KMP_DEBUG 1038 { 1039 char *buff; 1040 // create format specifiers before the debug output 1041 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: " 1042 "ordered_iteration:%%%s lower:%%%s\n", 1043 traits_t<UT>::spec, traits_t<UT>::spec); 1044 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 1045 __kmp_str_free(&buff); 1046 } 1047 #endif 1048 1049 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower, 1050 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 1051 KMP_MB(); /* is this necessary? 
*/ 1052 #ifdef KMP_DEBUG 1053 { 1054 char *buff; 1055 // create format specifiers before the debug output 1056 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: " 1057 "ordered_iteration:%%%s lower:%%%s\n", 1058 traits_t<UT>::spec, traits_t<UT>::spec); 1059 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 1060 __kmp_str_free(&buff); 1061 } 1062 #endif 1063 1064 test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration); 1065 } // if 1066 } // if 1067 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid)); 1068 } 1069 1070 #ifdef KMP_GOMP_COMPAT 1071 1072 template <typename UT> 1073 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) { 1074 typedef typename traits_t<UT>::signed_t ST; 1075 __kmp_assert_valid_gtid(gtid); 1076 kmp_info_t *th = __kmp_threads[gtid]; 1077 1078 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid)); 1079 if (!th->th.th_team->t.t_serialized) { 1080 // int cid; 1081 dispatch_private_info_template<UT> *pr = 1082 reinterpret_cast<dispatch_private_info_template<UT> *>( 1083 th->th.th_dispatch->th_dispatch_pr_current); 1084 dispatch_shared_info_template<UT> volatile *sh = 1085 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 1086 th->th.th_dispatch->th_dispatch_sh_current); 1087 KMP_DEBUG_ASSERT(pr); 1088 KMP_DEBUG_ASSERT(sh); 1089 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1090 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1091 1092 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) { 1093 UT lower = pr->u.p.ordered_lower; 1094 UT upper = pr->u.p.ordered_upper; 1095 UT inc = upper - lower + 1; 1096 1097 if (pr->ordered_bumped == inc) { 1098 KD_TRACE( 1099 1000, 1100 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1101 gtid)); 1102 pr->ordered_bumped = 0; 1103 } else { 1104 inc -= pr->ordered_bumped; 1105 1106 #ifdef KMP_DEBUG 1107 { 1108 char *buff; 1109 // create format specifiers before the debug output 1110 buff = __kmp_str_format( 1111 "__kmp_dispatch_finish_chunk: T#%%d before wait: " 1112 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n", 1113 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec); 1114 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper)); 1115 __kmp_str_free(&buff); 1116 } 1117 #endif 1118 1119 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower, 1120 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 1121 1122 KMP_MB(); /* is this necessary? */ 1123 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting " 1124 "ordered_bumped to zero\n", 1125 gtid)); 1126 pr->ordered_bumped = 0; 1127 //!!!!! TODO check if the inc should be unsigned, or signed??? 
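// At this point 'inc' holds the iterations of this chunk whose ordered
// sections were never entered (the chunk size minus ordered_bumped); adding
// it to sh->u.s.ordered_iteration below lets threads waiting on subsequent
// iterations proceed.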
1128 #ifdef KMP_DEBUG 1129 { 1130 char *buff; 1131 // create format specifiers before the debug output 1132 buff = __kmp_str_format( 1133 "__kmp_dispatch_finish_chunk: T#%%d after wait: " 1134 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n", 1135 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec, 1136 traits_t<UT>::spec); 1137 KD_TRACE(1000, 1138 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper)); 1139 __kmp_str_free(&buff); 1140 } 1141 #endif 1142 1143 test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc); 1144 } 1145 // } 1146 } 1147 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid)); 1148 } 1149 1150 #endif /* KMP_GOMP_COMPAT */ 1151 1152 template <typename T> 1153 int __kmp_dispatch_next_algorithm(int gtid, 1154 dispatch_private_info_template<T> *pr, 1155 dispatch_shared_info_template<T> volatile *sh, 1156 kmp_int32 *p_last, T *p_lb, T *p_ub, 1157 typename traits_t<T>::signed_t *p_st, T nproc, 1158 T tid) { 1159 typedef typename traits_t<T>::unsigned_t UT; 1160 typedef typename traits_t<T>::signed_t ST; 1161 typedef typename traits_t<T>::floating_t DBL; 1162 int status = 0; 1163 bool last = false; 1164 T start; 1165 ST incr; 1166 UT limit, trip, init; 1167 kmp_info_t *th = __kmp_threads[gtid]; 1168 kmp_team_t *team = th->th.th_team; 1169 1170 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1171 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1172 KMP_DEBUG_ASSERT(pr); 1173 KMP_DEBUG_ASSERT(sh); 1174 KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc); 1175 #ifdef KMP_DEBUG 1176 { 1177 char *buff; 1178 // create format specifiers before the debug output 1179 buff = 1180 __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p " 1181 "sh:%%p nproc:%%%s tid:%%%s\n", 1182 traits_t<T>::spec, traits_t<T>::spec); 1183 KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid)); 1184 __kmp_str_free(&buff); 1185 } 1186 #endif 1187 1188 // zero trip count 1189 if (pr->u.p.tc == 0) { 1190 KD_TRACE(10, 1191 ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is " 1192 "zero status:%d\n", 1193 gtid, status)); 1194 return 0; 1195 } 1196 1197 switch (pr->schedule) { 1198 #if (KMP_STATIC_STEAL_ENABLED) 1199 case kmp_sch_static_steal: { 1200 T chunk = pr->u.p.parm1; 1201 1202 KD_TRACE(100, 1203 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n", 1204 gtid)); 1205 1206 trip = pr->u.p.tc - 1; 1207 1208 if (traits_t<T>::type_size > 4) { 1209 // use lock for 8-byte and CAS for 4-byte induction 1210 // variable. 
TODO (optional): check and use 16-byte CAS 1211 kmp_lock_t *lck = pr->u.p.th_steal_lock; 1212 KMP_DEBUG_ASSERT(lck != NULL); 1213 if (pr->u.p.count < (UT)pr->u.p.ub) { 1214 __kmp_acquire_lock(lck, gtid); 1215 // try to get own chunk of iterations 1216 init = (pr->u.p.count)++; 1217 status = (init < (UT)pr->u.p.ub); 1218 __kmp_release_lock(lck, gtid); 1219 } else { 1220 status = 0; // no own chunks 1221 } 1222 if (!status) { // try to steal 1223 kmp_info_t **other_threads = team->t.t_threads; 1224 T while_limit = pr->u.p.parm3; 1225 T while_index = 0; 1226 T id = pr->u.p.static_steal_counter; // loop id 1227 int idx = (th->th.th_dispatch->th_disp_index - 1) % 1228 __kmp_dispatch_num_buffers; // current loop index 1229 // note: victim thread can potentially execute another loop 1230 // TODO: algorithm of searching for a victim 1231 // should be cleaned up and measured 1232 while ((!status) && (while_limit != ++while_index)) { 1233 dispatch_private_info_template<T> *victim; 1234 T remaining; 1235 T victimIdx = pr->u.p.parm4; 1236 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1; 1237 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1238 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); 1239 KMP_DEBUG_ASSERT(victim); 1240 while ((victim == pr || id != victim->u.p.static_steal_counter) && 1241 oldVictimIdx != victimIdx) { 1242 victimIdx = (victimIdx + 1) % nproc; 1243 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1244 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); 1245 KMP_DEBUG_ASSERT(victim); 1246 } 1247 if (victim == pr || id != victim->u.p.static_steal_counter) { 1248 continue; // try once more (nproc attempts in total) 1249 // no victim is ready yet to participate in stealing 1250 // because no victim passed kmp_init_dispatch yet 1251 } 1252 if (victim->u.p.count + 2 > (UT)victim->u.p.ub) { 1253 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid 1254 continue; // not enough chunks to steal, goto next victim 1255 } 1256 1257 lck = victim->u.p.th_steal_lock; 1258 KMP_ASSERT(lck != NULL); 1259 __kmp_acquire_lock(lck, gtid); 1260 limit = victim->u.p.ub; // keep initial ub 1261 if (victim->u.p.count >= limit || 1262 (remaining = limit - victim->u.p.count) < 2) { 1263 __kmp_release_lock(lck, gtid); 1264 pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim 1265 continue; // not enough chunks to steal 1266 } 1267 // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or 1268 // by 1 1269 if (remaining > 3) { 1270 // steal 1/4 of remaining 1271 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2); 1272 init = (victim->u.p.ub -= (remaining >> 2)); 1273 } else { 1274 // steal 1 chunk of 2 or 3 remaining 1275 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1); 1276 init = (victim->u.p.ub -= 1); 1277 } 1278 __kmp_release_lock(lck, gtid); 1279 1280 KMP_DEBUG_ASSERT(init + 1 <= limit); 1281 pr->u.p.parm4 = victimIdx; // remember victim to steal from 1282 status = 1; 1283 while_index = 0; 1284 // now update own count and ub with stolen range but init chunk 1285 __kmp_acquire_lock(pr->u.p.th_steal_lock, gtid); 1286 pr->u.p.count = init + 1; 1287 pr->u.p.ub = limit; 1288 __kmp_release_lock(pr->u.p.th_steal_lock, gtid); 1289 } // while (search for victim) 1290 } // if (try to find victim and steal) 1291 } else { 1292 // 4-byte induction variable, use 8-byte CAS for pair (count, ub) 1293 typedef union { 1294 struct { 1295 UT count; 1296 T ub; 1297 } p; 1298 kmp_int64 b; 1299 } union_i4; 1300 // All 
operations on 'count' or 'ub' must be combined atomically 1301 // together. 1302 { 1303 union_i4 vold, vnew; 1304 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 1305 vnew = vold; 1306 vnew.p.count++; 1307 while (!KMP_COMPARE_AND_STORE_ACQ64( 1308 (volatile kmp_int64 *)&pr->u.p.count, 1309 *VOLATILE_CAST(kmp_int64 *) & vold.b, 1310 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 1311 KMP_CPU_PAUSE(); 1312 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 1313 vnew = vold; 1314 vnew.p.count++; 1315 } 1316 vnew = vold; 1317 init = vnew.p.count; 1318 status = (init < (UT)vnew.p.ub); 1319 } 1320 1321 if (!status) { 1322 kmp_info_t **other_threads = team->t.t_threads; 1323 T while_limit = pr->u.p.parm3; 1324 T while_index = 0; 1325 T id = pr->u.p.static_steal_counter; // loop id 1326 int idx = (th->th.th_dispatch->th_disp_index - 1) % 1327 __kmp_dispatch_num_buffers; // current loop index 1328 // note: victim thread can potentially execute another loop 1329 // TODO: algorithm of searching for a victim 1330 // should be cleaned up and measured 1331 while ((!status) && (while_limit != ++while_index)) { 1332 dispatch_private_info_template<T> *victim; 1333 union_i4 vold, vnew; 1334 T remaining; 1335 T victimIdx = pr->u.p.parm4; 1336 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1; 1337 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1338 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); 1339 KMP_DEBUG_ASSERT(victim); 1340 while ((victim == pr || id != victim->u.p.static_steal_counter) && 1341 oldVictimIdx != victimIdx) { 1342 victimIdx = (victimIdx + 1) % nproc; 1343 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1344 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); 1345 KMP_DEBUG_ASSERT(victim); 1346 } 1347 if (victim == pr || id != victim->u.p.static_steal_counter) { 1348 continue; // try once more (nproc attempts in total) 1349 // no victim is ready yet to participate in stealing 1350 // because no victim passed kmp_init_dispatch yet 1351 } 1352 pr->u.p.parm4 = victimIdx; // new victim found 1353 while (1) { // CAS loop if victim has enough chunks to steal 1354 vold.b = *(volatile kmp_int64 *)(&victim->u.p.count); 1355 vnew = vold; 1356 1357 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1358 if (vnew.p.count >= (UT)vnew.p.ub || 1359 (remaining = vnew.p.ub - vnew.p.count) < 2) { 1360 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id 1361 break; // not enough chunks to steal, goto next victim 1362 } 1363 if (remaining > 3) { 1364 // try to steal 1/4 of remaining 1365 vnew.p.ub -= remaining >> 2; 1366 } else { 1367 vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining 1368 } 1369 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1370 // TODO: Should this be acquire or release? 
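// Packing (count, ub) into a single 64-bit union_i4 lets one CAS lower the
// victim's upper bound atomically with respect to the victim's own count
// increments. Illustrative numbers (not from the source): with count=2 and
// ub=10, remaining=8, so the thief tries to publish ub=8 and, on success,
// takes over chunk indexes 8 and 9 while the victim keeps 2..7.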
1371 if (KMP_COMPARE_AND_STORE_ACQ64( 1372 (volatile kmp_int64 *)&victim->u.p.count, 1373 *VOLATILE_CAST(kmp_int64 *) & vold.b, 1374 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 1375 // stealing succeeded 1376 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1377 vold.p.ub - vnew.p.ub); 1378 status = 1; 1379 while_index = 0; 1380 // now update own count and ub 1381 init = vnew.p.ub; 1382 vold.p.count = init + 1; 1383 #if KMP_ARCH_X86 1384 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b); 1385 #else 1386 *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b; 1387 #endif 1388 break; 1389 } // if (check CAS result) 1390 KMP_CPU_PAUSE(); // CAS failed, repeatedly attempt 1391 } // while (try to steal from particular victim) 1392 } // while (search for victim) 1393 } // if (try to find victim and steal) 1394 } // if (4-byte induction variable) 1395 if (!status) { 1396 *p_lb = 0; 1397 *p_ub = 0; 1398 if (p_st != NULL) 1399 *p_st = 0; 1400 } else { 1401 start = pr->u.p.parm2; 1402 init *= chunk; 1403 limit = chunk + init - 1; 1404 incr = pr->u.p.st; 1405 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1); 1406 1407 KMP_DEBUG_ASSERT(init <= trip); 1408 if ((last = (limit >= trip)) != 0) 1409 limit = trip; 1410 if (p_st != NULL) 1411 *p_st = incr; 1412 1413 if (incr == 1) { 1414 *p_lb = start + init; 1415 *p_ub = start + limit; 1416 } else { 1417 *p_lb = start + init * incr; 1418 *p_ub = start + limit * incr; 1419 } 1420 1421 if (pr->flags.ordered) { 1422 pr->u.p.ordered_lower = init; 1423 pr->u.p.ordered_upper = limit; 1424 } // if 1425 } // if 1426 break; 1427 } // case 1428 #endif // ( KMP_STATIC_STEAL_ENABLED ) 1429 case kmp_sch_static_balanced: { 1430 KD_TRACE( 1431 10, 1432 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n", 1433 gtid)); 1434 /* check if thread has any iteration to do */ 1435 if ((status = !pr->u.p.count) != 0) { 1436 pr->u.p.count = 1; 1437 *p_lb = pr->u.p.lb; 1438 *p_ub = pr->u.p.ub; 1439 last = (pr->u.p.parm1 != 0); 1440 if (p_st != NULL) 1441 *p_st = pr->u.p.st; 1442 } else { /* no iterations to do */ 1443 pr->u.p.lb = pr->u.p.ub + pr->u.p.st; 1444 } 1445 } // case 1446 break; 1447 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was 1448 merged here */ 1449 case kmp_sch_static_chunked: { 1450 T parm1; 1451 1452 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d " 1453 "kmp_sch_static_[affinity|chunked] case\n", 1454 gtid)); 1455 parm1 = pr->u.p.parm1; 1456 1457 trip = pr->u.p.tc - 1; 1458 init = parm1 * (pr->u.p.count + tid); 1459 1460 if ((status = (init <= trip)) != 0) { 1461 start = pr->u.p.lb; 1462 incr = pr->u.p.st; 1463 limit = parm1 + init - 1; 1464 1465 if ((last = (limit >= trip)) != 0) 1466 limit = trip; 1467 1468 if (p_st != NULL) 1469 *p_st = incr; 1470 1471 pr->u.p.count += nproc; 1472 1473 if (incr == 1) { 1474 *p_lb = start + init; 1475 *p_ub = start + limit; 1476 } else { 1477 *p_lb = start + init * incr; 1478 *p_ub = start + limit * incr; 1479 } 1480 1481 if (pr->flags.ordered) { 1482 pr->u.p.ordered_lower = init; 1483 pr->u.p.ordered_upper = limit; 1484 } // if 1485 } // if 1486 } // case 1487 break; 1488 1489 case kmp_sch_dynamic_chunked: { 1490 T chunk = pr->u.p.parm1; 1491 1492 KD_TRACE( 1493 100, 1494 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n", 1495 gtid)); 1496 1497 init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 1498 trip = pr->u.p.tc - 1; 1499 1500 if ((status = (init <= trip)) == 0) { 1501 *p_lb = 0; 1502 *p_ub = 0; 1503 if (p_st != NULL) 
1504 *p_st = 0; 1505 } else { 1506 start = pr->u.p.lb; 1507 limit = chunk + init - 1; 1508 incr = pr->u.p.st; 1509 1510 if ((last = (limit >= trip)) != 0) 1511 limit = trip; 1512 1513 if (p_st != NULL) 1514 *p_st = incr; 1515 1516 if (incr == 1) { 1517 *p_lb = start + init; 1518 *p_ub = start + limit; 1519 } else { 1520 *p_lb = start + init * incr; 1521 *p_ub = start + limit * incr; 1522 } 1523 1524 if (pr->flags.ordered) { 1525 pr->u.p.ordered_lower = init; 1526 pr->u.p.ordered_upper = limit; 1527 } // if 1528 } // if 1529 } // case 1530 break; 1531 1532 case kmp_sch_guided_iterative_chunked: { 1533 T chunkspec = pr->u.p.parm1; 1534 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked " 1535 "iterative case\n", 1536 gtid)); 1537 trip = pr->u.p.tc; 1538 // Start atomic part of calculations 1539 while (1) { 1540 ST remaining; // signed, because can be < 0 1541 init = sh->u.s.iteration; // shared value 1542 remaining = trip - init; 1543 if (remaining <= 0) { // AC: need to compare with 0 first 1544 // nothing to do, don't try atomic op 1545 status = 0; 1546 break; 1547 } 1548 if ((T)remaining < 1549 pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default 1550 // use dynamic-style schedule 1551 // atomically increment iterations, get old value 1552 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1553 (ST)chunkspec); 1554 remaining = trip - init; 1555 if (remaining <= 0) { 1556 status = 0; // all iterations got by other threads 1557 } else { 1558 // got some iterations to work on 1559 status = 1; 1560 if ((T)remaining > chunkspec) { 1561 limit = init + chunkspec - 1; 1562 } else { 1563 last = true; // the last chunk 1564 limit = init + remaining - 1; 1565 } // if 1566 } // if 1567 break; 1568 } // if 1569 limit = init + (UT)((double)remaining * 1570 *(double *)&pr->u.p.parm3); // divide by K*nproc 1571 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1572 (ST)init, (ST)limit)) { 1573 // CAS was successful, chunk obtained 1574 status = 1; 1575 --limit; 1576 break; 1577 } // if 1578 } // while 1579 if (status != 0) { 1580 start = pr->u.p.lb; 1581 incr = pr->u.p.st; 1582 if (p_st != NULL) 1583 *p_st = incr; 1584 *p_lb = start + init * incr; 1585 *p_ub = start + limit * incr; 1586 if (pr->flags.ordered) { 1587 pr->u.p.ordered_lower = init; 1588 pr->u.p.ordered_upper = limit; 1589 } // if 1590 } else { 1591 *p_lb = 0; 1592 *p_ub = 0; 1593 if (p_st != NULL) 1594 *p_st = 0; 1595 } // if 1596 } // case 1597 break; 1598 1599 case kmp_sch_guided_simd: { 1600 // same as iterative but curr-chunk adjusted to be multiple of given 1601 // chunk 1602 T chunk = pr->u.p.parm1; 1603 KD_TRACE(100, 1604 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n", 1605 gtid)); 1606 trip = pr->u.p.tc; 1607 // Start atomic part of calculations 1608 while (1) { 1609 ST remaining; // signed, because can be < 0 1610 init = sh->u.s.iteration; // shared value 1611 remaining = trip - init; 1612 if (remaining <= 0) { // AC: need to compare with 0 first 1613 status = 0; // nothing to do, don't try atomic op 1614 break; 1615 } 1616 KMP_DEBUG_ASSERT(init % chunk == 0); 1617 // compare with K*nproc*(chunk+1), K=2 by default 1618 if ((T)remaining < pr->u.p.parm2) { 1619 // use dynamic-style schedule 1620 // atomically increment iterations, get old value 1621 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1622 (ST)chunk); 1623 remaining = trip - init; 1624 if (remaining <= 0) { 1625 status = 0; // all iterations got by other threads 
1626 } else { 1627 // got some iterations to work on 1628 status = 1; 1629 if ((T)remaining > chunk) { 1630 limit = init + chunk - 1; 1631 } else { 1632 last = true; // the last chunk 1633 limit = init + remaining - 1; 1634 } // if 1635 } // if 1636 break; 1637 } // if 1638 // divide by K*nproc 1639 UT span; 1640 __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3), 1641 &span); 1642 UT rem = span % chunk; 1643 if (rem) // adjust so that span%chunk == 0 1644 span += chunk - rem; 1645 limit = init + span; 1646 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1647 (ST)init, (ST)limit)) { 1648 // CAS was successful, chunk obtained 1649 status = 1; 1650 --limit; 1651 break; 1652 } // if 1653 } // while 1654 if (status != 0) { 1655 start = pr->u.p.lb; 1656 incr = pr->u.p.st; 1657 if (p_st != NULL) 1658 *p_st = incr; 1659 *p_lb = start + init * incr; 1660 *p_ub = start + limit * incr; 1661 if (pr->flags.ordered) { 1662 pr->u.p.ordered_lower = init; 1663 pr->u.p.ordered_upper = limit; 1664 } // if 1665 } else { 1666 *p_lb = 0; 1667 *p_ub = 0; 1668 if (p_st != NULL) 1669 *p_st = 0; 1670 } // if 1671 } // case 1672 break; 1673 1674 case kmp_sch_guided_analytical_chunked: { 1675 T chunkspec = pr->u.p.parm1; 1676 UT chunkIdx; 1677 #if KMP_USE_X87CONTROL 1678 /* for storing original FPCW value for Windows* OS on 1679 IA-32 architecture 8-byte version */ 1680 unsigned int oldFpcw; 1681 unsigned int fpcwSet = 0; 1682 #endif 1683 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d " 1684 "kmp_sch_guided_analytical_chunked case\n", 1685 gtid)); 1686 1687 trip = pr->u.p.tc; 1688 1689 KMP_DEBUG_ASSERT(nproc > 1); 1690 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip); 1691 1692 while (1) { /* this while loop is a safeguard against unexpected zero 1693 chunk sizes */ 1694 chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 1695 if (chunkIdx >= (UT)pr->u.p.parm2) { 1696 --trip; 1697 /* use dynamic-style scheduling */ 1698 init = chunkIdx * chunkspec + pr->u.p.count; 1699 /* need to verify init > 0 in case of overflow in the above 1700 * calculation */ 1701 if ((status = (init > 0 && init <= trip)) != 0) { 1702 limit = init + chunkspec - 1; 1703 1704 if ((last = (limit >= trip)) != 0) 1705 limit = trip; 1706 } 1707 break; 1708 } else { 1709 /* use exponential-style scheduling */ 1710 /* The following check is to workaround the lack of long double precision on 1711 Windows* OS. 1712 This check works around the possible effect that init != 0 for chunkIdx == 0. 
1713 */ 1714 #if KMP_USE_X87CONTROL 1715 /* If we haven't already done so, save original 1716 FPCW and set precision to 64-bit, as Windows* OS 1717 on IA-32 architecture defaults to 53-bit */ 1718 if (!fpcwSet) { 1719 oldFpcw = _control87(0, 0); 1720 _control87(_PC_64, _MCW_PC); 1721 fpcwSet = 0x30000; 1722 } 1723 #endif 1724 if (chunkIdx) { 1725 init = __kmp_dispatch_guided_remaining<T>( 1726 trip, *(DBL *)&pr->u.p.parm3, chunkIdx); 1727 KMP_DEBUG_ASSERT(init); 1728 init = trip - init; 1729 } else 1730 init = 0; 1731 limit = trip - __kmp_dispatch_guided_remaining<T>( 1732 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1); 1733 KMP_ASSERT(init <= limit); 1734 if (init < limit) { 1735 KMP_DEBUG_ASSERT(limit <= trip); 1736 --limit; 1737 status = 1; 1738 break; 1739 } // if 1740 } // if 1741 } // while (1) 1742 #if KMP_USE_X87CONTROL 1743 /* restore FPCW if necessary 1744 AC: check fpcwSet flag first because oldFpcw can be uninitialized here 1745 */ 1746 if (fpcwSet && (oldFpcw & fpcwSet)) 1747 _control87(oldFpcw, _MCW_PC); 1748 #endif 1749 if (status != 0) { 1750 start = pr->u.p.lb; 1751 incr = pr->u.p.st; 1752 if (p_st != NULL) 1753 *p_st = incr; 1754 *p_lb = start + init * incr; 1755 *p_ub = start + limit * incr; 1756 if (pr->flags.ordered) { 1757 pr->u.p.ordered_lower = init; 1758 pr->u.p.ordered_upper = limit; 1759 } 1760 } else { 1761 *p_lb = 0; 1762 *p_ub = 0; 1763 if (p_st != NULL) 1764 *p_st = 0; 1765 } 1766 } // case 1767 break; 1768 1769 case kmp_sch_trapezoidal: { 1770 UT index; 1771 T parm2 = pr->u.p.parm2; 1772 T parm3 = pr->u.p.parm3; 1773 T parm4 = pr->u.p.parm4; 1774 KD_TRACE(100, 1775 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n", 1776 gtid)); 1777 1778 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration); 1779 1780 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2; 1781 trip = pr->u.p.tc - 1; 1782 1783 if ((status = ((T)index < parm3 && init <= trip)) == 0) { 1784 *p_lb = 0; 1785 *p_ub = 0; 1786 if (p_st != NULL) 1787 *p_st = 0; 1788 } else { 1789 start = pr->u.p.lb; 1790 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1; 1791 incr = pr->u.p.st; 1792 1793 if ((last = (limit >= trip)) != 0) 1794 limit = trip; 1795 1796 if (p_st != NULL) 1797 *p_st = incr; 1798 1799 if (incr == 1) { 1800 *p_lb = start + init; 1801 *p_ub = start + limit; 1802 } else { 1803 *p_lb = start + init * incr; 1804 *p_ub = start + limit * incr; 1805 } 1806 1807 if (pr->flags.ordered) { 1808 pr->u.p.ordered_lower = init; 1809 pr->u.p.ordered_upper = limit; 1810 } // if 1811 } // if 1812 } // case 1813 break; 1814 default: { 1815 status = 0; // to avoid complaints on uninitialized variable use 1816 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 1817 KMP_HNT(GetNewerLibrary), // Hint 1818 __kmp_msg_null // Variadic argument list terminator 1819 ); 1820 } break; 1821 } // switch 1822 if (p_last) 1823 *p_last = last; 1824 #ifdef KMP_DEBUG 1825 if (pr->flags.ordered) { 1826 char *buff; 1827 // create format specifiers before the debug output 1828 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d " 1829 "ordered_lower:%%%s ordered_upper:%%%s\n", 1830 traits_t<UT>::spec, traits_t<UT>::spec); 1831 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper)); 1832 __kmp_str_free(&buff); 1833 } 1834 { 1835 char *buff; 1836 // create format specifiers before the debug output 1837 buff = __kmp_str_format( 1838 "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d " 1839 "p_lb:%%%s p_ub:%%%s p_st:%%%s\n", 1840 
/* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
   work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
   is not called. */
#if OMPT_SUPPORT && OMPT_OPTIONAL
#define OMPT_LOOP_END                                                          \
  if (status == 0) {                                                           \
    if (ompt_enabled.ompt_callback_work) {                                     \
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
      ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
          ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
          &(task_info->task_data), 0, codeptr);                                \
    }                                                                          \
  }
// TODO: implement count
#else
#define OMPT_LOOP_END // no-op
#endif

#if KMP_STATS_ENABLED
#define KMP_STATS_LOOP_END                                                     \
  {                                                                            \
    kmp_int64 u, l, t, i;                                                      \
    l = (kmp_int64)(*p_lb);                                                    \
    u = (kmp_int64)(*p_ub);                                                    \
    i = (kmp_int64)(pr->u.p.st);                                               \
    if (status == 0) {                                                         \
      t = 0;                                                                   \
      KMP_POP_PARTITIONED_TIMER();                                             \
    } else if (i == 1) {                                                       \
      if (u >= l)                                                              \
        t = u - l + 1;                                                         \
      else                                                                     \
        t = 0;                                                                 \
    } else if (i < 0) {                                                        \
      if (l >= u)                                                              \
        t = (l - u) / (-i) + 1;                                                \
      else                                                                     \
        t = 0;                                                                 \
    } else {                                                                   \
      if (u >= l)                                                              \
        t = (u - l) / i + 1;                                                   \
      else                                                                     \
        t = 0;                                                                 \
    }                                                                          \
    KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
  }
#else
#define KMP_STATS_LOOP_END /* Nothing */
#endif

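// Worked example for the KMP_STATS_LOOP_END count (illustrative numbers only):
// with *p_lb == 0, *p_ub == 9 and stride 2 it records (9 - 0) / 2 + 1 == 5
// iterations {0, 2, 4, 6, 8}; with *p_lb == 9, *p_ub == 1 and stride -2 it
// records (9 - 1) / 2 + 1 == 5 iterations {9, 7, 5, 3, 1}.
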
template <typename T>
static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
                               T *p_lb, T *p_ub,
                               typename traits_t<T>::signed_t *p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                               ,
                               void *codeptr
#endif
) {

  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  // This is potentially slightly misleading: schedule(runtime) will appear
  // here even if the actual runtime schedule is static. (Which points out a
  // disadvantage of schedule(runtime): even when static scheduling is used it
  // costs more than a compile time choice to use static scheduling would.)
  KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);

  int status;
  dispatch_private_info_template<T> *pr;
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
  KD_TRACE(
      1000,
      ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
       gtid, p_lb, p_ub, p_st, p_last));

  if (team->t.t_serialized) {
    /* NOTE: serialize this dispatch because we are not at the active level */
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
    KMP_DEBUG_ASSERT(pr);

    if ((status = (pr->u.p.tc != 0)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      //            if ( p_last != NULL )
      //                *p_last = 0;
      if (p_st != NULL)
        *p_st = 0;
      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        }
      }
    } else if (pr->flags.nomerge) {
      kmp_int32 last;
      T start;
      UT limit, trip, init;
      ST incr;
      T chunk = pr->u.p.parm1;

      KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
                     gtid));

      init = chunk * pr->u.p.count++;
      trip = pr->u.p.tc - 1;

      if ((status = (init <= trip)) == 0) {
        *p_lb = 0;
        *p_ub = 0;
        //              if ( p_last != NULL )
        //                  *p_last = 0;
        if (p_st != NULL)
          *p_st = 0;
        if (__kmp_env_consistency_check) {
          if (pr->pushed_ws != ct_none) {
            pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
          }
        }
      } else {
        start = pr->u.p.lb;
        limit = chunk + init - 1;
        incr = pr->u.p.st;

        if ((last = (limit >= trip)) != 0) {
          limit = trip;
#if KMP_OS_WINDOWS
          pr->u.p.last_upper = pr->u.p.ub;
#endif /* KMP_OS_WINDOWS */
        }
        if (p_last != NULL)
          *p_last = last;
        if (p_st != NULL)
          *p_st = incr;
        if (incr == 1) {
          *p_lb = start + init;
          *p_ub = start + limit;
        } else {
          *p_lb = start + init * incr;
          *p_ub = start + limit * incr;
        }

        if (pr->flags.ordered) {
          pr->u.p.ordered_lower = init;
          pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
          {
            char *buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
                                    "ordered_lower:%%%s ordered_upper:%%%s\n",
                                    traits_t<UT>::spec, traits_t<UT>::spec);
            KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
                            pr->u.p.ordered_upper));
            __kmp_str_free(&buff);
          }
#endif
        } // if
      } // if
    } else {
      pr->u.p.tc = 0;
      *p_lb = pr->u.p.lb;
      *p_ub = pr->u.p.ub;
#if KMP_OS_WINDOWS
      pr->u.p.last_upper = *p_ub;
#endif /* KMP_OS_WINDOWS */
      if (p_last != NULL)
        *p_last = TRUE;
      if (p_st != NULL)
        *p_st = pr->u.p.st;
    } // if
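    // The serialized branch above produced the chunk (or reported that no work
    // remains) entirely from this thread's private buffer; the shared dispatch
    // buffer is not consulted, so no synchronization with other threads is
    // needed here.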
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format(
          "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
          "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
          traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
      KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,
                    (p_last ? *p_last : 0), status));
      __kmp_str_free(&buff);
    }
#endif
#if INCLUDE_SSC_MARKS
    SSC_MARK_DISPATCH_NEXT();
#endif
    OMPT_LOOP_END;
    KMP_STATS_LOOP_END;
    return status;
  } else {
    kmp_int32 last = 0;
    dispatch_shared_info_template<T> volatile *sh;

    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    KMP_DEBUG_ASSERT(pr);
    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
        th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(sh);

#if KMP_USE_HIER_SCHED
    if (pr->flags.use_hier)
      status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
    else
#endif // KMP_USE_HIER_SCHED
      status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
                                                p_st, th->th.th_team_nproc,
                                                th->th.th_info.ds.ds_tid);
    // status == 0: no more iterations to execute
    if (status == 0) {
      UT num_done;

      num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
            traits_t<UT>::spec);
        KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
        __kmp_str_free(&buff);
      }
#endif

#if KMP_USE_HIER_SCHED
      pr->flags.use_hier = FALSE;
#endif
      if ((ST)num_done == th->th.th_team_nproc - 1) {
#if (KMP_STATIC_STEAL_ENABLED)
        if (pr->schedule == kmp_sch_static_steal &&
            traits_t<T>::type_size > 4) {
          int i;
          int idx = (th->th.th_dispatch->th_disp_index - 1) %
                    __kmp_dispatch_num_buffers; // current loop index
          kmp_info_t **other_threads = team->t.t_threads;
          // loop complete, safe to destroy locks used for stealing
          for (i = 0; i < th->th.th_team_nproc; ++i) {
            dispatch_private_info_template<T> *buf =
                reinterpret_cast<dispatch_private_info_template<T> *>(
                    &other_threads[i]->th.th_dispatch->th_disp_buffer[idx]);
            kmp_lock_t *lck = buf->u.p.th_steal_lock;
            KMP_ASSERT(lck != NULL);
            __kmp_destroy_lock(lck);
            __kmp_free(lck);
            buf->u.p.th_steal_lock = NULL;
          }
        }
#endif
        /* NOTE: release this buffer to be reused */

        KMP_MB(); /* Flush all pending memory write invalidates. */

        sh->u.s.num_done = 0;
        sh->u.s.iteration = 0;

        /* TODO replace with general release procedure? */
        if (pr->flags.ordered) {
          sh->u.s.ordered_iteration = 0;
        }

        KMP_MB(); /* Flush all pending memory write invalidates. */

        sh->buffer_index += __kmp_dispatch_num_buffers;
        KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
                       gtid, sh->buffer_index));

        KMP_MB(); /* Flush all pending memory write invalidates. */

      } // if
      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        }
      }

      th->th.th_dispatch->th_deo_fcn = NULL;
      th->th.th_dispatch->th_dxo_fcn = NULL;
      th->th.th_dispatch->th_dispatch_sh_current = NULL;
      th->th.th_dispatch->th_dispatch_pr_current = NULL;
    } // if (status == 0)
#if KMP_OS_WINDOWS
    else if (last) {
      pr->u.p.last_upper = pr->u.p.ub;
    }
#endif /* KMP_OS_WINDOWS */
    if (p_last != NULL && status != 0)
      *p_last = last;
  } // if

#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_next: T#%%d normal case: "
        "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
                  (p_last ? *p_last : 0), status));
    __kmp_str_free(&buff);
  }
#endif
#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_NEXT();
#endif
  OMPT_LOOP_END;
  KMP_STATS_LOOP_END;
  return status;
}

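// __kmp_dist_get_bounds() computes the per-team bounds of the distribute part
// of a composite distribute parallel for construct; the
// __kmpc_dist_dispatch_init_* entry points below call it before handing the
// per-team sub-range to the regular dispatch initialization.
// Worked example (illustrative numbers, assuming *plower starts at 0): with
// trip_count = 10, nteams = 3, incr = 1 and
// __kmp_static == kmp_sch_static_balanced, chunk = 3 and extras = 1, so team 0
// gets iterations 0-3, team 1 gets 4-6, team 2 gets 7-9, and *plastiter is set
// for the last team.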
template <typename T>
static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
                                  kmp_int32 *plastiter, T *plower, T *pupper,
                                  typename traits_t<T>::signed_t incr) {
  typedef typename traits_t<T>::unsigned_t UT;
  kmp_uint32 team_id;
  kmp_uint32 nteams;
  UT trip_count;
  kmp_team_t *team;
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
                            "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<T>::spec);
    KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
    __kmp_str_free(&buff);
  }
#endif

  if (__kmp_env_consistency_check) {
    if (incr == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
                            loc);
    }
    if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
      // The loop is illegal.
      // Some zero-trip loops are maintained by the compiler, e.g.:
      //   for(i=10;i<0;++i)  // lower >= upper - run-time check
      //   for(i=0;i>10;--i)  // lower <= upper - run-time check
      //   for(i=0;i>10;++i)  // incr > 0       - compile-time check
      //   for(i=10;i<0;--i)  // incr < 0       - compile-time check
      // Compiler does not check the following illegal loops:
      //   for(i=0;i<10;i+=incr)  // where incr<0
      //   for(i=10;i>0;i-=incr)  // where incr<0
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
    }
  }
  __kmp_assert_valid_gtid(gtid);
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
  nteams = th->th.th_teams_size.nteams;
  team_id = team->t.t_master_tid;
  KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);

  // compute global trip count
  if (incr == 1) {
    trip_count = *pupper - *plower + 1;
  } else if (incr == -1) {
    trip_count = *plower - *pupper + 1;
  } else if (incr > 0) {
    // upper-lower can exceed the limit of signed type
    trip_count = (UT)(*pupper - *plower) / incr + 1;
  } else {
    trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
  }

  if (trip_count <= nteams) {
    KMP_DEBUG_ASSERT(
        __kmp_static == kmp_sch_static_greedy ||
        __kmp_static ==
            kmp_sch_static_balanced); // Unknown static scheduling type.
    // only some teams get a single iteration, others get nothing
    if (team_id < trip_count) {
      *pupper = *plower = *plower + team_id * incr;
    } else {
      *plower = *pupper + incr; // zero-trip loop
    }
    if (plastiter != NULL)
      *plastiter = (team_id == trip_count - 1);
  } else {
    if (__kmp_static == kmp_sch_static_balanced) {
      UT chunk = trip_count / nteams;
      UT extras = trip_count % nteams;
      *plower +=
          incr * (team_id * chunk + (team_id < extras ? team_id : extras));
      *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
      if (plastiter != NULL)
        *plastiter = (team_id == nteams - 1);
    } else {
      T chunk_inc_count =
          (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
      T upper = *pupper;
      KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
      // Unknown static scheduling type.
      *plower += team_id * chunk_inc_count;
      *pupper = *plower + chunk_inc_count - incr;
      // Check/correct bounds if needed
      if (incr > 0) {
        if (*pupper < *plower)
          *pupper = traits_t<T>::max_value;
        if (plastiter != NULL)
          *plastiter = *plower <= upper && *pupper > upper - incr;
        if (*pupper > upper)
          *pupper = upper; // tracker C73258
      } else {
        if (*pupper > *plower)
          *pupper = traits_t<T>::min_value;
        if (plastiter != NULL)
          *plastiter = *plower >= upper && *pupper < upper - incr;
        if (*pupper < upper)
          *pupper = upper; // tracker C73258
      }
    }
  }
}

//-----------------------------------------------------------------------------
// Dispatch routines
// Transfer call to template< type T >
// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
//                      T lb, T ub, ST st, ST chunk )
extern "C" {

/*!
@ingroup WORK_SHARING
@{
@param loc Source location
@param gtid Global thread id
@param schedule Schedule type
@param lb Lower bound
@param ub Upper bound
@param st Step (or increment if you prefer)
@param chunk The chunk size to block with

This function prepares the runtime to start a dynamically scheduled for loop,
saving the loop arguments.
These functions are all identical apart from the types of the arguments.
*/

void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int32 lb,
                            kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint32 lb,
                             kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int64 lb,
                            kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint64 lb,
                             kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

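/* Illustrative example (not part of the documented interface): for a loop
   written as
       for (int i = 0; i < 1000; ++i) ...
   with schedule(dynamic, 8), a compiler would typically pass inclusive bounds
   and issue a call along the lines of
       __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked,
                              0, 999, 1, 8);
   and then retrieve chunks with the __kmpc_dispatch_next_4 call documented
   further below. */
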
/*!
See @ref __kmpc_dispatch_init_4

These functions differ from the __kmpc_dispatch_init set in that they are
called for the composite distribute parallel for construct, so the per-team
iteration space must be computed before the regular iterations are dispatched.

These functions are all identical apart from the types of the arguments.
*/
void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
                                 kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
                                  kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
                                 kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
                                  kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
@param loc Source code location
@param gtid Global thread id
@param p_last Pointer to a flag set to one if this is the last chunk or zero
otherwise
@param p_lb Pointer to the lower bound for the next chunk of work
@param p_ub Pointer to the upper bound for the next chunk of work
@param p_st Pointer to the stride for the next chunk of work
@return one if there is work to be done, zero otherwise

Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then the lb, ub and stride need not be modified.
*/
int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                        ,
                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}

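/* Illustrative sketch only (not the exact code a compiler emits): a loop such
   as
       #pragma omp for schedule(dynamic, 4)
       for (int i = 0; i < n; ++i) body(i);
   is typically driven by the init/next entry points above, roughly as

       kmp_int32 lb, ub, st, last;
       __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked,
                              0, n - 1, 1, 4);
       while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
         for (kmp_int32 i = lb; i <= ub; i += st)
           body(i);
       }

   where body() is a hypothetical function standing in for the loop body. */
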
/*!
See @ref __kmpc_dispatch_next_4
*/
int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint32 *p_lb, kmp_uint32 *p_ub,
                            kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                         ,
                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                        ,
                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint64 *p_lb, kmp_uint64 *p_ub,
                            kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                         ,
                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}

/*!
@param loc Source code location
@param gtid Global thread id

Mark the end of a dynamic loop.
*/
void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}
/*! @} */

//-----------------------------------------------------------------------------
// Non-template routines from kmp_dispatch.cpp used in other sources

kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value == checker;
}

kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value != checker;
}

kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
  return value < checker;
}

kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
  return value >= checker;
}

kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
  return value <= checker;
}

kmp_uint32
__kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
             kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
             void *obj // Higher-level synchronization object, or NULL.
) {
  // note: we may not belong to a team at this point
  volatile kmp_uint32 *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
  kmp_uint32 r;

  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(r = TCR_4(*spin), check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* GEH - remove this since it was accidentally introduced when kmp_wait was
       split.
       It causes problems with infinite recursion because of exit lock */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
        __kmp_abort_thread(); */
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}

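/* Usage sketch (illustrative): spin until a flag becomes 1 using the helpers
   above, e.g.
       __kmp_wait_4(&some_flag, 1, __kmp_eq_4, NULL);
   which returns the observed value once the predicate holds; `some_flag` is a
   hypothetical kmp_uint32 used only for this example. */
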
void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
                      kmp_uint32 (*pred)(void *, kmp_uint32),
                      void *obj // Higher-level synchronization object, or NULL.
) {
  // note: we may not belong to a team at this point
  void *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(void *, kmp_uint32) = pred;

  KMP_FSYNC_SPIN_INIT(obj, spin);
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* if we have waited a bit, or are oversubscribed, yield */
    /* pause is in the following code */
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
}

} // extern "C"

#ifdef KMP_GOMP_COMPAT

void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int32 lb,
                               kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint32 lb,
                                kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int64 lb,
                               kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint64 lb,
                                kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

#endif /* KMP_GOMP_COMPAT */

/* ------------------------------------------------------------------------ */