/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

/* Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
 * it may change values between parallel regions. __kmp_max_nth
 * is the largest value __kmp_nth may take, 1 is the smallest.
 */

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_USE_X87CONTROL
#include <float.h>
#endif
#include "kmp_lock.h"
#include "kmp_dispatch.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(gtid_ref);

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_root->r.r_active &&
        (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }
}

void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
      __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
    }
  }
}

// Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
                                         bool use_hier = false) {
  // Pick up the nonmonotonic/monotonic bits from the scheduling type
  // TODO: make nonmonotonic when static_steal is fixed
  int monotonicity = SCHEDULE_MONOTONIC;

  // Let default be monotonic for executables
  // compiled with OpenMP* 4.5 or less compilers
  if (loc->get_openmp_version() < 50)
    monotonicity = SCHEDULE_MONOTONIC;

  if (use_hier || __kmp_force_monotonic)
    monotonicity = SCHEDULE_MONOTONIC;
  else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
    monotonicity = SCHEDULE_NONMONOTONIC;
  else if (SCHEDULE_HAS_MONOTONIC(schedule))
    monotonicity = SCHEDULE_MONOTONIC;

  return monotonicity;
}

// Initialize a dispatch_private_info_template<T> buffer for a particular
// type of schedule, chunk. The loop description is found in lb (lower bound),
// ub (upper bound), and st (stride). nproc is the number of threads relevant
// to the scheduling (often the number of threads in a team, but not always if
// hierarchical scheduling is used). tid is the id of the thread calling
// the function within the group of nproc threads. It will have a value
// between 0 and nproc - 1.
This is often just the thread id within a team, but 100 // is not necessarily the case when using hierarchical scheduling. 101 // loc is the source file location of the corresponding loop 102 // gtid is the global thread id 103 template <typename T> 104 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, 105 dispatch_private_info_template<T> *pr, 106 enum sched_type schedule, T lb, T ub, 107 typename traits_t<T>::signed_t st, 108 #if USE_ITT_BUILD 109 kmp_uint64 *cur_chunk, 110 #endif 111 typename traits_t<T>::signed_t chunk, 112 T nproc, T tid) { 113 typedef typename traits_t<T>::unsigned_t UT; 114 typedef typename traits_t<T>::floating_t DBL; 115 116 int active; 117 T tc; 118 kmp_info_t *th; 119 kmp_team_t *team; 120 int monotonicity; 121 bool use_hier; 122 123 #ifdef KMP_DEBUG 124 typedef typename traits_t<T>::signed_t ST; 125 { 126 char *buff; 127 // create format specifiers before the debug output 128 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called " 129 "pr:%%p lb:%%%s ub:%%%s st:%%%s " 130 "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n", 131 traits_t<T>::spec, traits_t<T>::spec, 132 traits_t<ST>::spec, traits_t<ST>::spec, 133 traits_t<T>::spec, traits_t<T>::spec); 134 KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid)); 135 __kmp_str_free(&buff); 136 } 137 #endif 138 /* setup data */ 139 th = __kmp_threads[gtid]; 140 team = th->th.th_team; 141 active = !team->t.t_serialized; 142 143 #if USE_ITT_BUILD 144 int itt_need_metadata_reporting = 145 __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 146 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && 147 team->t.t_active_level == 1; 148 #endif 149 150 #if KMP_USE_HIER_SCHED 151 use_hier = pr->flags.use_hier; 152 #else 153 use_hier = false; 154 #endif 155 156 /* Pick up the nonmonotonic/monotonic bits from the scheduling type */ 157 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier); 158 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 159 160 /* Pick up the nomerge/ordered bits from the scheduling type */ 161 if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) { 162 pr->flags.nomerge = TRUE; 163 schedule = 164 (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); 165 } else { 166 pr->flags.nomerge = FALSE; 167 } 168 pr->type_size = traits_t<T>::type_size; // remember the size of variables 169 if (kmp_ord_lower & schedule) { 170 pr->flags.ordered = TRUE; 171 schedule = 172 (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); 173 } else { 174 pr->flags.ordered = FALSE; 175 } 176 // Ordered overrides nonmonotonic 177 if (pr->flags.ordered) { 178 monotonicity = SCHEDULE_MONOTONIC; 179 } 180 181 if (schedule == kmp_sch_static) { 182 schedule = __kmp_static; 183 } else { 184 if (schedule == kmp_sch_runtime) { 185 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if 186 // not specified) 187 schedule = team->t.t_sched.r_sched_type; 188 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier); 189 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 190 // Detail the schedule if needed (global controls are differentiated 191 // appropriately) 192 if (schedule == kmp_sch_guided_chunked) { 193 schedule = __kmp_guided; 194 } else if (schedule == kmp_sch_static) { 195 schedule = __kmp_static; 196 } 197 // Use the chunk size specified by OMP_SCHEDULE (or default if not 198 // specified) 199 chunk = team->t.t_sched.chunk; 200 #if USE_ITT_BUILD 201 if (cur_chunk) 202 *cur_chunk = chunk; 203 #endif 204 #ifdef 
KMP_DEBUG 205 { 206 char *buff; 207 // create format specifiers before the debug output 208 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: " 209 "schedule:%%d chunk:%%%s\n", 210 traits_t<ST>::spec); 211 KD_TRACE(10, (buff, gtid, schedule, chunk)); 212 __kmp_str_free(&buff); 213 } 214 #endif 215 } else { 216 if (schedule == kmp_sch_guided_chunked) { 217 schedule = __kmp_guided; 218 } 219 if (chunk <= 0) { 220 chunk = KMP_DEFAULT_CHUNK; 221 } 222 } 223 224 if (schedule == kmp_sch_auto) { 225 // mapping and differentiation: in the __kmp_do_serial_initialize() 226 schedule = __kmp_auto; 227 #ifdef KMP_DEBUG 228 { 229 char *buff; 230 // create format specifiers before the debug output 231 buff = __kmp_str_format( 232 "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: " 233 "schedule:%%d chunk:%%%s\n", 234 traits_t<ST>::spec); 235 KD_TRACE(10, (buff, gtid, schedule, chunk)); 236 __kmp_str_free(&buff); 237 } 238 #endif 239 } 240 #if KMP_STATIC_STEAL_ENABLED 241 // map nonmonotonic:dynamic to static steal 242 if (schedule == kmp_sch_dynamic_chunked) { 243 if (monotonicity == SCHEDULE_NONMONOTONIC) 244 schedule = kmp_sch_static_steal; 245 } 246 #endif 247 /* guided analytical not safe for too many threads */ 248 if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) { 249 schedule = kmp_sch_guided_iterative_chunked; 250 KMP_WARNING(DispatchManyThreads); 251 } 252 if (schedule == kmp_sch_runtime_simd) { 253 // compiler provides simd_width in the chunk parameter 254 schedule = team->t.t_sched.r_sched_type; 255 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier); 256 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 257 // Detail the schedule if needed (global controls are differentiated 258 // appropriately) 259 if (schedule == kmp_sch_static || schedule == kmp_sch_auto || 260 schedule == __kmp_static) { 261 schedule = kmp_sch_static_balanced_chunked; 262 } else { 263 if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) { 264 schedule = kmp_sch_guided_simd; 265 } 266 chunk = team->t.t_sched.chunk * chunk; 267 } 268 #if USE_ITT_BUILD 269 if (cur_chunk) 270 *cur_chunk = chunk; 271 #endif 272 #ifdef KMP_DEBUG 273 { 274 char *buff; 275 // create format specifiers before the debug output 276 buff = __kmp_str_format( 277 "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d" 278 " chunk:%%%s\n", 279 traits_t<ST>::spec); 280 KD_TRACE(10, (buff, gtid, schedule, chunk)); 281 __kmp_str_free(&buff); 282 } 283 #endif 284 } 285 pr->u.p.parm1 = chunk; 286 } 287 KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper), 288 "unknown scheduling type"); 289 290 pr->u.p.count = 0; 291 292 if (__kmp_env_consistency_check) { 293 if (st == 0) { 294 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, 295 (pr->flags.ordered ? 
ct_pdo_ordered : ct_pdo), loc); 296 } 297 } 298 // compute trip count 299 if (st == 1) { // most common case 300 if (ub >= lb) { 301 tc = ub - lb + 1; 302 } else { // ub < lb 303 tc = 0; // zero-trip 304 } 305 } else if (st < 0) { 306 if (lb >= ub) { 307 // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B), 308 // where the division needs to be unsigned regardless of the result type 309 tc = (UT)(lb - ub) / (-st) + 1; 310 } else { // lb < ub 311 tc = 0; // zero-trip 312 } 313 } else { // st > 0 314 if (ub >= lb) { 315 // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B), 316 // where the division needs to be unsigned regardless of the result type 317 tc = (UT)(ub - lb) / st + 1; 318 } else { // ub < lb 319 tc = 0; // zero-trip 320 } 321 } 322 323 #if KMP_STATS_ENABLED 324 if (KMP_MASTER_GTID(gtid)) { 325 KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc); 326 } 327 #endif 328 329 pr->u.p.lb = lb; 330 pr->u.p.ub = ub; 331 pr->u.p.st = st; 332 pr->u.p.tc = tc; 333 334 #if KMP_OS_WINDOWS 335 pr->u.p.last_upper = ub + st; 336 #endif /* KMP_OS_WINDOWS */ 337 338 /* NOTE: only the active parallel region(s) has active ordered sections */ 339 340 if (active) { 341 if (pr->flags.ordered) { 342 pr->ordered_bumped = 0; 343 pr->u.p.ordered_lower = 1; 344 pr->u.p.ordered_upper = 0; 345 } 346 } 347 348 switch (schedule) { 349 #if (KMP_STATIC_STEAL_ENABLED) 350 case kmp_sch_static_steal: { 351 T ntc, init; 352 353 KD_TRACE(100, 354 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n", 355 gtid)); 356 357 ntc = (tc % chunk ? 1 : 0) + tc / chunk; 358 if (nproc > 1 && ntc >= nproc) { 359 KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL); 360 T id = tid; 361 T small_chunk, extras; 362 363 small_chunk = ntc / nproc; 364 extras = ntc % nproc; 365 366 init = id * small_chunk + (id < extras ? id : extras); 367 pr->u.p.count = init; 368 pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0); 369 370 pr->u.p.parm2 = lb; 371 // parm3 is the number of times to attempt stealing which is 372 // proportional to the number of chunks per thread up until 373 // the maximum value of nproc. 374 pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc); 375 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid 376 pr->u.p.st = st; 377 if (traits_t<T>::type_size > 4) { 378 // AC: TODO: check if 16-byte CAS available and use it to 379 // improve performance (probably wait for explicit request 380 // before spending time on this). 381 // For now use dynamically allocated per-thread lock, 382 // free memory in __kmp_dispatch_next when status==0. 
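      // Illustrative worked example (hypothetical numbers, added sketch of the
      // formulas above): a loop (lb = 0, ub = 99, st = 1) has
      // tc = ub - lb + 1 = 100 iterations; with chunk = 4 that is
      // ntc = 25 chunks.  For nproc = 4 threads, small_chunk = 25 / 4 = 6 and
      // extras = 25 % 4 = 1, so the initial [count, ub) ranges of chunk
      // indices are:
      //   tid 0: [ 0,  7)   (6 chunks + 1 extra)
      //   tid 1: [ 7, 13)
      //   tid 2: [13, 19)
      //   tid 3: [19, 25)
      // A thread that exhausts its own range later steals chunk indices from
      // the tail of a victim's [count, ub) range in
      // __kmp_dispatch_next_algorithm().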
383 KMP_DEBUG_ASSERT(pr->u.p.th_steal_lock == NULL); 384 pr->u.p.th_steal_lock = 385 (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t)); 386 __kmp_init_lock(pr->u.p.th_steal_lock); 387 } 388 break; 389 } else { 390 /* too few chunks: switching to kmp_sch_dynamic_chunked */ 391 schedule = kmp_sch_dynamic_chunked; 392 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to " 393 "kmp_sch_dynamic_chunked\n", 394 gtid)); 395 if (pr->u.p.parm1 <= 0) 396 pr->u.p.parm1 = KMP_DEFAULT_CHUNK; 397 break; 398 } // if 399 } // case 400 #endif 401 case kmp_sch_static_balanced: { 402 T init, limit; 403 404 KD_TRACE( 405 100, 406 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n", 407 gtid)); 408 409 if (nproc > 1) { 410 T id = tid; 411 412 if (tc < nproc) { 413 if (id < tc) { 414 init = id; 415 limit = id; 416 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ 417 } else { 418 pr->u.p.count = 1; /* means no more chunks to execute */ 419 pr->u.p.parm1 = FALSE; 420 break; 421 } 422 } else { 423 T small_chunk = tc / nproc; 424 T extras = tc % nproc; 425 init = id * small_chunk + (id < extras ? id : extras); 426 limit = init + small_chunk - (id < extras ? 0 : 1); 427 pr->u.p.parm1 = (id == nproc - 1); 428 } 429 } else { 430 if (tc > 0) { 431 init = 0; 432 limit = tc - 1; 433 pr->u.p.parm1 = TRUE; 434 } else { 435 // zero trip count 436 pr->u.p.count = 1; /* means no more chunks to execute */ 437 pr->u.p.parm1 = FALSE; 438 break; 439 } 440 } 441 #if USE_ITT_BUILD 442 // Calculate chunk for metadata report 443 if (itt_need_metadata_reporting) 444 if (cur_chunk) 445 *cur_chunk = limit - init + 1; 446 #endif 447 if (st == 1) { 448 pr->u.p.lb = lb + init; 449 pr->u.p.ub = lb + limit; 450 } else { 451 // calculated upper bound, "ub" is user-defined upper bound 452 T ub_tmp = lb + limit * st; 453 pr->u.p.lb = lb + init * st; 454 // adjust upper bound to "ub" if needed, so that MS lastprivate will match 455 // it exactly 456 if (st > 0) { 457 pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp); 458 } else { 459 pr->u.p.ub = (ub_tmp + st < ub ? 
ub : ub_tmp); 460 } 461 } 462 if (pr->flags.ordered) { 463 pr->u.p.ordered_lower = init; 464 pr->u.p.ordered_upper = limit; 465 } 466 break; 467 } // case 468 case kmp_sch_static_balanced_chunked: { 469 // similar to balanced, but chunk adjusted to multiple of simd width 470 T nth = nproc; 471 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)" 472 " -> falling-through to static_greedy\n", 473 gtid)); 474 schedule = kmp_sch_static_greedy; 475 if (nth > 1) 476 pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1); 477 else 478 pr->u.p.parm1 = tc; 479 break; 480 } // case 481 case kmp_sch_guided_simd: 482 case kmp_sch_guided_iterative_chunked: { 483 KD_TRACE( 484 100, 485 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked" 486 " case\n", 487 gtid)); 488 489 if (nproc > 1) { 490 if ((2L * chunk + 1) * nproc >= tc) { 491 /* chunk size too large, switch to dynamic */ 492 schedule = kmp_sch_dynamic_chunked; 493 } else { 494 // when remaining iters become less than parm2 - switch to dynamic 495 pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1); 496 *(double *)&pr->u.p.parm3 = 497 guided_flt_param / (double)nproc; // may occupy parm3 and parm4 498 } 499 } else { 500 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 501 "kmp_sch_static_greedy\n", 502 gtid)); 503 schedule = kmp_sch_static_greedy; 504 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 505 KD_TRACE( 506 100, 507 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n", 508 gtid)); 509 pr->u.p.parm1 = tc; 510 } // if 511 } // case 512 break; 513 case kmp_sch_guided_analytical_chunked: { 514 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d " 515 "kmp_sch_guided_analytical_chunked case\n", 516 gtid)); 517 518 if (nproc > 1) { 519 if ((2L * chunk + 1) * nproc >= tc) { 520 /* chunk size too large, switch to dynamic */ 521 schedule = kmp_sch_dynamic_chunked; 522 } else { 523 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 524 DBL x; 525 526 #if KMP_USE_X87CONTROL 527 /* Linux* OS already has 64-bit computation by default for long double, 528 and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On 529 Windows* OS on IA-32 architecture, we need to set precision to 64-bit 530 instead of the default 53-bit. Even though long double doesn't work 531 on Windows* OS on Intel(R) 64, the resulting lack of precision is not 532 expected to impact the correctness of the algorithm, but this has not 533 been mathematically proven. 
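             Presumably the 64-bit precision matters here because the solver
             below works with x = 1 - 1/(2 * nproc), a value very close to 1
             for large thread counts, and raises it to large powers via
             __kmp_pow, so extra mantissa bits reduce the rounding error that
             accumulates in the computed crossover point.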
*/ 534 // save original FPCW and set precision to 64-bit, as 535 // Windows* OS on IA-32 architecture defaults to 53-bit 536 unsigned int oldFpcw = _control87(0, 0); 537 _control87(_PC_64, _MCW_PC); // 0,0x30000 538 #endif 539 /* value used for comparison in solver for cross-over point */ 540 long double target = ((long double)chunk * 2 + 1) * nproc / tc; 541 542 /* crossover point--chunk indexes equal to or greater than 543 this point switch to dynamic-style scheduling */ 544 UT cross; 545 546 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 547 x = 1.0 - 0.5 / (double)nproc; 548 549 #ifdef KMP_DEBUG 550 { // test natural alignment 551 struct _test_a { 552 char a; 553 union { 554 char b; 555 DBL d; 556 }; 557 } t; 558 ptrdiff_t natural_alignment = 559 (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; 560 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long 561 // long)natural_alignment ); 562 KMP_DEBUG_ASSERT( 563 (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0); 564 } 565 #endif // KMP_DEBUG 566 567 /* save the term in thread private dispatch structure */ 568 *(DBL *)&pr->u.p.parm3 = x; 569 570 /* solve for the crossover point to the nearest integer i for which C_i 571 <= chunk */ 572 { 573 UT left, right, mid; 574 long double p; 575 576 /* estimate initial upper and lower bound */ 577 578 /* doesn't matter what value right is as long as it is positive, but 579 it affects performance of the solver */ 580 right = 229; 581 p = __kmp_pow<UT>(x, right); 582 if (p > target) { 583 do { 584 p *= p; 585 right <<= 1; 586 } while (p > target && right < (1 << 27)); 587 /* lower bound is previous (failed) estimate of upper bound */ 588 left = right >> 1; 589 } else { 590 left = 0; 591 } 592 593 /* bisection root-finding method */ 594 while (left + 1 < right) { 595 mid = (left + right) / 2; 596 if (__kmp_pow<UT>(x, mid) > target) { 597 left = mid; 598 } else { 599 right = mid; 600 } 601 } // while 602 cross = right; 603 } 604 /* assert sanity of computed crossover point */ 605 KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target && 606 __kmp_pow<UT>(x, cross) <= target); 607 608 /* save the crossover point in thread private dispatch structure */ 609 pr->u.p.parm2 = cross; 610 611 // C75803 612 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8)) 613 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3) 614 #else 615 #define GUIDED_ANALYTICAL_WORKAROUND (x) 616 #endif 617 /* dynamic-style scheduling offset */ 618 pr->u.p.count = tc - 619 __kmp_dispatch_guided_remaining( 620 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - 621 cross * chunk; 622 #if KMP_USE_X87CONTROL 623 // restore FPCW 624 _control87(oldFpcw, _MCW_PC); 625 #endif 626 } // if 627 } else { 628 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " 629 "kmp_sch_static_greedy\n", 630 gtid)); 631 schedule = kmp_sch_static_greedy; 632 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 633 pr->u.p.parm1 = tc; 634 } // if 635 } // case 636 break; 637 case kmp_sch_static_greedy: 638 KD_TRACE( 639 100, 640 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n", 641 gtid)); 642 pr->u.p.parm1 = (nproc > 1) ? 
(tc + nproc - 1) / nproc : tc; 643 break; 644 case kmp_sch_static_chunked: 645 case kmp_sch_dynamic_chunked: 646 if (pr->u.p.parm1 <= 0) { 647 pr->u.p.parm1 = KMP_DEFAULT_CHUNK; 648 } 649 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d " 650 "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", 651 gtid)); 652 break; 653 case kmp_sch_trapezoidal: { 654 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ 655 656 T parm1, parm2, parm3, parm4; 657 KD_TRACE(100, 658 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n", 659 gtid)); 660 661 parm1 = chunk; 662 663 /* F : size of the first cycle */ 664 parm2 = (tc / (2 * nproc)); 665 666 if (parm2 < 1) { 667 parm2 = 1; 668 } 669 670 /* L : size of the last cycle. Make sure the last cycle is not larger 671 than the first cycle. */ 672 if (parm1 < 1) { 673 parm1 = 1; 674 } else if (parm1 > parm2) { 675 parm1 = parm2; 676 } 677 678 /* N : number of cycles */ 679 parm3 = (parm2 + parm1); 680 parm3 = (2 * tc + parm3 - 1) / parm3; 681 682 if (parm3 < 2) { 683 parm3 = 2; 684 } 685 686 /* sigma : decreasing incr of the trapezoid */ 687 parm4 = (parm3 - 1); 688 parm4 = (parm2 - parm1) / parm4; 689 690 // pointless check, because parm4 >= 0 always 691 // if ( parm4 < 0 ) { 692 // parm4 = 0; 693 //} 694 695 pr->u.p.parm1 = parm1; 696 pr->u.p.parm2 = parm2; 697 pr->u.p.parm3 = parm3; 698 pr->u.p.parm4 = parm4; 699 } // case 700 break; 701 702 default: { 703 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 704 KMP_HNT(GetNewerLibrary), // Hint 705 __kmp_msg_null // Variadic argument list terminator 706 ); 707 } break; 708 } // switch 709 pr->schedule = schedule; 710 } 711 712 #if KMP_USE_HIER_SCHED 713 template <typename T> 714 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub, 715 typename traits_t<T>::signed_t st); 716 template <> 717 inline void 718 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb, 719 kmp_int32 ub, kmp_int32 st) { 720 __kmp_dispatch_init_hierarchy<kmp_int32>( 721 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 722 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st); 723 } 724 template <> 725 inline void 726 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb, 727 kmp_uint32 ub, kmp_int32 st) { 728 __kmp_dispatch_init_hierarchy<kmp_uint32>( 729 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 730 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st); 731 } 732 template <> 733 inline void 734 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb, 735 kmp_int64 ub, kmp_int64 st) { 736 __kmp_dispatch_init_hierarchy<kmp_int64>( 737 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 738 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st); 739 } 740 template <> 741 inline void 742 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb, 743 kmp_uint64 ub, kmp_int64 st) { 744 __kmp_dispatch_init_hierarchy<kmp_uint64>( 745 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, 746 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st); 747 } 748 749 // free all the hierarchy scheduling memory associated with the team 750 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) { 751 int num_disp_buff = team->t.t_max_nproc > 1 ? 
__kmp_dispatch_num_buffers : 2; 752 for (int i = 0; i < num_disp_buff; ++i) { 753 // type does not matter here so use kmp_int32 754 auto sh = 755 reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>( 756 &team->t.t_disp_buffer[i]); 757 if (sh->hier) { 758 sh->hier->deallocate(); 759 __kmp_free(sh->hier); 760 } 761 } 762 } 763 #endif 764 765 // UT - unsigned flavor of T, ST - signed flavor of T, 766 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8 767 template <typename T> 768 static void 769 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb, 770 T ub, typename traits_t<T>::signed_t st, 771 typename traits_t<T>::signed_t chunk, int push_ws) { 772 typedef typename traits_t<T>::unsigned_t UT; 773 774 int active; 775 kmp_info_t *th; 776 kmp_team_t *team; 777 kmp_uint32 my_buffer_index; 778 dispatch_private_info_template<T> *pr; 779 dispatch_shared_info_template<T> volatile *sh; 780 781 KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) == 782 sizeof(dispatch_private_info)); 783 KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) == 784 sizeof(dispatch_shared_info)); 785 __kmp_assert_valid_gtid(gtid); 786 787 if (!TCR_4(__kmp_init_parallel)) 788 __kmp_parallel_initialize(); 789 790 __kmp_resume_if_soft_paused(); 791 792 #if INCLUDE_SSC_MARKS 793 SSC_MARK_DISPATCH_INIT(); 794 #endif 795 #ifdef KMP_DEBUG 796 typedef typename traits_t<T>::signed_t ST; 797 { 798 char *buff; 799 // create format specifiers before the debug output 800 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d " 801 "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n", 802 traits_t<ST>::spec, traits_t<T>::spec, 803 traits_t<T>::spec, traits_t<ST>::spec); 804 KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st)); 805 __kmp_str_free(&buff); 806 } 807 #endif 808 /* setup data */ 809 th = __kmp_threads[gtid]; 810 team = th->th.th_team; 811 active = !team->t.t_serialized; 812 th->th.th_ident = loc; 813 814 // Any half-decent optimizer will remove this test when the blocks are empty 815 // since the macros expand to nothing 816 // when statistics are disabled. 817 if (schedule == __kmp_static) { 818 KMP_COUNT_BLOCK(OMP_LOOP_STATIC); 819 } else { 820 KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC); 821 } 822 823 #if KMP_USE_HIER_SCHED 824 // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable 825 // Hierarchical scheduling does not work with ordered, so if ordered is 826 // detected, then revert back to threaded scheduling. 827 bool ordered; 828 enum sched_type my_sched = schedule; 829 my_buffer_index = th->th.th_dispatch->th_disp_index; 830 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 831 &th->th.th_dispatch 832 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 833 my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched); 834 if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper)) 835 my_sched = 836 (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower)); 837 ordered = (kmp_ord_lower & my_sched); 838 if (pr->flags.use_hier) { 839 if (ordered) { 840 KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. 
" 841 "Disabling hierarchical scheduling.\n", 842 gtid)); 843 pr->flags.use_hier = FALSE; 844 } 845 } 846 if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) { 847 // Don't use hierarchical for ordered parallel loops and don't 848 // use the runtime hierarchy if one was specified in the program 849 if (!ordered && !pr->flags.use_hier) 850 __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st); 851 } 852 #endif // KMP_USE_HIER_SCHED 853 854 #if USE_ITT_BUILD 855 kmp_uint64 cur_chunk = chunk; 856 int itt_need_metadata_reporting = 857 __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 858 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && 859 team->t.t_active_level == 1; 860 #endif 861 if (!active) { 862 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 863 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 864 } else { 865 KMP_DEBUG_ASSERT(th->th.th_dispatch == 866 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 867 868 my_buffer_index = th->th.th_dispatch->th_disp_index++; 869 870 /* What happens when number of threads changes, need to resize buffer? */ 871 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 872 &th->th.th_dispatch 873 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 874 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 875 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 876 KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid, 877 my_buffer_index)); 878 } 879 880 __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st, 881 #if USE_ITT_BUILD 882 &cur_chunk, 883 #endif 884 chunk, (T)th->th.th_team_nproc, 885 (T)th->th.th_info.ds.ds_tid); 886 if (active) { 887 if (pr->flags.ordered == 0) { 888 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error; 889 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error; 890 } else { 891 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>; 892 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>; 893 } 894 } 895 896 if (active) { 897 /* The name of this buffer should be my_buffer_index when it's free to use 898 * it */ 899 900 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d " 901 "sh->buffer_index:%d\n", 902 gtid, my_buffer_index, sh->buffer_index)); 903 __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index, 904 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL)); 905 // Note: KMP_WAIT() cannot be used there: buffer index and 906 // my_buffer_index are *always* 32-bit integers. 907 KMP_MB(); /* is this necessary? 
*/ 908 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d " 909 "sh->buffer_index:%d\n", 910 gtid, my_buffer_index, sh->buffer_index)); 911 912 th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr; 913 th->th.th_dispatch->th_dispatch_sh_current = 914 CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh); 915 #if USE_ITT_BUILD 916 if (pr->flags.ordered) { 917 __kmp_itt_ordered_init(gtid); 918 } 919 // Report loop metadata 920 if (itt_need_metadata_reporting) { 921 // Only report metadata by master of active team at level 1 922 kmp_uint64 schedtype = 0; 923 switch (schedule) { 924 case kmp_sch_static_chunked: 925 case kmp_sch_static_balanced: // Chunk is calculated in the switch above 926 break; 927 case kmp_sch_static_greedy: 928 cur_chunk = pr->u.p.parm1; 929 break; 930 case kmp_sch_dynamic_chunked: 931 schedtype = 1; 932 break; 933 case kmp_sch_guided_iterative_chunked: 934 case kmp_sch_guided_analytical_chunked: 935 case kmp_sch_guided_simd: 936 schedtype = 2; 937 break; 938 default: 939 // Should we put this case under "static"? 940 // case kmp_sch_static_steal: 941 schedtype = 3; 942 break; 943 } 944 __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk); 945 } 946 #if KMP_USE_HIER_SCHED 947 if (pr->flags.use_hier) { 948 pr->u.p.count = 0; 949 pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0; 950 } 951 #endif // KMP_USER_HIER_SCHED 952 #endif /* USE_ITT_BUILD */ 953 } 954 955 #ifdef KMP_DEBUG 956 { 957 char *buff; 958 // create format specifiers before the debug output 959 buff = __kmp_str_format( 960 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s " 961 "lb:%%%s ub:%%%s" 962 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" 963 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n", 964 traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec, 965 traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec, 966 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec, 967 traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec); 968 KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb, 969 pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count, 970 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1, 971 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4)); 972 __kmp_str_free(&buff); 973 } 974 #endif 975 #if (KMP_STATIC_STEAL_ENABLED) 976 // It cannot be guaranteed that after execution of a loop with some other 977 // schedule kind all the parm3 variables will contain the same value. Even if 978 // all parm3 will be the same, it still exists a bad case like using 0 and 1 979 // rather than program life-time increment. So the dedicated variable is 980 // required. The 'static_steal_counter' is used. 981 if (pr->schedule == kmp_sch_static_steal) { 982 // Other threads will inspect this variable when searching for a victim. 983 // This is a flag showing that other threads may steal from this thread 984 // since then. 
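    // Illustrative example (added note): if this is the third static_steal
    // loop that has used this dispatch buffer, static_steal_counter becomes 3
    // here.  In __kmp_dispatch_next_algorithm() a thief only treats a buffer
    // as a valid victim when the victim's static_steal_counter matches its
    // own loop id, which filters out buffers left over from earlier loops.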
985 volatile T *p = &pr->u.p.static_steal_counter; 986 *p = *p + 1; 987 } 988 #endif // ( KMP_STATIC_STEAL_ENABLED ) 989 990 #if OMPT_SUPPORT && OMPT_OPTIONAL 991 if (ompt_enabled.ompt_callback_work) { 992 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 993 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 994 ompt_callbacks.ompt_callback(ompt_callback_work)( 995 ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data), 996 &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid)); 997 } 998 #endif 999 KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic); 1000 } 1001 1002 /* For ordered loops, either __kmp_dispatch_finish() should be called after 1003 * every iteration, or __kmp_dispatch_finish_chunk() should be called after 1004 * every chunk of iterations. If the ordered section(s) were not executed 1005 * for this iteration (or every iteration in this chunk), we need to set the 1006 * ordered iteration counters so that the next thread can proceed. */ 1007 template <typename UT> 1008 static void __kmp_dispatch_finish(int gtid, ident_t *loc) { 1009 typedef typename traits_t<UT>::signed_t ST; 1010 __kmp_assert_valid_gtid(gtid); 1011 kmp_info_t *th = __kmp_threads[gtid]; 1012 1013 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid)); 1014 if (!th->th.th_team->t.t_serialized) { 1015 1016 dispatch_private_info_template<UT> *pr = 1017 reinterpret_cast<dispatch_private_info_template<UT> *>( 1018 th->th.th_dispatch->th_dispatch_pr_current); 1019 dispatch_shared_info_template<UT> volatile *sh = 1020 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 1021 th->th.th_dispatch->th_dispatch_sh_current); 1022 KMP_DEBUG_ASSERT(pr); 1023 KMP_DEBUG_ASSERT(sh); 1024 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1025 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1026 1027 if (pr->ordered_bumped) { 1028 KD_TRACE( 1029 1000, 1030 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1031 gtid)); 1032 pr->ordered_bumped = 0; 1033 } else { 1034 UT lower = pr->u.p.ordered_lower; 1035 1036 #ifdef KMP_DEBUG 1037 { 1038 char *buff; 1039 // create format specifiers before the debug output 1040 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: " 1041 "ordered_iteration:%%%s lower:%%%s\n", 1042 traits_t<UT>::spec, traits_t<UT>::spec); 1043 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 1044 __kmp_str_free(&buff); 1045 } 1046 #endif 1047 1048 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower, 1049 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 1050 KMP_MB(); /* is this necessary? 
*/ 1051 #ifdef KMP_DEBUG 1052 { 1053 char *buff; 1054 // create format specifiers before the debug output 1055 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: " 1056 "ordered_iteration:%%%s lower:%%%s\n", 1057 traits_t<UT>::spec, traits_t<UT>::spec); 1058 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 1059 __kmp_str_free(&buff); 1060 } 1061 #endif 1062 1063 test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration); 1064 } // if 1065 } // if 1066 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid)); 1067 } 1068 1069 #ifdef KMP_GOMP_COMPAT 1070 1071 template <typename UT> 1072 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) { 1073 typedef typename traits_t<UT>::signed_t ST; 1074 __kmp_assert_valid_gtid(gtid); 1075 kmp_info_t *th = __kmp_threads[gtid]; 1076 1077 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid)); 1078 if (!th->th.th_team->t.t_serialized) { 1079 // int cid; 1080 dispatch_private_info_template<UT> *pr = 1081 reinterpret_cast<dispatch_private_info_template<UT> *>( 1082 th->th.th_dispatch->th_dispatch_pr_current); 1083 dispatch_shared_info_template<UT> volatile *sh = 1084 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 1085 th->th.th_dispatch->th_dispatch_sh_current); 1086 KMP_DEBUG_ASSERT(pr); 1087 KMP_DEBUG_ASSERT(sh); 1088 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1089 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1090 1091 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) { 1092 UT lower = pr->u.p.ordered_lower; 1093 UT upper = pr->u.p.ordered_upper; 1094 UT inc = upper - lower + 1; 1095 1096 if (pr->ordered_bumped == inc) { 1097 KD_TRACE( 1098 1000, 1099 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1100 gtid)); 1101 pr->ordered_bumped = 0; 1102 } else { 1103 inc -= pr->ordered_bumped; 1104 1105 #ifdef KMP_DEBUG 1106 { 1107 char *buff; 1108 // create format specifiers before the debug output 1109 buff = __kmp_str_format( 1110 "__kmp_dispatch_finish_chunk: T#%%d before wait: " 1111 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n", 1112 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec); 1113 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper)); 1114 __kmp_str_free(&buff); 1115 } 1116 #endif 1117 1118 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower, 1119 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 1120 1121 KMP_MB(); /* is this necessary? */ 1122 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting " 1123 "ordered_bumped to zero\n", 1124 gtid)); 1125 pr->ordered_bumped = 0; 1126 //!!!!! TODO check if the inc should be unsigned, or signed??? 
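      // Worked example (hypothetical numbers, added sketch): suppose this
      // chunk covers ordered iterations lower = 8 .. upper = 11, so inc = 4
      // initially.  If the thread actually executed the ordered region for 1
      // of those iterations, ordered_bumped == 1 and inc was reduced to 3
      // above; after waiting for sh->u.s.ordered_iteration to reach 8, the
      // remaining 3 "skipped" iterations are added below so the next thread
      // in the ordered sequence is not blocked.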
1127 #ifdef KMP_DEBUG 1128 { 1129 char *buff; 1130 // create format specifiers before the debug output 1131 buff = __kmp_str_format( 1132 "__kmp_dispatch_finish_chunk: T#%%d after wait: " 1133 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n", 1134 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec, 1135 traits_t<UT>::spec); 1136 KD_TRACE(1000, 1137 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper)); 1138 __kmp_str_free(&buff); 1139 } 1140 #endif 1141 1142 test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc); 1143 } 1144 // } 1145 } 1146 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid)); 1147 } 1148 1149 #endif /* KMP_GOMP_COMPAT */ 1150 1151 template <typename T> 1152 int __kmp_dispatch_next_algorithm(int gtid, 1153 dispatch_private_info_template<T> *pr, 1154 dispatch_shared_info_template<T> volatile *sh, 1155 kmp_int32 *p_last, T *p_lb, T *p_ub, 1156 typename traits_t<T>::signed_t *p_st, T nproc, 1157 T tid) { 1158 typedef typename traits_t<T>::unsigned_t UT; 1159 typedef typename traits_t<T>::signed_t ST; 1160 typedef typename traits_t<T>::floating_t DBL; 1161 int status = 0; 1162 bool last = false; 1163 T start; 1164 ST incr; 1165 UT limit, trip, init; 1166 kmp_info_t *th = __kmp_threads[gtid]; 1167 kmp_team_t *team = th->th.th_team; 1168 1169 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1170 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1171 KMP_DEBUG_ASSERT(pr); 1172 KMP_DEBUG_ASSERT(sh); 1173 KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc); 1174 #ifdef KMP_DEBUG 1175 { 1176 char *buff; 1177 // create format specifiers before the debug output 1178 buff = 1179 __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p " 1180 "sh:%%p nproc:%%%s tid:%%%s\n", 1181 traits_t<T>::spec, traits_t<T>::spec); 1182 KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid)); 1183 __kmp_str_free(&buff); 1184 } 1185 #endif 1186 1187 // zero trip count 1188 if (pr->u.p.tc == 0) { 1189 KD_TRACE(10, 1190 ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is " 1191 "zero status:%d\n", 1192 gtid, status)); 1193 return 0; 1194 } 1195 1196 switch (pr->schedule) { 1197 #if (KMP_STATIC_STEAL_ENABLED) 1198 case kmp_sch_static_steal: { 1199 T chunk = pr->u.p.parm1; 1200 1201 KD_TRACE(100, 1202 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n", 1203 gtid)); 1204 1205 trip = pr->u.p.tc - 1; 1206 1207 if (traits_t<T>::type_size > 4) { 1208 // use lock for 8-byte and CAS for 4-byte induction 1209 // variable. 
TODO (optional): check and use 16-byte CAS 1210 kmp_lock_t *lck = pr->u.p.th_steal_lock; 1211 KMP_DEBUG_ASSERT(lck != NULL); 1212 if (pr->u.p.count < (UT)pr->u.p.ub) { 1213 __kmp_acquire_lock(lck, gtid); 1214 // try to get own chunk of iterations 1215 init = (pr->u.p.count)++; 1216 status = (init < (UT)pr->u.p.ub); 1217 __kmp_release_lock(lck, gtid); 1218 } else { 1219 status = 0; // no own chunks 1220 } 1221 if (!status) { // try to steal 1222 kmp_info_t **other_threads = team->t.t_threads; 1223 T while_limit = pr->u.p.parm3; 1224 T while_index = 0; 1225 T id = pr->u.p.static_steal_counter; // loop id 1226 int idx = (th->th.th_dispatch->th_disp_index - 1) % 1227 __kmp_dispatch_num_buffers; // current loop index 1228 // note: victim thread can potentially execute another loop 1229 // TODO: algorithm of searching for a victim 1230 // should be cleaned up and measured 1231 while ((!status) && (while_limit != ++while_index)) { 1232 dispatch_private_info_template<T> *victim; 1233 T remaining; 1234 T victimIdx = pr->u.p.parm4; 1235 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1; 1236 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1237 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); 1238 KMP_DEBUG_ASSERT(victim); 1239 while ((victim == pr || id != victim->u.p.static_steal_counter) && 1240 oldVictimIdx != victimIdx) { 1241 victimIdx = (victimIdx + 1) % nproc; 1242 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1243 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); 1244 KMP_DEBUG_ASSERT(victim); 1245 } 1246 if (victim == pr || id != victim->u.p.static_steal_counter) { 1247 continue; // try once more (nproc attempts in total) 1248 // no victim is ready yet to participate in stealing 1249 // because no victim passed kmp_init_dispatch yet 1250 } 1251 if (victim->u.p.count + 2 > (UT)victim->u.p.ub) { 1252 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid 1253 continue; // not enough chunks to steal, goto next victim 1254 } 1255 1256 lck = victim->u.p.th_steal_lock; 1257 KMP_ASSERT(lck != NULL); 1258 __kmp_acquire_lock(lck, gtid); 1259 limit = victim->u.p.ub; // keep initial ub 1260 if (victim->u.p.count >= limit || 1261 (remaining = limit - victim->u.p.count) < 2) { 1262 __kmp_release_lock(lck, gtid); 1263 pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim 1264 continue; // not enough chunks to steal 1265 } 1266 // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or 1267 // by 1 1268 if (remaining > 3) { 1269 // steal 1/4 of remaining 1270 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2); 1271 init = (victim->u.p.ub -= (remaining >> 2)); 1272 } else { 1273 // steal 1 chunk of 2 or 3 remaining 1274 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1); 1275 init = (victim->u.p.ub -= 1); 1276 } 1277 __kmp_release_lock(lck, gtid); 1278 1279 KMP_DEBUG_ASSERT(init + 1 <= limit); 1280 pr->u.p.parm4 = victimIdx; // remember victim to steal from 1281 status = 1; 1282 while_index = 0; 1283 // now update own count and ub with stolen range but init chunk 1284 __kmp_acquire_lock(pr->u.p.th_steal_lock, gtid); 1285 pr->u.p.count = init + 1; 1286 pr->u.p.ub = limit; 1287 __kmp_release_lock(pr->u.p.th_steal_lock, gtid); 1288 } // while (search for victim) 1289 } // if (try to find victim and steal) 1290 } else { 1291 // 4-byte induction variable, use 8-byte CAS for pair (count, ub) 1292 typedef union { 1293 struct { 1294 UT count; 1295 T ub; 1296 } p; 1297 kmp_int64 b; 1298 } union_i4; 1299 // All 
operations on 'count' or 'ub' must be combined atomically 1300 // together. 1301 { 1302 union_i4 vold, vnew; 1303 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 1304 vnew = vold; 1305 vnew.p.count++; 1306 while (!KMP_COMPARE_AND_STORE_ACQ64( 1307 (volatile kmp_int64 *)&pr->u.p.count, 1308 *VOLATILE_CAST(kmp_int64 *) & vold.b, 1309 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 1310 KMP_CPU_PAUSE(); 1311 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 1312 vnew = vold; 1313 vnew.p.count++; 1314 } 1315 vnew = vold; 1316 init = vnew.p.count; 1317 status = (init < (UT)vnew.p.ub); 1318 } 1319 1320 if (!status) { 1321 kmp_info_t **other_threads = team->t.t_threads; 1322 T while_limit = pr->u.p.parm3; 1323 T while_index = 0; 1324 T id = pr->u.p.static_steal_counter; // loop id 1325 int idx = (th->th.th_dispatch->th_disp_index - 1) % 1326 __kmp_dispatch_num_buffers; // current loop index 1327 // note: victim thread can potentially execute another loop 1328 // TODO: algorithm of searching for a victim 1329 // should be cleaned up and measured 1330 while ((!status) && (while_limit != ++while_index)) { 1331 dispatch_private_info_template<T> *victim; 1332 union_i4 vold, vnew; 1333 T remaining; 1334 T victimIdx = pr->u.p.parm4; 1335 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1; 1336 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1337 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); 1338 KMP_DEBUG_ASSERT(victim); 1339 while ((victim == pr || id != victim->u.p.static_steal_counter) && 1340 oldVictimIdx != victimIdx) { 1341 victimIdx = (victimIdx + 1) % nproc; 1342 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1343 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); 1344 KMP_DEBUG_ASSERT(victim); 1345 } 1346 if (victim == pr || id != victim->u.p.static_steal_counter) { 1347 continue; // try once more (nproc attempts in total) 1348 // no victim is ready yet to participate in stealing 1349 // because no victim passed kmp_init_dispatch yet 1350 } 1351 pr->u.p.parm4 = victimIdx; // new victim found 1352 while (1) { // CAS loop if victim has enough chunks to steal 1353 vold.b = *(volatile kmp_int64 *)(&victim->u.p.count); 1354 vnew = vold; 1355 1356 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1357 if (vnew.p.count >= (UT)vnew.p.ub || 1358 (remaining = vnew.p.ub - vnew.p.count) < 2) { 1359 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id 1360 break; // not enough chunks to steal, goto next victim 1361 } 1362 if (remaining > 3) { 1363 // try to steal 1/4 of remaining 1364 vnew.p.ub -= remaining >> 2; 1365 } else { 1366 vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining 1367 } 1368 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1369 // TODO: Should this be acquire or release? 
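          // Illustrative note (added): because 'count' and 'ub' share one
          // 64-bit word via union_i4, the owner's count increment and a
          // thief's ub decrement are both full-word CAS operations on the
          // same location.  For example, from a snapshot {count = 5, ub = 20}
          // (15 unclaimed chunks) the owner would publish {6, 20} while this
          // thief tries to publish {5, 17} (stealing 15 >> 2 = 3 chunks); at
          // most one of the two CASes can succeed against that snapshot, so
          // no chunk is lost or handed out twice.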
1370 if (KMP_COMPARE_AND_STORE_ACQ64( 1371 (volatile kmp_int64 *)&victim->u.p.count, 1372 *VOLATILE_CAST(kmp_int64 *) & vold.b, 1373 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 1374 // stealing succeeded 1375 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1376 vold.p.ub - vnew.p.ub); 1377 status = 1; 1378 while_index = 0; 1379 // now update own count and ub 1380 init = vnew.p.ub; 1381 vold.p.count = init + 1; 1382 #if KMP_ARCH_X86 1383 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b); 1384 #else 1385 *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b; 1386 #endif 1387 break; 1388 } // if (check CAS result) 1389 KMP_CPU_PAUSE(); // CAS failed, repeatedly attempt 1390 } // while (try to steal from particular victim) 1391 } // while (search for victim) 1392 } // if (try to find victim and steal) 1393 } // if (4-byte induction variable) 1394 if (!status) { 1395 *p_lb = 0; 1396 *p_ub = 0; 1397 if (p_st != NULL) 1398 *p_st = 0; 1399 } else { 1400 start = pr->u.p.parm2; 1401 init *= chunk; 1402 limit = chunk + init - 1; 1403 incr = pr->u.p.st; 1404 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1); 1405 1406 KMP_DEBUG_ASSERT(init <= trip); 1407 if ((last = (limit >= trip)) != 0) 1408 limit = trip; 1409 if (p_st != NULL) 1410 *p_st = incr; 1411 1412 if (incr == 1) { 1413 *p_lb = start + init; 1414 *p_ub = start + limit; 1415 } else { 1416 *p_lb = start + init * incr; 1417 *p_ub = start + limit * incr; 1418 } 1419 1420 if (pr->flags.ordered) { 1421 pr->u.p.ordered_lower = init; 1422 pr->u.p.ordered_upper = limit; 1423 } // if 1424 } // if 1425 break; 1426 } // case 1427 #endif // ( KMP_STATIC_STEAL_ENABLED ) 1428 case kmp_sch_static_balanced: { 1429 KD_TRACE( 1430 10, 1431 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n", 1432 gtid)); 1433 /* check if thread has any iteration to do */ 1434 if ((status = !pr->u.p.count) != 0) { 1435 pr->u.p.count = 1; 1436 *p_lb = pr->u.p.lb; 1437 *p_ub = pr->u.p.ub; 1438 last = (pr->u.p.parm1 != 0); 1439 if (p_st != NULL) 1440 *p_st = pr->u.p.st; 1441 } else { /* no iterations to do */ 1442 pr->u.p.lb = pr->u.p.ub + pr->u.p.st; 1443 } 1444 } // case 1445 break; 1446 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was 1447 merged here */ 1448 case kmp_sch_static_chunked: { 1449 T parm1; 1450 1451 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d " 1452 "kmp_sch_static_[affinity|chunked] case\n", 1453 gtid)); 1454 parm1 = pr->u.p.parm1; 1455 1456 trip = pr->u.p.tc - 1; 1457 init = parm1 * (pr->u.p.count + tid); 1458 1459 if ((status = (init <= trip)) != 0) { 1460 start = pr->u.p.lb; 1461 incr = pr->u.p.st; 1462 limit = parm1 + init - 1; 1463 1464 if ((last = (limit >= trip)) != 0) 1465 limit = trip; 1466 1467 if (p_st != NULL) 1468 *p_st = incr; 1469 1470 pr->u.p.count += nproc; 1471 1472 if (incr == 1) { 1473 *p_lb = start + init; 1474 *p_ub = start + limit; 1475 } else { 1476 *p_lb = start + init * incr; 1477 *p_ub = start + limit * incr; 1478 } 1479 1480 if (pr->flags.ordered) { 1481 pr->u.p.ordered_lower = init; 1482 pr->u.p.ordered_upper = limit; 1483 } // if 1484 } // if 1485 } // case 1486 break; 1487 1488 case kmp_sch_dynamic_chunked: { 1489 T chunk = pr->u.p.parm1; 1490 1491 KD_TRACE( 1492 100, 1493 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n", 1494 gtid)); 1495 1496 init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 1497 trip = pr->u.p.tc - 1; 1498 1499 if ((status = (init <= trip)) == 0) { 1500 *p_lb = 0; 1501 *p_ub = 0; 1502 if (p_st != NULL) 
1503 *p_st = 0; 1504 } else { 1505 start = pr->u.p.lb; 1506 limit = chunk + init - 1; 1507 incr = pr->u.p.st; 1508 1509 if ((last = (limit >= trip)) != 0) 1510 limit = trip; 1511 1512 if (p_st != NULL) 1513 *p_st = incr; 1514 1515 if (incr == 1) { 1516 *p_lb = start + init; 1517 *p_ub = start + limit; 1518 } else { 1519 *p_lb = start + init * incr; 1520 *p_ub = start + limit * incr; 1521 } 1522 1523 if (pr->flags.ordered) { 1524 pr->u.p.ordered_lower = init; 1525 pr->u.p.ordered_upper = limit; 1526 } // if 1527 } // if 1528 } // case 1529 break; 1530 1531 case kmp_sch_guided_iterative_chunked: { 1532 T chunkspec = pr->u.p.parm1; 1533 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked " 1534 "iterative case\n", 1535 gtid)); 1536 trip = pr->u.p.tc; 1537 // Start atomic part of calculations 1538 while (1) { 1539 ST remaining; // signed, because can be < 0 1540 init = sh->u.s.iteration; // shared value 1541 remaining = trip - init; 1542 if (remaining <= 0) { // AC: need to compare with 0 first 1543 // nothing to do, don't try atomic op 1544 status = 0; 1545 break; 1546 } 1547 if ((T)remaining < 1548 pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default 1549 // use dynamic-style schedule 1550 // atomically increment iterations, get old value 1551 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1552 (ST)chunkspec); 1553 remaining = trip - init; 1554 if (remaining <= 0) { 1555 status = 0; // all iterations got by other threads 1556 } else { 1557 // got some iterations to work on 1558 status = 1; 1559 if ((T)remaining > chunkspec) { 1560 limit = init + chunkspec - 1; 1561 } else { 1562 last = true; // the last chunk 1563 limit = init + remaining - 1; 1564 } // if 1565 } // if 1566 break; 1567 } // if 1568 limit = init + (UT)((double)remaining * 1569 *(double *)&pr->u.p.parm3); // divide by K*nproc 1570 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1571 (ST)init, (ST)limit)) { 1572 // CAS was successful, chunk obtained 1573 status = 1; 1574 --limit; 1575 break; 1576 } // if 1577 } // while 1578 if (status != 0) { 1579 start = pr->u.p.lb; 1580 incr = pr->u.p.st; 1581 if (p_st != NULL) 1582 *p_st = incr; 1583 *p_lb = start + init * incr; 1584 *p_ub = start + limit * incr; 1585 if (pr->flags.ordered) { 1586 pr->u.p.ordered_lower = init; 1587 pr->u.p.ordered_upper = limit; 1588 } // if 1589 } else { 1590 *p_lb = 0; 1591 *p_ub = 0; 1592 if (p_st != NULL) 1593 *p_st = 0; 1594 } // if 1595 } // case 1596 break; 1597 1598 case kmp_sch_guided_simd: { 1599 // same as iterative but curr-chunk adjusted to be multiple of given 1600 // chunk 1601 T chunk = pr->u.p.parm1; 1602 KD_TRACE(100, 1603 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n", 1604 gtid)); 1605 trip = pr->u.p.tc; 1606 // Start atomic part of calculations 1607 while (1) { 1608 ST remaining; // signed, because can be < 0 1609 init = sh->u.s.iteration; // shared value 1610 remaining = trip - init; 1611 if (remaining <= 0) { // AC: need to compare with 0 first 1612 status = 0; // nothing to do, don't try atomic op 1613 break; 1614 } 1615 KMP_DEBUG_ASSERT(init % chunk == 0); 1616 // compare with K*nproc*(chunk+1), K=2 by default 1617 if ((T)remaining < pr->u.p.parm2) { 1618 // use dynamic-style schedule 1619 // atomically increment iterations, get old value 1620 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1621 (ST)chunk); 1622 remaining = trip - init; 1623 if (remaining <= 0) { 1624 status = 0; // all iterations got by other threads 
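        // Illustrative note (hypothetical numbers): parm2 was set in
        // __kmp_dispatch_init_algorithm to guided_int_param * nproc *
        // (chunk + 1), i.e. K * nproc * (chunk + 1) with K = 2 by default,
        // e.g. 2 * 8 * 9 = 144 for nproc = 8 and chunk = 8.  Once fewer
        // iterations than that remain, threads simply grab fixed chunks of
        // 'chunk' iterations (the last one possibly smaller) with the atomic
        // add above instead of the guided rule.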
1625 } else { 1626 // got some iterations to work on 1627 status = 1; 1628 if ((T)remaining > chunk) { 1629 limit = init + chunk - 1; 1630 } else { 1631 last = true; // the last chunk 1632 limit = init + remaining - 1; 1633 } // if 1634 } // if 1635 break; 1636 } // if 1637 // divide by K*nproc 1638 UT span; 1639 __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3), 1640 &span); 1641 UT rem = span % chunk; 1642 if (rem) // adjust so that span%chunk == 0 1643 span += chunk - rem; 1644 limit = init + span; 1645 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1646 (ST)init, (ST)limit)) { 1647 // CAS was successful, chunk obtained 1648 status = 1; 1649 --limit; 1650 break; 1651 } // if 1652 } // while 1653 if (status != 0) { 1654 start = pr->u.p.lb; 1655 incr = pr->u.p.st; 1656 if (p_st != NULL) 1657 *p_st = incr; 1658 *p_lb = start + init * incr; 1659 *p_ub = start + limit * incr; 1660 if (pr->flags.ordered) { 1661 pr->u.p.ordered_lower = init; 1662 pr->u.p.ordered_upper = limit; 1663 } // if 1664 } else { 1665 *p_lb = 0; 1666 *p_ub = 0; 1667 if (p_st != NULL) 1668 *p_st = 0; 1669 } // if 1670 } // case 1671 break; 1672 1673 case kmp_sch_guided_analytical_chunked: { 1674 T chunkspec = pr->u.p.parm1; 1675 UT chunkIdx; 1676 #if KMP_USE_X87CONTROL 1677 /* for storing original FPCW value for Windows* OS on 1678 IA-32 architecture 8-byte version */ 1679 unsigned int oldFpcw; 1680 unsigned int fpcwSet = 0; 1681 #endif 1682 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d " 1683 "kmp_sch_guided_analytical_chunked case\n", 1684 gtid)); 1685 1686 trip = pr->u.p.tc; 1687 1688 KMP_DEBUG_ASSERT(nproc > 1); 1689 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip); 1690 1691 while (1) { /* this while loop is a safeguard against unexpected zero 1692 chunk sizes */ 1693 chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 1694 if (chunkIdx >= (UT)pr->u.p.parm2) { 1695 --trip; 1696 /* use dynamic-style scheduling */ 1697 init = chunkIdx * chunkspec + pr->u.p.count; 1698 /* need to verify init > 0 in case of overflow in the above 1699 * calculation */ 1700 if ((status = (init > 0 && init <= trip)) != 0) { 1701 limit = init + chunkspec - 1; 1702 1703 if ((last = (limit >= trip)) != 0) 1704 limit = trip; 1705 } 1706 break; 1707 } else { 1708 /* use exponential-style scheduling */ 1709 /* The following check is to workaround the lack of long double precision on 1710 Windows* OS. 1711 This check works around the possible effect that init != 0 for chunkIdx == 0. 
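             In this exponential-style branch, chunk number chunkIdx
             effectively covers the iterations between two consecutive values
             of the analytically computed remaining-work function:
             init = trip - remaining(chunkIdx) and
             limit = trip - remaining(chunkIdx + 1) - 1, where remaining() is
             evaluated by __kmp_dispatch_guided_remaining() from the factor
             stored in parm3 (ideally remaining(0) == trip, which is why init
             is forced to 0 for the first chunk below).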
1712 */ 1713 #if KMP_USE_X87CONTROL 1714 /* If we haven't already done so, save original 1715 FPCW and set precision to 64-bit, as Windows* OS 1716 on IA-32 architecture defaults to 53-bit */ 1717 if (!fpcwSet) { 1718 oldFpcw = _control87(0, 0); 1719 _control87(_PC_64, _MCW_PC); 1720 fpcwSet = 0x30000; 1721 } 1722 #endif 1723 if (chunkIdx) { 1724 init = __kmp_dispatch_guided_remaining<T>( 1725 trip, *(DBL *)&pr->u.p.parm3, chunkIdx); 1726 KMP_DEBUG_ASSERT(init); 1727 init = trip - init; 1728 } else 1729 init = 0; 1730 limit = trip - __kmp_dispatch_guided_remaining<T>( 1731 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1); 1732 KMP_ASSERT(init <= limit); 1733 if (init < limit) { 1734 KMP_DEBUG_ASSERT(limit <= trip); 1735 --limit; 1736 status = 1; 1737 break; 1738 } // if 1739 } // if 1740 } // while (1) 1741 #if KMP_USE_X87CONTROL 1742 /* restore FPCW if necessary 1743 AC: check fpcwSet flag first because oldFpcw can be uninitialized here 1744 */ 1745 if (fpcwSet && (oldFpcw & fpcwSet)) 1746 _control87(oldFpcw, _MCW_PC); 1747 #endif 1748 if (status != 0) { 1749 start = pr->u.p.lb; 1750 incr = pr->u.p.st; 1751 if (p_st != NULL) 1752 *p_st = incr; 1753 *p_lb = start + init * incr; 1754 *p_ub = start + limit * incr; 1755 if (pr->flags.ordered) { 1756 pr->u.p.ordered_lower = init; 1757 pr->u.p.ordered_upper = limit; 1758 } 1759 } else { 1760 *p_lb = 0; 1761 *p_ub = 0; 1762 if (p_st != NULL) 1763 *p_st = 0; 1764 } 1765 } // case 1766 break; 1767 1768 case kmp_sch_trapezoidal: { 1769 UT index; 1770 T parm2 = pr->u.p.parm2; 1771 T parm3 = pr->u.p.parm3; 1772 T parm4 = pr->u.p.parm4; 1773 KD_TRACE(100, 1774 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n", 1775 gtid)); 1776 1777 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration); 1778 1779 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2; 1780 trip = pr->u.p.tc - 1; 1781 1782 if ((status = ((T)index < parm3 && init <= trip)) == 0) { 1783 *p_lb = 0; 1784 *p_ub = 0; 1785 if (p_st != NULL) 1786 *p_st = 0; 1787 } else { 1788 start = pr->u.p.lb; 1789 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1; 1790 incr = pr->u.p.st; 1791 1792 if ((last = (limit >= trip)) != 0) 1793 limit = trip; 1794 1795 if (p_st != NULL) 1796 *p_st = incr; 1797 1798 if (incr == 1) { 1799 *p_lb = start + init; 1800 *p_ub = start + limit; 1801 } else { 1802 *p_lb = start + init * incr; 1803 *p_ub = start + limit * incr; 1804 } 1805 1806 if (pr->flags.ordered) { 1807 pr->u.p.ordered_lower = init; 1808 pr->u.p.ordered_upper = limit; 1809 } // if 1810 } // if 1811 } // case 1812 break; 1813 default: { 1814 status = 0; // to avoid complaints on uninitialized variable use 1815 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 1816 KMP_HNT(GetNewerLibrary), // Hint 1817 __kmp_msg_null // Variadic argument list terminator 1818 ); 1819 } break; 1820 } // switch 1821 if (p_last) 1822 *p_last = last; 1823 #ifdef KMP_DEBUG 1824 if (pr->flags.ordered) { 1825 char *buff; 1826 // create format specifiers before the debug output 1827 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d " 1828 "ordered_lower:%%%s ordered_upper:%%%s\n", 1829 traits_t<UT>::spec, traits_t<UT>::spec); 1830 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper)); 1831 __kmp_str_free(&buff); 1832 } 1833 { 1834 char *buff; 1835 // create format specifiers before the debug output 1836 buff = __kmp_str_format( 1837 "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d " 1838 "p_lb:%%%s p_ub:%%%s p_st:%%%s\n", 1839 
traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 1840 KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st)); 1841 __kmp_str_free(&buff); 1842 } 1843 #endif 1844 return status; 1845 } 1846 1847 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more 1848 work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini() 1849 is not called. */ 1850 #if OMPT_SUPPORT && OMPT_OPTIONAL 1851 #define OMPT_LOOP_END \ 1852 if (status == 0) { \ 1853 if (ompt_enabled.ompt_callback_work) { \ 1854 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \ 1855 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \ 1856 ompt_callbacks.ompt_callback(ompt_callback_work)( \ 1857 ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \ 1858 &(task_info->task_data), 0, codeptr); \ 1859 } \ 1860 } 1861 // TODO: implement count 1862 #else 1863 #define OMPT_LOOP_END // no-op 1864 #endif 1865 1866 #if KMP_STATS_ENABLED 1867 #define KMP_STATS_LOOP_END \ 1868 { \ 1869 kmp_int64 u, l, t, i; \ 1870 l = (kmp_int64)(*p_lb); \ 1871 u = (kmp_int64)(*p_ub); \ 1872 i = (kmp_int64)(pr->u.p.st); \ 1873 if (status == 0) { \ 1874 t = 0; \ 1875 KMP_POP_PARTITIONED_TIMER(); \ 1876 } else if (i == 1) { \ 1877 if (u >= l) \ 1878 t = u - l + 1; \ 1879 else \ 1880 t = 0; \ 1881 } else if (i < 0) { \ 1882 if (l >= u) \ 1883 t = (l - u) / (-i) + 1; \ 1884 else \ 1885 t = 0; \ 1886 } else { \ 1887 if (u >= l) \ 1888 t = (u - l) / i + 1; \ 1889 else \ 1890 t = 0; \ 1891 } \ 1892 KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \ 1893 } 1894 #else 1895 #define KMP_STATS_LOOP_END /* Nothing */ 1896 #endif 1897 1898 template <typename T> 1899 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, 1900 T *p_lb, T *p_ub, 1901 typename traits_t<T>::signed_t *p_st 1902 #if OMPT_SUPPORT && OMPT_OPTIONAL 1903 , 1904 void *codeptr 1905 #endif 1906 ) { 1907 1908 typedef typename traits_t<T>::unsigned_t UT; 1909 typedef typename traits_t<T>::signed_t ST; 1910 // This is potentially slightly misleading, schedule(runtime) will appear here 1911 // even if the actual runtime schedule is static. (Which points out a 1912 // disadvantage of schedule(runtime): even when static scheduling is used it 1913 // costs more than a compile time choice to use static scheduling would.) 
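  // Illustrative sketch (added for clarity, not part of the dispatch logic):
  // a loop written as
  //   #pragma omp for schedule(runtime)
  //   for (int i = 0; i < n; ++i) body(i);
  // is typically lowered to __kmpc_dispatch_init_4()/__kmpc_dispatch_next_4()
  // calls and therefore always lands in this function, even when OMP_SCHEDULE
  // selects a static schedule at run time, whereas a schedule(static) loop is
  // lowered to the __kmpc_for_static_init_* path and never reaches this code.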
1914 KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling); 1915 1916 int status; 1917 dispatch_private_info_template<T> *pr; 1918 __kmp_assert_valid_gtid(gtid); 1919 kmp_info_t *th = __kmp_threads[gtid]; 1920 kmp_team_t *team = th->th.th_team; 1921 1922 KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL 1923 KD_TRACE( 1924 1000, 1925 ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n", 1926 gtid, p_lb, p_ub, p_st, p_last)); 1927 1928 if (team->t.t_serialized) { 1929 /* NOTE: serialize this dispatch because we are not at the active level */ 1930 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 1931 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 1932 KMP_DEBUG_ASSERT(pr); 1933 1934 if ((status = (pr->u.p.tc != 0)) == 0) { 1935 *p_lb = 0; 1936 *p_ub = 0; 1937 // if ( p_last != NULL ) 1938 // *p_last = 0; 1939 if (p_st != NULL) 1940 *p_st = 0; 1941 if (__kmp_env_consistency_check) { 1942 if (pr->pushed_ws != ct_none) { 1943 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 1944 } 1945 } 1946 } else if (pr->flags.nomerge) { 1947 kmp_int32 last; 1948 T start; 1949 UT limit, trip, init; 1950 ST incr; 1951 T chunk = pr->u.p.parm1; 1952 1953 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", 1954 gtid)); 1955 1956 init = chunk * pr->u.p.count++; 1957 trip = pr->u.p.tc - 1; 1958 1959 if ((status = (init <= trip)) == 0) { 1960 *p_lb = 0; 1961 *p_ub = 0; 1962 // if ( p_last != NULL ) 1963 // *p_last = 0; 1964 if (p_st != NULL) 1965 *p_st = 0; 1966 if (__kmp_env_consistency_check) { 1967 if (pr->pushed_ws != ct_none) { 1968 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 1969 } 1970 } 1971 } else { 1972 start = pr->u.p.lb; 1973 limit = chunk + init - 1; 1974 incr = pr->u.p.st; 1975 1976 if ((last = (limit >= trip)) != 0) { 1977 limit = trip; 1978 #if KMP_OS_WINDOWS 1979 pr->u.p.last_upper = pr->u.p.ub; 1980 #endif /* KMP_OS_WINDOWS */ 1981 } 1982 if (p_last != NULL) 1983 *p_last = last; 1984 if (p_st != NULL) 1985 *p_st = incr; 1986 if (incr == 1) { 1987 *p_lb = start + init; 1988 *p_ub = start + limit; 1989 } else { 1990 *p_lb = start + init * incr; 1991 *p_ub = start + limit * incr; 1992 } 1993 1994 if (pr->flags.ordered) { 1995 pr->u.p.ordered_lower = init; 1996 pr->u.p.ordered_upper = limit; 1997 #ifdef KMP_DEBUG 1998 { 1999 char *buff; 2000 // create format specifiers before the debug output 2001 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 2002 "ordered_lower:%%%s ordered_upper:%%%s\n", 2003 traits_t<UT>::spec, traits_t<UT>::spec); 2004 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 2005 pr->u.p.ordered_upper)); 2006 __kmp_str_free(&buff); 2007 } 2008 #endif 2009 } // if 2010 } // if 2011 } else { 2012 pr->u.p.tc = 0; 2013 *p_lb = pr->u.p.lb; 2014 *p_ub = pr->u.p.ub; 2015 #if KMP_OS_WINDOWS 2016 pr->u.p.last_upper = *p_ub; 2017 #endif /* KMP_OS_WINDOWS */ 2018 if (p_last != NULL) 2019 *p_last = TRUE; 2020 if (p_st != NULL) 2021 *p_st = pr->u.p.st; 2022 } // if 2023 #ifdef KMP_DEBUG 2024 { 2025 char *buff; 2026 // create format specifiers before the debug output 2027 buff = __kmp_str_format( 2028 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " 2029 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", 2030 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 2031 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, 2032 (p_last ? 
*p_last : 0), status)); 2033 __kmp_str_free(&buff); 2034 } 2035 #endif 2036 #if INCLUDE_SSC_MARKS 2037 SSC_MARK_DISPATCH_NEXT(); 2038 #endif 2039 OMPT_LOOP_END; 2040 KMP_STATS_LOOP_END; 2041 return status; 2042 } else { 2043 kmp_int32 last = 0; 2044 dispatch_shared_info_template<T> volatile *sh; 2045 2046 KMP_DEBUG_ASSERT(th->th.th_dispatch == 2047 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 2048 2049 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 2050 th->th.th_dispatch->th_dispatch_pr_current); 2051 KMP_DEBUG_ASSERT(pr); 2052 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( 2053 th->th.th_dispatch->th_dispatch_sh_current); 2054 KMP_DEBUG_ASSERT(sh); 2055 2056 #if KMP_USE_HIER_SCHED 2057 if (pr->flags.use_hier) 2058 status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st); 2059 else 2060 #endif // KMP_USE_HIER_SCHED 2061 status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub, 2062 p_st, th->th.th_team_nproc, 2063 th->th.th_info.ds.ds_tid); 2064 // status == 0: no more iterations to execute 2065 if (status == 0) { 2066 UT num_done; 2067 2068 num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done); 2069 #ifdef KMP_DEBUG 2070 { 2071 char *buff; 2072 // create format specifiers before the debug output 2073 buff = __kmp_str_format( 2074 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", 2075 traits_t<UT>::spec); 2076 KD_TRACE(10, (buff, gtid, sh->u.s.num_done)); 2077 __kmp_str_free(&buff); 2078 } 2079 #endif 2080 2081 #if KMP_USE_HIER_SCHED 2082 pr->flags.use_hier = FALSE; 2083 #endif 2084 if ((ST)num_done == th->th.th_team_nproc - 1) { 2085 #if (KMP_STATIC_STEAL_ENABLED) 2086 if (pr->schedule == kmp_sch_static_steal && 2087 traits_t<T>::type_size > 4) { 2088 int i; 2089 int idx = (th->th.th_dispatch->th_disp_index - 1) % 2090 __kmp_dispatch_num_buffers; // current loop index 2091 kmp_info_t **other_threads = team->t.t_threads; 2092 // loop complete, safe to destroy locks used for stealing 2093 for (i = 0; i < th->th.th_team_nproc; ++i) { 2094 dispatch_private_info_template<T> *buf = 2095 reinterpret_cast<dispatch_private_info_template<T> *>( 2096 &other_threads[i]->th.th_dispatch->th_disp_buffer[idx]); 2097 kmp_lock_t *lck = buf->u.p.th_steal_lock; 2098 KMP_ASSERT(lck != NULL); 2099 __kmp_destroy_lock(lck); 2100 __kmp_free(lck); 2101 buf->u.p.th_steal_lock = NULL; 2102 } 2103 } 2104 #endif 2105 /* NOTE: release this buffer to be reused */ 2106 2107 KMP_MB(); /* Flush all pending memory write invalidates. */ 2108 2109 sh->u.s.num_done = 0; 2110 sh->u.s.iteration = 0; 2111 2112 /* TODO replace with general release procedure? */ 2113 if (pr->flags.ordered) { 2114 sh->u.s.ordered_iteration = 0; 2115 } 2116 2117 KMP_MB(); /* Flush all pending memory write invalidates. */ 2118 2119 sh->buffer_index += __kmp_dispatch_num_buffers; 2120 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", 2121 gtid, sh->buffer_index)); 2122 2123 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 2124 2125 } // if 2126 if (__kmp_env_consistency_check) { 2127 if (pr->pushed_ws != ct_none) { 2128 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 2129 } 2130 } 2131 2132 th->th.th_dispatch->th_deo_fcn = NULL; 2133 th->th.th_dispatch->th_dxo_fcn = NULL; 2134 th->th.th_dispatch->th_dispatch_sh_current = NULL; 2135 th->th.th_dispatch->th_dispatch_pr_current = NULL; 2136 } // if (status == 0) 2137 #if KMP_OS_WINDOWS 2138 else if (last) { 2139 pr->u.p.last_upper = pr->u.p.ub; 2140 } 2141 #endif /* KMP_OS_WINDOWS */ 2142 if (p_last != NULL && status != 0) 2143 *p_last = last; 2144 } // if 2145 2146 #ifdef KMP_DEBUG 2147 { 2148 char *buff; 2149 // create format specifiers before the debug output 2150 buff = __kmp_str_format( 2151 "__kmp_dispatch_next: T#%%d normal case: " 2152 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n", 2153 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 2154 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, 2155 (p_last ? *p_last : 0), status)); 2156 __kmp_str_free(&buff); 2157 } 2158 #endif 2159 #if INCLUDE_SSC_MARKS 2160 SSC_MARK_DISPATCH_NEXT(); 2161 #endif 2162 OMPT_LOOP_END; 2163 KMP_STATS_LOOP_END; 2164 return status; 2165 } 2166 2167 template <typename T> 2168 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid, 2169 kmp_int32 *plastiter, T *plower, T *pupper, 2170 typename traits_t<T>::signed_t incr) { 2171 typedef typename traits_t<T>::unsigned_t UT; 2172 kmp_uint32 team_id; 2173 kmp_uint32 nteams; 2174 UT trip_count; 2175 kmp_team_t *team; 2176 kmp_info_t *th; 2177 2178 KMP_DEBUG_ASSERT(plastiter && plower && pupper); 2179 KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid)); 2180 #ifdef KMP_DEBUG 2181 typedef typename traits_t<T>::signed_t ST; 2182 { 2183 char *buff; 2184 // create format specifiers before the debug output 2185 buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d " 2186 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n", 2187 traits_t<T>::spec, traits_t<T>::spec, 2188 traits_t<ST>::spec, traits_t<T>::spec); 2189 KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr)); 2190 __kmp_str_free(&buff); 2191 } 2192 #endif 2193 2194 if (__kmp_env_consistency_check) { 2195 if (incr == 0) { 2196 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, 2197 loc); 2198 } 2199 if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) { 2200 // The loop is illegal. 
2201 // Some zero-trip loops maintained by compiler, e.g.: 2202 // for(i=10;i<0;++i) // lower >= upper - run-time check 2203 // for(i=0;i>10;--i) // lower <= upper - run-time check 2204 // for(i=0;i>10;++i) // incr > 0 - compile-time check 2205 // for(i=10;i<0;--i) // incr < 0 - compile-time check 2206 // Compiler does not check the following illegal loops: 2207 // for(i=0;i<10;i+=incr) // where incr<0 2208 // for(i=10;i>0;i-=incr) // where incr<0 2209 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc); 2210 } 2211 } 2212 __kmp_assert_valid_gtid(gtid); 2213 th = __kmp_threads[gtid]; 2214 team = th->th.th_team; 2215 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct 2216 nteams = th->th.th_teams_size.nteams; 2217 team_id = team->t.t_master_tid; 2218 KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc); 2219 2220 // compute global trip count 2221 if (incr == 1) { 2222 trip_count = *pupper - *plower + 1; 2223 } else if (incr == -1) { 2224 trip_count = *plower - *pupper + 1; 2225 } else if (incr > 0) { 2226 // upper-lower can exceed the limit of signed type 2227 trip_count = (UT)(*pupper - *plower) / incr + 1; 2228 } else { 2229 trip_count = (UT)(*plower - *pupper) / (-incr) + 1; 2230 } 2231 2232 if (trip_count <= nteams) { 2233 KMP_DEBUG_ASSERT( 2234 __kmp_static == kmp_sch_static_greedy || 2235 __kmp_static == 2236 kmp_sch_static_balanced); // Unknown static scheduling type. 2237 // only some teams get single iteration, others get nothing 2238 if (team_id < trip_count) { 2239 *pupper = *plower = *plower + team_id * incr; 2240 } else { 2241 *plower = *pupper + incr; // zero-trip loop 2242 } 2243 if (plastiter != NULL) 2244 *plastiter = (team_id == trip_count - 1); 2245 } else { 2246 if (__kmp_static == kmp_sch_static_balanced) { 2247 UT chunk = trip_count / nteams; 2248 UT extras = trip_count % nteams; 2249 *plower += 2250 incr * (team_id * chunk + (team_id < extras ? team_id : extras)); 2251 *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr); 2252 if (plastiter != NULL) 2253 *plastiter = (team_id == nteams - 1); 2254 } else { 2255 T chunk_inc_count = 2256 (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr; 2257 T upper = *pupper; 2258 KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy); 2259 // Unknown static scheduling type. 2260 *plower += team_id * chunk_inc_count; 2261 *pupper = *plower + chunk_inc_count - incr; 2262 // Check/correct bounds if needed 2263 if (incr > 0) { 2264 if (*pupper < *plower) 2265 *pupper = traits_t<T>::max_value; 2266 if (plastiter != NULL) 2267 *plastiter = *plower <= upper && *pupper > upper - incr; 2268 if (*pupper > upper) 2269 *pupper = upper; // tracker C73258 2270 } else { 2271 if (*pupper > *plower) 2272 *pupper = traits_t<T>::min_value; 2273 if (plastiter != NULL) 2274 *plastiter = *plower >= upper && *pupper < upper - incr; 2275 if (*pupper < upper) 2276 *pupper = upper; // tracker C73258 2277 } 2278 } 2279 } 2280 } 2281 2282 //----------------------------------------------------------------------------- 2283 // Dispatch routines 2284 // Transfer call to template< type T > 2285 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule, 2286 // T lb, T ub, ST st, ST chunk ) 2287 extern "C" { 2288 2289 /*! 
2290 @ingroup WORK_SHARING 2291 @{ 2292 @param loc Source location 2293 @param gtid Global thread id 2294 @param schedule Schedule type 2295 @param lb Lower bound 2296 @param ub Upper bound 2297 @param st Step (or increment if you prefer) 2298 @param chunk The chunk size to block with 2299 2300 This function prepares the runtime to start a dynamically scheduled for loop, 2301 saving the loop arguments. 2302 These functions are all identical apart from the types of the arguments. 2303 */ 2304 2305 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2306 enum sched_type schedule, kmp_int32 lb, 2307 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) { 2308 KMP_DEBUG_ASSERT(__kmp_init_serial); 2309 #if OMPT_SUPPORT && OMPT_OPTIONAL 2310 OMPT_STORE_RETURN_ADDRESS(gtid); 2311 #endif 2312 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2313 } 2314 /*! 2315 See @ref __kmpc_dispatch_init_4 2316 */ 2317 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2318 enum sched_type schedule, kmp_uint32 lb, 2319 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) { 2320 KMP_DEBUG_ASSERT(__kmp_init_serial); 2321 #if OMPT_SUPPORT && OMPT_OPTIONAL 2322 OMPT_STORE_RETURN_ADDRESS(gtid); 2323 #endif 2324 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2325 } 2326 2327 /*! 2328 See @ref __kmpc_dispatch_init_4 2329 */ 2330 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2331 enum sched_type schedule, kmp_int64 lb, 2332 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) { 2333 KMP_DEBUG_ASSERT(__kmp_init_serial); 2334 #if OMPT_SUPPORT && OMPT_OPTIONAL 2335 OMPT_STORE_RETURN_ADDRESS(gtid); 2336 #endif 2337 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2338 } 2339 2340 /*! 2341 See @ref __kmpc_dispatch_init_4 2342 */ 2343 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2344 enum sched_type schedule, kmp_uint64 lb, 2345 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) { 2346 KMP_DEBUG_ASSERT(__kmp_init_serial); 2347 #if OMPT_SUPPORT && OMPT_OPTIONAL 2348 OMPT_STORE_RETURN_ADDRESS(gtid); 2349 #endif 2350 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2351 } 2352 2353 /*! 2354 See @ref __kmpc_dispatch_init_4 2355 2356 These functions differ from the __kmpc_dispatch_init set in that they are 2357 called for the composite distribute parallel for construct, so the per-team 2358 iteration space must be computed before regular chunk dispatching begins. 2359 2360 These functions are all identical apart from the types of the arguments.
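
For illustration only (a sketch of typical compiler-generated code, not a verbatim lowering), a construct such as

@code
    #pragma omp distribute parallel for schedule(dynamic, 4)
    for (int i = 0; i < n; ++i) body(i);
@endcode

may be lowered so that each team first narrows the global bounds to its per-team slice (__kmp_dist_get_bounds, called from __kmpc_dist_dispatch_init_4) and then hands out chunks of that slice through the usual __kmpc_dispatch_next_4 loop. As a concrete example, 10 iterations split over 4 teams with the balanced static division gives the teams 3, 3, 2 and 2 iterations respectively before chunked dispatching starts inside each team.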
2361 */ 2362 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2363 enum sched_type schedule, kmp_int32 *p_last, 2364 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, 2365 kmp_int32 chunk) { 2366 KMP_DEBUG_ASSERT(__kmp_init_serial); 2367 #if OMPT_SUPPORT && OMPT_OPTIONAL 2368 OMPT_STORE_RETURN_ADDRESS(gtid); 2369 #endif 2370 __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st); 2371 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2372 } 2373 2374 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2375 enum sched_type schedule, kmp_int32 *p_last, 2376 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, 2377 kmp_int32 chunk) { 2378 KMP_DEBUG_ASSERT(__kmp_init_serial); 2379 #if OMPT_SUPPORT && OMPT_OPTIONAL 2380 OMPT_STORE_RETURN_ADDRESS(gtid); 2381 #endif 2382 __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st); 2383 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2384 } 2385 2386 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2387 enum sched_type schedule, kmp_int32 *p_last, 2388 kmp_int64 lb, kmp_int64 ub, kmp_int64 st, 2389 kmp_int64 chunk) { 2390 KMP_DEBUG_ASSERT(__kmp_init_serial); 2391 #if OMPT_SUPPORT && OMPT_OPTIONAL 2392 OMPT_STORE_RETURN_ADDRESS(gtid); 2393 #endif 2394 __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st); 2395 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2396 } 2397 2398 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2399 enum sched_type schedule, kmp_int32 *p_last, 2400 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, 2401 kmp_int64 chunk) { 2402 KMP_DEBUG_ASSERT(__kmp_init_serial); 2403 #if OMPT_SUPPORT && OMPT_OPTIONAL 2404 OMPT_STORE_RETURN_ADDRESS(gtid); 2405 #endif 2406 __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st); 2407 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2408 } 2409 2410 /*! 2411 @param loc Source code location 2412 @param gtid Global thread id 2413 @param p_last Pointer to a flag set to one if this is the last chunk or zero 2414 otherwise 2415 @param p_lb Pointer to the lower bound for the next chunk of work 2416 @param p_ub Pointer to the upper bound for the next chunk of work 2417 @param p_st Pointer to the stride for the next chunk of work 2418 @return one if there is work to be done, zero otherwise 2419 2420 Get the next dynamically allocated chunk of work for this thread. 2421 If there is no more work, then the lb,ub and stride need not be modified. 2422 */ 2423 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2424 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) { 2425 #if OMPT_SUPPORT && OMPT_OPTIONAL 2426 OMPT_STORE_RETURN_ADDRESS(gtid); 2427 #endif 2428 return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st 2429 #if OMPT_SUPPORT && OMPT_OPTIONAL 2430 , 2431 OMPT_LOAD_RETURN_ADDRESS(gtid) 2432 #endif 2433 ); 2434 } 2435 2436 /*! 2437 See @ref __kmpc_dispatch_next_4 2438 */ 2439 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2440 kmp_uint32 *p_lb, kmp_uint32 *p_ub, 2441 kmp_int32 *p_st) { 2442 #if OMPT_SUPPORT && OMPT_OPTIONAL 2443 OMPT_STORE_RETURN_ADDRESS(gtid); 2444 #endif 2445 return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st 2446 #if OMPT_SUPPORT && OMPT_OPTIONAL 2447 , 2448 OMPT_LOAD_RETURN_ADDRESS(gtid) 2449 #endif 2450 ); 2451 } 2452 2453 /*! 
2454 See @ref __kmpc_dispatch_next_4 2455 */ 2456 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2457 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) { 2458 #if OMPT_SUPPORT && OMPT_OPTIONAL 2459 OMPT_STORE_RETURN_ADDRESS(gtid); 2460 #endif 2461 return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st 2462 #if OMPT_SUPPORT && OMPT_OPTIONAL 2463 , 2464 OMPT_LOAD_RETURN_ADDRESS(gtid) 2465 #endif 2466 ); 2467 } 2468 2469 /*! 2470 See @ref __kmpc_dispatch_next_4 2471 */ 2472 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2473 kmp_uint64 *p_lb, kmp_uint64 *p_ub, 2474 kmp_int64 *p_st) { 2475 #if OMPT_SUPPORT && OMPT_OPTIONAL 2476 OMPT_STORE_RETURN_ADDRESS(gtid); 2477 #endif 2478 return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st 2479 #if OMPT_SUPPORT && OMPT_OPTIONAL 2480 , 2481 OMPT_LOAD_RETURN_ADDRESS(gtid) 2482 #endif 2483 ); 2484 } 2485 2486 /*! 2487 @param loc Source code location 2488 @param gtid Global thread id 2489 2490 Mark the end of a dynamic loop. 2491 */ 2492 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) { 2493 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2494 } 2495 2496 /*! 2497 See @ref __kmpc_dispatch_fini_4 2498 */ 2499 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) { 2500 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2501 } 2502 2503 /*! 2504 See @ref __kmpc_dispatch_fini_4 2505 */ 2506 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) { 2507 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2508 } 2509 2510 /*! 2511 See @ref __kmpc_dispatch_fini_4 2512 */ 2513 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) { 2514 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2515 } 2516 /*! @} */ 2517 2518 //----------------------------------------------------------------------------- 2519 // Non-template routines from kmp_dispatch.cpp used in other sources 2520 2521 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) { 2522 return value == checker; 2523 } 2524 2525 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) { 2526 return value != checker; 2527 } 2528 2529 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) { 2530 return value < checker; 2531 } 2532 2533 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) { 2534 return value >= checker; 2535 } 2536 2537 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) { 2538 return value <= checker; 2539 } 2540 2541 kmp_uint32 2542 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker, 2543 kmp_uint32 (*pred)(kmp_uint32, kmp_uint32), 2544 void *obj // Higher-level synchronization object, or NULL. 2545 ) { 2546 // note: we may not belong to a team at this point 2547 volatile kmp_uint32 *spin = spinner; 2548 kmp_uint32 check = checker; 2549 kmp_uint32 spins; 2550 kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred; 2551 kmp_uint32 r; 2552 2553 KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin)); 2554 KMP_INIT_YIELD(spins); 2555 // main wait spin loop 2556 while (!f(r = TCR_4(*spin), check)) { 2557 KMP_FSYNC_SPIN_PREPARE(obj); 2558 /* GEH - remove this since it was accidentally introduced when kmp_wait was 2559 split. 
It causes problems with infinite recursion because of exit lock */ 2560 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 2561 __kmp_abort_thread(); */ 2562 KMP_YIELD_OVERSUB_ELSE_SPIN(spins); 2563 } 2564 KMP_FSYNC_SPIN_ACQUIRED(obj); 2565 return r; 2566 } 2567 2568 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker, 2569 kmp_uint32 (*pred)(void *, kmp_uint32), 2570 void *obj // Higher-level synchronization object, or NULL. 2571 ) { 2572 // note: we may not belong to a team at this point 2573 void *spin = spinner; 2574 kmp_uint32 check = checker; 2575 kmp_uint32 spins; 2576 kmp_uint32 (*f)(void *, kmp_uint32) = pred; 2577 2578 KMP_FSYNC_SPIN_INIT(obj, spin); 2579 KMP_INIT_YIELD(spins); 2580 // main wait spin loop 2581 while (!f(spin, check)) { 2582 KMP_FSYNC_SPIN_PREPARE(obj); 2583 /* if we have waited a bit, or are oversubscribed, yield */ 2584 /* pause is in the following code */ 2585 KMP_YIELD_OVERSUB_ELSE_SPIN(spins); 2586 } 2587 KMP_FSYNC_SPIN_ACQUIRED(obj); 2588 } 2589 2590 } // extern "C" 2591 2592 #ifdef KMP_GOMP_COMPAT 2593 2594 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2595 enum sched_type schedule, kmp_int32 lb, 2596 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk, 2597 int push_ws) { 2598 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, 2599 push_ws); 2600 } 2601 2602 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2603 enum sched_type schedule, kmp_uint32 lb, 2604 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk, 2605 int push_ws) { 2606 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, 2607 push_ws); 2608 } 2609 2610 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2611 enum sched_type schedule, kmp_int64 lb, 2612 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk, 2613 int push_ws) { 2614 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, 2615 push_ws); 2616 } 2617 2618 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2619 enum sched_type schedule, kmp_uint64 lb, 2620 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk, 2621 int push_ws) { 2622 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, 2623 push_ws); 2624 } 2625 2626 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) { 2627 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc); 2628 } 2629 2630 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) { 2631 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc); 2632 } 2633 2634 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) { 2635 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc); 2636 } 2637 2638 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) { 2639 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc); 2640 } 2641 2642 #endif /* KMP_GOMP_COMPAT */ 2643 2644 /* ------------------------------------------------------------------------ */ 2645
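
/* Usage sketch for the wait helpers above (illustrative only; the flag, the
   expected value and the caller are hypothetical, not taken from this file):

     volatile kmp_uint32 flag = 0; // set to 1 by some other thread
     ...
     // spin, with pause/yield backoff, until the predicate holds
     kmp_uint32 seen = __kmp_wait_4(&flag, 1, __kmp_eq_4, NULL);
     KMP_DEBUG_ASSERT(seen == 1);

   The (pred, checker) pair makes the same spin loop reusable for ==, !=, <,
   >= and <= tests via __kmp_eq_4, __kmp_neq_4, __kmp_lt_4, __kmp_ge_4 and
   __kmp_le_4, while __kmp_wait_4_ptr is the variant whose predicate receives
   the raw spinner pointer instead of the loaded value. */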