/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 */


//===----------------------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


/* Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
 * it may change values between parallel regions. __kmp_max_nth
 * is the largest value __kmp_nth may take; 1 is the smallest.
 */

// Need to raise Win version from XP to Vista here for support of
// InterlockedExchange64
#if defined(_WIN32_WINNT) && defined(_M_IX86)
#undef _WIN32_WINNT
#define _WIN32_WINNT 0x0502
#endif

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
#include <float.h>
#endif

#if OMPT_SUPPORT
#include "ompt-internal.h"
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */

#if KMP_STATIC_STEAL_ENABLED

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  UT count; // unsigned
  T ub;
  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
  T lb;
  ST st; // signed
  UT tc; // unsigned
  T static_steal_counter; // for static_steal only; maybe better to put after ub

  /* parm[1-4] are used in different ways by different scheduling algorithms */

  // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
  // a) parm3 is properly aligned and
  // b) all parm1-4 are in the same cache line.
  // Because parm1-4 are used together, performance seems to be better
  // if they are in the same cache line (not measured though).
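  //
  // Illustrative sketch (not part of the library, shown commented out): how a
  // 32-byte-aligned block of four 4-byte members behaves for the common
  // 64-byte cache-line size. The name demo_parms is only for this standalone
  // example; the runtime itself uses the KMP_ALIGN(32) struct that follows.
  //
  //   #include <cstdint>
  //   struct alignas(32) demo_parms { // 32 == sizeof(int32_t) * 4
  //     int32_t parm1, parm2, parm3, parm4;
  //   };
  //   static_assert(alignof(demo_parms) == 32, "32-byte alignment");
  //   static_assert(sizeof(demo_parms) == 32, "16 data bytes + tail padding");
  //   // A 32-byte object placed at a 32-byte boundary starts at offset 0 or
  //   // 32 within a 64-byte line, so parm1-4 can never straddle two lines.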
  struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
    T parm1;
    T parm2;
    T parm3;
    T parm4;
  };

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};

#else /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  T lb;
  T ub;
  ST st; // signed
  UT tc; // unsigned

  T parm1;
  T parm2;
  T parm3;
  T parm4;

  UT count; // unsigned

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};

#endif /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info structure and dispatch_private_info_t type
template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
  // duplicate alignment here, otherwise size of structure is not correct in our
  // compiler
  union KMP_ALIGN_CACHE private_info_tmpl {
    dispatch_private_infoXX_template<T> p;
    dispatch_private_info64_t p64;
  } u;
  enum sched_type schedule; /* scheduling algorithm */
  kmp_uint32 ordered; /* ordered clause specified */
  kmp_uint32 ordered_bumped;
  // To retain the structure size after making ordered_iteration scalar
  kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
  dispatch_private_info *next; /* stack of buffers for nest of serial regions */
  kmp_uint32 nomerge; /* don't merge iters if serialized */
  kmp_uint32 type_size;
  enum cons_type pushed_ws;
};

// replaces dispatch_shared_info{32,64} structures and
// dispatch_shared_info{32,64}_t types
template <typename UT> struct dispatch_shared_infoXX_template {
  /* chunk index under dynamic, number of idle threads under static-steal;
     iteration index otherwise */
  volatile UT iteration;
  volatile UT num_done;
  volatile UT ordered_iteration;
  // to retain the structure size making ordered_iteration scalar
  UT ordered_dummy[KMP_MAX_ORDERED - 3];
};

// replaces dispatch_shared_info structure and dispatch_shared_info_t type
template <typename UT> struct dispatch_shared_info_template {
  // we need union here to keep the structure size
  union shared_info_tmpl {
    dispatch_shared_infoXX_template<UT> s;
    dispatch_shared_info64_t s64;
  } u;
  volatile kmp_uint32 buffer_index;
#if OMP_45_ENABLED
  volatile kmp_int32 doacross_buf_idx; // teamwise index
  kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
  kmp_int32 doacross_num_done; // count finished threads
#endif
#if KMP_USE_HWLOC
  // When linking with libhwloc, the ORDERED EPCC test slows down on big
  // machines (> 48 cores). Performance analysis showed that a cache thrash
  // was occurring and this padding helps alleviate the problem.
  char padding[64];
#endif
};

/* ------------------------------------------------------------------------ */

#undef USE_TEST_LOCKS

// test_then_add template (general template should NOT be used)
template <typename T> static __forceinline T test_then_add(volatile T *p, T d);

template <>
__forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
                                                 kmp_int32 d) {
  kmp_int32 r;
  r = KMP_TEST_THEN_ADD32(p, d);
  return r;
}

template <>
__forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
                                                 kmp_int64 d) {
  kmp_int64 r;
  r = KMP_TEST_THEN_ADD64(p, d);
  return r;
}

// test_then_inc_acq template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC_ACQ32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC_ACQ64(p);
  return r;
}

// test_then_inc template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC64(p);
  return r;
}

// compare_and_swap template (general template should NOT be used)
template <typename T>
static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
                                                    kmp_int32 c, kmp_int32 s) {
  return KMP_COMPARE_AND_STORE_REL32(p, c, s);
}

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
                                                    kmp_int64 c, kmp_int64 s) {
  return KMP_COMPARE_AND_STORE_REL64(p, c, s);
}

/* Spin wait loop that first does pause, then yield.
   Waits until function returns non-zero when called with *spinner and check.
   Does NOT put threads to sleep.
#if USE_ITT_BUILD
   Arguments:
   obj -- is a higher-level synchronization object to report to ittnotify.
          It is used to report locks consistently. For example, if a lock is
          acquired immediately, its address is reported to ittnotify via
          KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
          immediately and the lock routine calls KMP_WAIT_YIELD(), the latter
          should report the same address, not an address of the low-level
          spinner.
#endif // USE_ITT_BUILD
*/
template <typename UT>
// ToDo: make inline function (move to header file for icl)
static UT // unsigned 4- or 8-byte type
__kmp_wait_yield(
    volatile UT *spinner, UT checker,
    kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(
        void *obj) // Higher-level synchronization object, or NULL.
257 ) { 258 // note: we may not belong to a team at this point 259 volatile UT *spin = spinner; 260 UT check = checker; 261 kmp_uint32 spins; 262 kmp_uint32 (*f)(UT, UT) = pred; 263 UT r; 264 265 KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin)); 266 KMP_INIT_YIELD(spins); 267 // main wait spin loop 268 while (!f(r = *spin, check)) { 269 KMP_FSYNC_SPIN_PREPARE(obj); 270 /* GEH - remove this since it was accidentally introduced when kmp_wait was 271 split. It causes problems with infinite recursion because of exit lock */ 272 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 273 __kmp_abort_thread(); */ 274 275 // if we are oversubscribed, or have waited a bit (and 276 // KMP_LIBRARY=throughput, then yield. pause is in the following code 277 KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc); 278 KMP_YIELD_SPIN(spins); 279 } 280 KMP_FSYNC_SPIN_ACQUIRED(obj); 281 return r; 282 } 283 284 template <typename UT> static kmp_uint32 __kmp_eq(UT value, UT checker) { 285 return value == checker; 286 } 287 288 template <typename UT> static kmp_uint32 __kmp_neq(UT value, UT checker) { 289 return value != checker; 290 } 291 292 template <typename UT> static kmp_uint32 __kmp_lt(UT value, UT checker) { 293 return value < checker; 294 } 295 296 template <typename UT> static kmp_uint32 __kmp_ge(UT value, UT checker) { 297 return value >= checker; 298 } 299 300 template <typename UT> static kmp_uint32 __kmp_le(UT value, UT checker) { 301 return value <= checker; 302 } 303 304 /* ------------------------------------------------------------------------ */ 305 306 static void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, 307 ident_t *loc_ref) { 308 kmp_info_t *th; 309 310 KMP_DEBUG_ASSERT(gtid_ref); 311 312 if (__kmp_env_consistency_check) { 313 th = __kmp_threads[*gtid_ref]; 314 if (th->th.th_root->r.r_active && 315 (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) { 316 #if KMP_USE_DYNAMIC_LOCK 317 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0); 318 #else 319 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL); 320 #endif 321 } 322 } 323 } 324 325 template <typename UT> 326 static void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 327 typedef typename traits_t<UT>::signed_t ST; 328 dispatch_private_info_template<UT> *pr; 329 330 int gtid = *gtid_ref; 331 // int cid = *cid_ref; 332 kmp_info_t *th = __kmp_threads[gtid]; 333 KMP_DEBUG_ASSERT(th->th.th_dispatch); 334 335 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid)); 336 if (__kmp_env_consistency_check) { 337 pr = reinterpret_cast<dispatch_private_info_template<UT> *>( 338 th->th.th_dispatch->th_dispatch_pr_current); 339 if (pr->pushed_ws != ct_none) { 340 #if KMP_USE_DYNAMIC_LOCK 341 __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0); 342 #else 343 __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL); 344 #endif 345 } 346 } 347 348 if (!th->th.th_team->t.t_serialized) { 349 dispatch_shared_info_template<UT> *sh = 350 reinterpret_cast<dispatch_shared_info_template<UT> *>( 351 th->th.th_dispatch->th_dispatch_sh_current); 352 UT lower; 353 354 if (!__kmp_env_consistency_check) { 355 pr = reinterpret_cast<dispatch_private_info_template<UT> *>( 356 th->th.th_dispatch->th_dispatch_pr_current); 357 } 358 lower = pr->u.p.ordered_lower; 359 360 #if !defined(KMP_GOMP_COMPAT) 361 if (__kmp_env_consistency_check) { 362 if (pr->ordered_bumped) { 363 struct cons_header *p = __kmp_threads[gtid]->th.th_cons; 364 __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting, 365 
ct_ordered_in_pdo, loc_ref, 366 &p->stack_data[p->w_top]); 367 } 368 } 369 #endif /* !defined(KMP_GOMP_COMPAT) */ 370 371 KMP_MB(); 372 #ifdef KMP_DEBUG 373 { 374 const char *buff; 375 // create format specifiers before the debug output 376 buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: " 377 "ordered_iter:%%%s lower:%%%s\n", 378 traits_t<UT>::spec, traits_t<UT>::spec); 379 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 380 __kmp_str_free(&buff); 381 } 382 #endif 383 384 __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower, 385 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 386 KMP_MB(); /* is this necessary? */ 387 #ifdef KMP_DEBUG 388 { 389 const char *buff; 390 // create format specifiers before the debug output 391 buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: " 392 "ordered_iter:%%%s lower:%%%s\n", 393 traits_t<UT>::spec, traits_t<UT>::spec); 394 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 395 __kmp_str_free(&buff); 396 } 397 #endif 398 } 399 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid)); 400 } 401 402 static void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, 403 ident_t *loc_ref) { 404 kmp_info_t *th; 405 406 if (__kmp_env_consistency_check) { 407 th = __kmp_threads[*gtid_ref]; 408 if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) { 409 __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref); 410 } 411 } 412 } 413 414 template <typename UT> 415 static void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 416 typedef typename traits_t<UT>::signed_t ST; 417 dispatch_private_info_template<UT> *pr; 418 419 int gtid = *gtid_ref; 420 // int cid = *cid_ref; 421 kmp_info_t *th = __kmp_threads[gtid]; 422 KMP_DEBUG_ASSERT(th->th.th_dispatch); 423 424 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid)); 425 if (__kmp_env_consistency_check) { 426 pr = reinterpret_cast<dispatch_private_info_template<UT> *>( 427 th->th.th_dispatch->th_dispatch_pr_current); 428 if (pr->pushed_ws != ct_none) { 429 __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref); 430 } 431 } 432 433 if (!th->th.th_team->t.t_serialized) { 434 dispatch_shared_info_template<UT> *sh = 435 reinterpret_cast<dispatch_shared_info_template<UT> *>( 436 th->th.th_dispatch->th_dispatch_sh_current); 437 438 if (!__kmp_env_consistency_check) { 439 pr = reinterpret_cast<dispatch_private_info_template<UT> *>( 440 th->th.th_dispatch->th_dispatch_pr_current); 441 } 442 443 KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration)); 444 #if !defined(KMP_GOMP_COMPAT) 445 if (__kmp_env_consistency_check) { 446 if (pr->ordered_bumped != 0) { 447 struct cons_header *p = __kmp_threads[gtid]->th.th_cons; 448 /* How to test it? - OM */ 449 __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting, 450 ct_ordered_in_pdo, loc_ref, 451 &p->stack_data[p->w_top]); 452 } 453 } 454 #endif /* !defined(KMP_GOMP_COMPAT) */ 455 456 KMP_MB(); /* Flush all pending memory write invalidates. */ 457 458 pr->ordered_bumped += 1; 459 460 KD_TRACE(1000, 461 ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n", 462 gtid, pr->ordered_bumped)); 463 464 KMP_MB(); /* Flush all pending memory write invalidates. */ 465 466 /* TODO use general release procedure? */ 467 test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration); 468 469 KMP_MB(); /* Flush all pending memory write invalidates. 
  */
  }
  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
}

// Computes and returns x to the power of y, where y must be a non-negative
// integer.
template <typename UT>
static __forceinline long double __kmp_pow(long double x, UT y) {
  long double s = 1.0L;

  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
  // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
  while (y) {
    if (y & 1)
      s *= x;
    x *= x;
    y >>= 1;
  }
  return s;
}

/* Computes and returns the number of unassigned iterations after idx chunks
   have been assigned (the total number of unassigned iterations in chunks with
   index greater than or equal to idx). __forceinline seems to be broken so
   that if we __forceinline this function, the behavior is wrong
   (one of the unit tests, sch_guided_analytical_basic.cpp, fails) */
template <typename T>
static __inline typename traits_t<T>::unsigned_t
__kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
                                typename traits_t<T>::unsigned_t idx) {
  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at least for
     ICL 8.1, long double arithmetic may not really have long double precision,
     even with /Qlong_double. Currently, we work around that in the caller
     code, by manipulating the FPCW for Windows* OS on IA-32 architecture. The
     lack of precision is not expected to be a correctness issue, though. */
  typedef typename traits_t<T>::unsigned_t UT;

  long double x = tc * __kmp_pow<UT>(base, idx);
  UT r = (UT)x;
  if (x == r)
    return r;
  return r + 1;
}

// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
//   p3 = 1 / ( n * nproc )         // remaining iterations multiplier
// by default n = 2. For example, with n = 3 the chunks distribution will be
// more flat.
// With n = 1 the first chunk is the same as for a static schedule, e.g.
// trip / nproc.
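// A minimal standalone sketch (not compiled: guarded by '#if 0') of how p2 and
// p3 shape the chunk sequence. It models the shrinking-chunk behavior
// serially; the real implementation below does this concurrently with atomics
// in __kmp_dispatch_next(). The nproc, chunk and trip values are made up.
#if 0
#include <cstdio>

int main() {
  const unsigned nproc = 4, chunk = 8;
  unsigned long trip = 1000, assigned = 0;
  const unsigned long p2 = 2UL * nproc * (chunk + 1); // switch point (n = 2)
  const double p3 = 1.0 / (2.0 * nproc);              // remaining multiplier
  while (assigned < trip) {
    unsigned long remaining = trip - assigned;
    unsigned long next;
    if (remaining < p2) // tail: plain dynamic, one chunk at a time
      next = remaining < chunk ? remaining : chunk;
    else // guided: take a fixed fraction of the remaining iterations
      next = (unsigned long)(remaining * p3);
    std::printf("next chunk: %lu iterations\n", next);
    assigned += next;
  }
  return 0;
}
#endif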
519 static int guided_int_param = 2; 520 static double guided_flt_param = 0.5; // = 1.0 / guided_int_param; 521 522 // UT - unsigned flavor of T, ST - signed flavor of T, 523 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8 524 template <typename T> 525 static void 526 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb, 527 T ub, typename traits_t<T>::signed_t st, 528 typename traits_t<T>::signed_t chunk, int push_ws) { 529 typedef typename traits_t<T>::unsigned_t UT; 530 typedef typename traits_t<T>::signed_t ST; 531 typedef typename traits_t<T>::floating_t DBL; 532 533 int active; 534 T tc; 535 kmp_info_t *th; 536 kmp_team_t *team; 537 kmp_uint32 my_buffer_index; 538 dispatch_private_info_template<T> *pr; 539 dispatch_shared_info_template<UT> volatile *sh; 540 541 KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) == 542 sizeof(dispatch_private_info)); 543 KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) == 544 sizeof(dispatch_shared_info)); 545 546 if (!TCR_4(__kmp_init_parallel)) 547 __kmp_parallel_initialize(); 548 549 #if INCLUDE_SSC_MARKS 550 SSC_MARK_DISPATCH_INIT(); 551 #endif 552 #ifdef KMP_DEBUG 553 { 554 const char *buff; 555 // create format specifiers before the debug output 556 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d " 557 "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n", 558 traits_t<ST>::spec, traits_t<T>::spec, 559 traits_t<T>::spec, traits_t<ST>::spec); 560 KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st)); 561 __kmp_str_free(&buff); 562 } 563 #endif 564 /* setup data */ 565 th = __kmp_threads[gtid]; 566 team = th->th.th_team; 567 active = !team->t.t_serialized; 568 th->th.th_ident = loc; 569 570 #if USE_ITT_BUILD 571 kmp_uint64 cur_chunk = chunk; 572 int itt_need_metadata_reporting = __itt_metadata_add_ptr && 573 __kmp_forkjoin_frames_mode == 3 && 574 KMP_MASTER_GTID(gtid) && 575 #if OMP_40_ENABLED 576 th->th.th_teams_microtask == NULL && 577 #endif 578 team->t.t_active_level == 1; 579 #endif 580 if (!active) { 581 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 582 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 583 } else { 584 KMP_DEBUG_ASSERT(th->th.th_dispatch == 585 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 586 587 my_buffer_index = th->th.th_dispatch->th_disp_index++; 588 589 /* What happens when number of threads changes, need to resize buffer? 
*/ 590 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 591 &th->th.th_dispatch 592 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 593 sh = reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 594 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 595 } 596 597 #if (KMP_STATIC_STEAL_ENABLED) 598 if (SCHEDULE_HAS_NONMONOTONIC(schedule)) 599 // AC: we now have only one implementation of stealing, so use it 600 schedule = kmp_sch_static_steal; 601 else 602 #endif 603 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 604 605 /* Pick up the nomerge/ordered bits from the scheduling type */ 606 if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) { 607 pr->nomerge = TRUE; 608 schedule = 609 (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); 610 } else { 611 pr->nomerge = FALSE; 612 } 613 pr->type_size = traits_t<T>::type_size; // remember the size of variables 614 if (kmp_ord_lower & schedule) { 615 pr->ordered = TRUE; 616 schedule = 617 (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); 618 } else { 619 pr->ordered = FALSE; 620 } 621 622 if (schedule == kmp_sch_static) { 623 schedule = __kmp_static; 624 } else { 625 if (schedule == kmp_sch_runtime) { 626 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if 627 // not specified) 628 schedule = team->t.t_sched.r_sched_type; 629 // Detail the schedule if needed (global controls are differentiated 630 // appropriately) 631 if (schedule == kmp_sch_guided_chunked) { 632 schedule = __kmp_guided; 633 } else if (schedule == kmp_sch_static) { 634 schedule = __kmp_static; 635 } 636 // Use the chunk size specified by OMP_SCHEDULE (or default if not 637 // specified) 638 chunk = team->t.t_sched.chunk; 639 #if USE_ITT_BUILD 640 cur_chunk = chunk; 641 #endif 642 #ifdef KMP_DEBUG 643 { 644 const char *buff; 645 // create format specifiers before the debug output 646 buff = __kmp_str_format( 647 "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n", 648 traits_t<ST>::spec); 649 KD_TRACE(10, (buff, gtid, schedule, chunk)); 650 __kmp_str_free(&buff); 651 } 652 #endif 653 } else { 654 if (schedule == kmp_sch_guided_chunked) { 655 schedule = __kmp_guided; 656 } 657 if (chunk <= 0) { 658 chunk = KMP_DEFAULT_CHUNK; 659 } 660 } 661 662 if (schedule == kmp_sch_auto) { 663 // mapping and differentiation: in the __kmp_do_serial_initialize() 664 schedule = __kmp_auto; 665 #ifdef KMP_DEBUG 666 { 667 const char *buff; 668 // create format specifiers before the debug output 669 buff = __kmp_str_format("__kmp_dispatch_init: kmp_sch_auto: T#%%d new: " 670 "schedule:%%d chunk:%%%s\n", 671 traits_t<ST>::spec); 672 KD_TRACE(10, (buff, gtid, schedule, chunk)); 673 __kmp_str_free(&buff); 674 } 675 #endif 676 } 677 678 /* guided analytical not safe for too many threads */ 679 if (schedule == kmp_sch_guided_analytical_chunked && 680 th->th.th_team_nproc > 1 << 20) { 681 schedule = kmp_sch_guided_iterative_chunked; 682 KMP_WARNING(DispatchManyThreads); 683 } 684 if (schedule == kmp_sch_runtime_simd) { 685 // compiler provides simd_width in the chunk parameter 686 schedule = team->t.t_sched.r_sched_type; 687 // Detail the schedule if needed (global controls are differentiated 688 // appropriately) 689 if (schedule == kmp_sch_static || schedule == kmp_sch_auto || 690 schedule == __kmp_static) { 691 schedule = kmp_sch_static_balanced_chunked; 692 } else { 693 if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) { 694 schedule = 
kmp_sch_guided_simd; 695 } 696 chunk = team->t.t_sched.chunk * chunk; 697 } 698 #if USE_ITT_BUILD 699 cur_chunk = chunk; 700 #endif 701 #ifdef KMP_DEBUG 702 { 703 const char *buff; 704 // create format specifiers before the debug output 705 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d" 706 " chunk:%%%s\n", 707 traits_t<ST>::spec); 708 KD_TRACE(10, (buff, gtid, schedule, chunk)); 709 __kmp_str_free(&buff); 710 } 711 #endif 712 } 713 pr->u.p.parm1 = chunk; 714 } 715 KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper), 716 "unknown scheduling type"); 717 718 pr->u.p.count = 0; 719 720 if (__kmp_env_consistency_check) { 721 if (st == 0) { 722 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, 723 (pr->ordered ? ct_pdo_ordered : ct_pdo), loc); 724 } 725 } 726 // compute trip count 727 if (st == 1) { // most common case 728 if (ub >= lb) { 729 tc = ub - lb + 1; 730 } else { // ub < lb 731 tc = 0; // zero-trip 732 } 733 } else if (st < 0) { 734 if (lb >= ub) { 735 // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B), 736 // where the division needs to be unsigned regardless of the result type 737 tc = (UT)(lb - ub) / (-st) + 1; 738 } else { // lb < ub 739 tc = 0; // zero-trip 740 } 741 } else { // st > 0 742 if (ub >= lb) { 743 // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B), 744 // where the division needs to be unsigned regardless of the result type 745 tc = (UT)(ub - lb) / st + 1; 746 } else { // ub < lb 747 tc = 0; // zero-trip 748 } 749 } 750 751 // Any half-decent optimizer will remove this test when the blocks are empty 752 // since the macros expand to nothing when statistics are disabled. 753 if (schedule == __kmp_static) { 754 KMP_COUNT_BLOCK(OMP_FOR_static); 755 KMP_COUNT_VALUE(FOR_static_iterations, tc); 756 } else { 757 KMP_COUNT_BLOCK(OMP_FOR_dynamic); 758 KMP_COUNT_VALUE(FOR_dynamic_iterations, tc); 759 } 760 761 pr->u.p.lb = lb; 762 pr->u.p.ub = ub; 763 pr->u.p.st = st; 764 pr->u.p.tc = tc; 765 766 #if KMP_OS_WINDOWS 767 pr->u.p.last_upper = ub + st; 768 #endif /* KMP_OS_WINDOWS */ 769 770 /* NOTE: only the active parallel region(s) has active ordered sections */ 771 772 if (active) { 773 if (pr->ordered == 0) { 774 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error; 775 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error; 776 } else { 777 pr->ordered_bumped = 0; 778 779 pr->u.p.ordered_lower = 1; 780 pr->u.p.ordered_upper = 0; 781 782 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>; 783 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>; 784 } 785 } 786 787 if (__kmp_env_consistency_check) { 788 enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo; 789 if (push_ws) { 790 __kmp_push_workshare(gtid, ws, loc); 791 pr->pushed_ws = ws; 792 } else { 793 __kmp_check_workshare(gtid, ws, loc); 794 pr->pushed_ws = ct_none; 795 } 796 } 797 798 switch (schedule) { 799 #if (KMP_STATIC_STEAL_ENABLED) 800 case kmp_sch_static_steal: { 801 T nproc = th->th.th_team_nproc; 802 T ntc, init; 803 804 KD_TRACE(100, 805 ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid)); 806 807 ntc = (tc % chunk ? 1 : 0) + tc / chunk; 808 if (nproc > 1 && ntc >= nproc) { 809 KMP_COUNT_BLOCK(OMP_FOR_static_steal); 810 T id = __kmp_tid_from_gtid(gtid); 811 T small_chunk, extras; 812 813 small_chunk = ntc / nproc; 814 extras = ntc % nproc; 815 816 init = id * small_chunk + (id < extras ? id : extras); 817 pr->u.p.count = init; 818 pr->u.p.ub = init + small_chunk + (id < extras ? 
1 : 0); 819 820 pr->u.p.parm2 = lb; 821 // pr->pfields.parm3 = 0; // it's not used in static_steal 822 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid 823 pr->u.p.st = st; 824 if (traits_t<T>::type_size > 4) { 825 // AC: TODO: check if 16-byte CAS available and use it to 826 // improve performance (probably wait for explicit request 827 // before spending time on this). 828 // For now use dynamically allocated per-thread lock, 829 // free memory in __kmp_dispatch_next when status==0. 830 KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL); 831 th->th.th_dispatch->th_steal_lock = 832 (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t)); 833 __kmp_init_lock(th->th.th_dispatch->th_steal_lock); 834 } 835 break; 836 } else { 837 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to " 838 "kmp_sch_static_balanced\n", 839 gtid)); 840 schedule = kmp_sch_static_balanced; 841 /* too few iterations: fall-through to kmp_sch_static_balanced */ 842 } // if 843 /* FALL-THROUGH to static balanced */ 844 } // case 845 #endif 846 case kmp_sch_static_balanced: { 847 T nproc = th->th.th_team_nproc; 848 T init, limit; 849 850 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n", 851 gtid)); 852 853 if (nproc > 1) { 854 T id = __kmp_tid_from_gtid(gtid); 855 856 if (tc < nproc) { 857 if (id < tc) { 858 init = id; 859 limit = id; 860 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ 861 } else { 862 pr->u.p.count = 1; /* means no more chunks to execute */ 863 pr->u.p.parm1 = FALSE; 864 break; 865 } 866 } else { 867 T small_chunk = tc / nproc; 868 T extras = tc % nproc; 869 init = id * small_chunk + (id < extras ? id : extras); 870 limit = init + small_chunk - (id < extras ? 0 : 1); 871 pr->u.p.parm1 = (id == nproc - 1); 872 } 873 } else { 874 if (tc > 0) { 875 init = 0; 876 limit = tc - 1; 877 pr->u.p.parm1 = TRUE; 878 } else { // zero trip count 879 pr->u.p.count = 1; /* means no more chunks to execute */ 880 pr->u.p.parm1 = FALSE; 881 break; 882 } 883 } 884 #if USE_ITT_BUILD 885 // Calculate chunk for metadata report 886 if (itt_need_metadata_reporting) 887 cur_chunk = limit - init + 1; 888 #endif 889 if (st == 1) { 890 pr->u.p.lb = lb + init; 891 pr->u.p.ub = lb + limit; 892 } else { 893 // calculated upper bound, "ub" is user-defined upper bound 894 T ub_tmp = lb + limit * st; 895 pr->u.p.lb = lb + init * st; 896 // adjust upper bound to "ub" if needed, so that MS lastprivate will match 897 // it exactly 898 if (st > 0) { 899 pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp); 900 } else { 901 pr->u.p.ub = (ub_tmp + st < ub ? 
ub : ub_tmp); 902 } 903 } 904 if (pr->ordered) { 905 pr->u.p.ordered_lower = init; 906 pr->u.p.ordered_upper = limit; 907 } 908 break; 909 } // case 910 case kmp_sch_static_balanced_chunked: { 911 // similar to balanced, but chunk adjusted to multiple of simd width 912 T nth = th->th.th_team_nproc; 913 KD_TRACE(100, ("__kmp_dispatch_init: T#%d runtime(simd:static)" 914 " -> falling-through to static_greedy\n", 915 gtid)); 916 schedule = kmp_sch_static_greedy; 917 if (nth > 1) 918 pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1); 919 else 920 pr->u.p.parm1 = tc; 921 break; 922 } // case 923 case kmp_sch_guided_iterative_chunked: 924 case kmp_sch_guided_simd: { 925 T nproc = th->th.th_team_nproc; 926 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked" 927 " case\n", 928 gtid)); 929 930 if (nproc > 1) { 931 if ((2L * chunk + 1) * nproc >= tc) { 932 /* chunk size too large, switch to dynamic */ 933 schedule = kmp_sch_dynamic_chunked; 934 } else { 935 // when remaining iters become less than parm2 - switch to dynamic 936 pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1); 937 *(double *)&pr->u.p.parm3 = 938 guided_flt_param / nproc; // may occupy parm3 and parm4 939 } 940 } else { 941 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to " 942 "kmp_sch_static_greedy\n", 943 gtid)); 944 schedule = kmp_sch_static_greedy; 945 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 946 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n", 947 gtid)); 948 pr->u.p.parm1 = tc; 949 } // if 950 } // case 951 break; 952 case kmp_sch_guided_analytical_chunked: { 953 T nproc = th->th.th_team_nproc; 954 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked" 955 " case\n", 956 gtid)); 957 if (nproc > 1) { 958 if ((2L * chunk + 1) * nproc >= tc) { 959 /* chunk size too large, switch to dynamic */ 960 schedule = kmp_sch_dynamic_chunked; 961 } else { 962 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 963 DBL x; 964 965 #if KMP_OS_WINDOWS && KMP_ARCH_X86 966 /* Linux* OS already has 64-bit computation by default for long double, 967 and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On 968 Windows* OS on IA-32 architecture, we need to set precision to 64-bit 969 instead of the default 53-bit. Even though long double doesn't work 970 on Windows* OS on Intel(R) 64, the resulting lack of precision is not 971 expected to impact the correctness of the algorithm, but this has not 972 been mathematically proven. 
*/ 973 // save original FPCW and set precision to 64-bit, as 974 // Windows* OS on IA-32 architecture defaults to 53-bit 975 unsigned int oldFpcw = _control87(0, 0); 976 _control87(_PC_64, _MCW_PC); // 0,0x30000 977 #endif 978 /* value used for comparison in solver for cross-over point */ 979 long double target = ((long double)chunk * 2 + 1) * nproc / tc; 980 981 /* crossover point--chunk indexes equal to or greater than 982 this point switch to dynamic-style scheduling */ 983 UT cross; 984 985 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 986 x = (long double)1.0 - (long double)0.5 / nproc; 987 988 #ifdef KMP_DEBUG 989 { // test natural alignment 990 struct _test_a { 991 char a; 992 union { 993 char b; 994 DBL d; 995 }; 996 } t; 997 ptrdiff_t natural_alignment = 998 (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; 999 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long 1000 // long)natural_alignment ); 1001 KMP_DEBUG_ASSERT( 1002 (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0); 1003 } 1004 #endif // KMP_DEBUG 1005 1006 /* save the term in thread private dispatch structure */ 1007 *(DBL *)&pr->u.p.parm3 = x; 1008 1009 /* solve for the crossover point to the nearest integer i for which C_i 1010 <= chunk */ 1011 { 1012 UT left, right, mid; 1013 long double p; 1014 1015 /* estimate initial upper and lower bound */ 1016 1017 /* doesn't matter what value right is as long as it is positive, but 1018 it affects performance of the solver */ 1019 right = 229; 1020 p = __kmp_pow<UT>(x, right); 1021 if (p > target) { 1022 do { 1023 p *= p; 1024 right <<= 1; 1025 } while (p > target && right < (1 << 27)); 1026 /* lower bound is previous (failed) estimate of upper bound */ 1027 left = right >> 1; 1028 } else { 1029 left = 0; 1030 } 1031 1032 /* bisection root-finding method */ 1033 while (left + 1 < right) { 1034 mid = (left + right) / 2; 1035 if (__kmp_pow<UT>(x, mid) > target) { 1036 left = mid; 1037 } else { 1038 right = mid; 1039 } 1040 } // while 1041 cross = right; 1042 } 1043 /* assert sanity of computed crossover point */ 1044 KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target && 1045 __kmp_pow<UT>(x, cross) <= target); 1046 1047 /* save the crossover point in thread private dispatch structure */ 1048 pr->u.p.parm2 = cross; 1049 1050 // C75803 1051 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8)) 1052 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3) 1053 #else 1054 #define GUIDED_ANALYTICAL_WORKAROUND (x) 1055 #endif 1056 /* dynamic-style scheduling offset */ 1057 pr->u.p.count = tc - __kmp_dispatch_guided_remaining( 1058 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - 1059 cross * chunk; 1060 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1061 // restore FPCW 1062 _control87(oldFpcw, _MCW_PC); 1063 #endif 1064 } // if 1065 } else { 1066 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to " 1067 "kmp_sch_static_greedy\n", 1068 gtid)); 1069 schedule = kmp_sch_static_greedy; 1070 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 1071 pr->u.p.parm1 = tc; 1072 } // if 1073 } // case 1074 break; 1075 case kmp_sch_static_greedy: 1076 KD_TRACE(100, 1077 ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n", gtid)); 1078 pr->u.p.parm1 = (th->th.th_team_nproc > 1) 1079 ? 
(tc + th->th.th_team_nproc - 1) / th->th.th_team_nproc 1080 : tc; 1081 break; 1082 case kmp_sch_static_chunked: 1083 case kmp_sch_dynamic_chunked: 1084 if (pr->u.p.parm1 <= 0) { 1085 pr->u.p.parm1 = KMP_DEFAULT_CHUNK; 1086 } 1087 KD_TRACE(100, ("__kmp_dispatch_init: T#%d " 1088 "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", 1089 gtid)); 1090 break; 1091 case kmp_sch_trapezoidal: { 1092 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ 1093 1094 T parm1, parm2, parm3, parm4; 1095 KD_TRACE(100, 1096 ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid)); 1097 1098 parm1 = chunk; 1099 1100 /* F : size of the first cycle */ 1101 parm2 = (tc / (2 * th->th.th_team_nproc)); 1102 1103 if (parm2 < 1) { 1104 parm2 = 1; 1105 } 1106 1107 /* L : size of the last cycle. Make sure the last cycle is not larger 1108 than the first cycle. */ 1109 if (parm1 < 1) { 1110 parm1 = 1; 1111 } else if (parm1 > parm2) { 1112 parm1 = parm2; 1113 } 1114 1115 /* N : number of cycles */ 1116 parm3 = (parm2 + parm1); 1117 parm3 = (2 * tc + parm3 - 1) / parm3; 1118 1119 if (parm3 < 2) { 1120 parm3 = 2; 1121 } 1122 1123 /* sigma : decreasing incr of the trapezoid */ 1124 parm4 = (parm3 - 1); 1125 parm4 = (parm2 - parm1) / parm4; 1126 1127 // pointless check, because parm4 >= 0 always 1128 // if ( parm4 < 0 ) { 1129 // parm4 = 0; 1130 //} 1131 1132 pr->u.p.parm1 = parm1; 1133 pr->u.p.parm2 = parm2; 1134 pr->u.p.parm3 = parm3; 1135 pr->u.p.parm4 = parm4; 1136 } // case 1137 break; 1138 1139 default: { 1140 __kmp_msg(kmp_ms_fatal, // Severity 1141 KMP_MSG(UnknownSchedTypeDetected), // Primary message 1142 KMP_HNT(GetNewerLibrary), // Hint 1143 __kmp_msg_null // Variadic argument list terminator 1144 ); 1145 } break; 1146 } // switch 1147 pr->schedule = schedule; 1148 if (active) { 1149 /* The name of this buffer should be my_buffer_index when it's free to use 1150 * it */ 1151 1152 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d " 1153 "sh->buffer_index:%d\n", 1154 gtid, my_buffer_index, sh->buffer_index)); 1155 __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index, 1156 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL)); 1157 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and 1158 // my_buffer_index are *always* 32-bit integers. 1159 KMP_MB(); /* is this necessary? */ 1160 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d " 1161 "sh->buffer_index:%d\n", 1162 gtid, my_buffer_index, sh->buffer_index)); 1163 1164 th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr; 1165 th->th.th_dispatch->th_dispatch_sh_current = 1166 CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh); 1167 #if USE_ITT_BUILD 1168 if (pr->ordered) { 1169 __kmp_itt_ordered_init(gtid); 1170 }; // if 1171 // Report loop metadata 1172 if (itt_need_metadata_reporting) { 1173 // Only report metadata by master of active team at level 1 1174 kmp_uint64 schedtype = 0; 1175 switch (schedule) { 1176 case kmp_sch_static_chunked: 1177 case kmp_sch_static_balanced: // Chunk is calculated in the switch above 1178 break; 1179 case kmp_sch_static_greedy: 1180 cur_chunk = pr->u.p.parm1; 1181 break; 1182 case kmp_sch_dynamic_chunked: 1183 schedtype = 1; 1184 break; 1185 case kmp_sch_guided_iterative_chunked: 1186 case kmp_sch_guided_analytical_chunked: 1187 case kmp_sch_guided_simd: 1188 schedtype = 2; 1189 break; 1190 default: 1191 // Should we put this case under "static"? 
1192 // case kmp_sch_static_steal: 1193 schedtype = 3; 1194 break; 1195 } 1196 __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk); 1197 } 1198 #endif /* USE_ITT_BUILD */ 1199 }; // if 1200 1201 #ifdef KMP_DEBUG 1202 { 1203 const char *buff; 1204 // create format specifiers before the debug output 1205 buff = __kmp_str_format( 1206 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s " 1207 "lb:%%%s ub:%%%s" 1208 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" 1209 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n", 1210 traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec, 1211 traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec, 1212 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec, 1213 traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec); 1214 KD_TRACE(10, (buff, gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub, 1215 pr->u.p.st, pr->u.p.tc, pr->u.p.count, pr->u.p.ordered_lower, 1216 pr->u.p.ordered_upper, pr->u.p.parm1, pr->u.p.parm2, 1217 pr->u.p.parm3, pr->u.p.parm4)); 1218 __kmp_str_free(&buff); 1219 } 1220 #endif 1221 #if (KMP_STATIC_STEAL_ENABLED) 1222 // It cannot be guaranteed that after execution of a loop with some other 1223 // schedule kind all the parm3 variables will contain the same value. Even if 1224 // all parm3 will be the same, it still exists a bad case like using 0 and 1 1225 // rather than program life-time increment. So the dedicated variable is 1226 // required. The 'static_steal_counter' is used. 1227 if (schedule == kmp_sch_static_steal) { 1228 // Other threads will inspect this variable when searching for a victim. 1229 // This is a flag showing that other threads may steal from this thread 1230 // since then. 1231 volatile T *p = &pr->u.p.static_steal_counter; 1232 *p = *p + 1; 1233 } 1234 #endif // ( KMP_STATIC_STEAL_ENABLED ) 1235 1236 #if OMPT_SUPPORT && OMPT_TRACE 1237 if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_begin)) { 1238 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 1239 ompt_task_info_t *task_info = __ompt_get_taskinfo(0); 1240 ompt_callbacks.ompt_callback(ompt_event_loop_begin)( 1241 team_info->parallel_id, task_info->task_id, team_info->microtask); 1242 } 1243 #endif 1244 } 1245 1246 /* For ordered loops, either __kmp_dispatch_finish() should be called after 1247 * every iteration, or __kmp_dispatch_finish_chunk() should be called after 1248 * every chunk of iterations. If the ordered section(s) were not executed 1249 * for this iteration (or every iteration in this chunk), we need to set the 1250 * ordered iteration counters so that the next thread can proceed. 
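 *
 * A hedged sketch of one shape such compiler-generated code can take for a
 * 4-byte induction variable (assuming the usual __kmpc_* entry points, e.g.
 * __kmpc_dispatch_next_4 and __kmpc_ordered; actual codegen may differ):
 *
 *   while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
 *     for (i = lb; i <= ub; i += st) {
 *       __kmpc_ordered(loc, gtid); // may be skipped on some iterations
 *       ...
 *       __kmpc_end_ordered(loc, gtid);
 *       // per-iteration variant: __kmp_dispatch_finish() goes here
 *     }
 *     // per-chunk variant: __kmp_dispatch_finish_chunk() goes here
 *   }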
*/ 1251 template <typename UT> 1252 static void __kmp_dispatch_finish(int gtid, ident_t *loc) { 1253 typedef typename traits_t<UT>::signed_t ST; 1254 kmp_info_t *th = __kmp_threads[gtid]; 1255 1256 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid)); 1257 if (!th->th.th_team->t.t_serialized) { 1258 1259 dispatch_private_info_template<UT> *pr = 1260 reinterpret_cast<dispatch_private_info_template<UT> *>( 1261 th->th.th_dispatch->th_dispatch_pr_current); 1262 dispatch_shared_info_template<UT> volatile *sh = 1263 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 1264 th->th.th_dispatch->th_dispatch_sh_current); 1265 KMP_DEBUG_ASSERT(pr); 1266 KMP_DEBUG_ASSERT(sh); 1267 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1268 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1269 1270 if (pr->ordered_bumped) { 1271 KD_TRACE( 1272 1000, 1273 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1274 gtid)); 1275 pr->ordered_bumped = 0; 1276 } else { 1277 UT lower = pr->u.p.ordered_lower; 1278 1279 #ifdef KMP_DEBUG 1280 { 1281 const char *buff; 1282 // create format specifiers before the debug output 1283 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: " 1284 "ordered_iteration:%%%s lower:%%%s\n", 1285 traits_t<UT>::spec, traits_t<UT>::spec); 1286 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 1287 __kmp_str_free(&buff); 1288 } 1289 #endif 1290 1291 __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower, 1292 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 1293 KMP_MB(); /* is this necessary? */ 1294 #ifdef KMP_DEBUG 1295 { 1296 const char *buff; 1297 // create format specifiers before the debug output 1298 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: " 1299 "ordered_iteration:%%%s lower:%%%s\n", 1300 traits_t<UT>::spec, traits_t<UT>::spec); 1301 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 1302 __kmp_str_free(&buff); 1303 } 1304 #endif 1305 1306 test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration); 1307 } // if 1308 } // if 1309 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid)); 1310 } 1311 1312 #ifdef KMP_GOMP_COMPAT 1313 1314 template <typename UT> 1315 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) { 1316 typedef typename traits_t<UT>::signed_t ST; 1317 kmp_info_t *th = __kmp_threads[gtid]; 1318 1319 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid)); 1320 if (!th->th.th_team->t.t_serialized) { 1321 // int cid; 1322 dispatch_private_info_template<UT> *pr = 1323 reinterpret_cast<dispatch_private_info_template<UT> *>( 1324 th->th.th_dispatch->th_dispatch_pr_current); 1325 dispatch_shared_info_template<UT> volatile *sh = 1326 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 1327 th->th.th_dispatch->th_dispatch_sh_current); 1328 KMP_DEBUG_ASSERT(pr); 1329 KMP_DEBUG_ASSERT(sh); 1330 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1331 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1332 1333 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) { 1334 UT lower = pr->u.p.ordered_lower; 1335 UT upper = pr->u.p.ordered_upper; 1336 UT inc = upper - lower + 1; 1337 1338 if (pr->ordered_bumped == inc) { 1339 KD_TRACE( 1340 1000, 1341 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1342 gtid)); 1343 pr->ordered_bumped = 0; 1344 } else { 1345 inc -= pr->ordered_bumped; 1346 1347 #ifdef KMP_DEBUG 1348 { 1349 const char *buff; 1350 // create format specifiers before the debug output 1351 buff = __kmp_str_format( 1352 
"__kmp_dispatch_finish_chunk: T#%%d before wait: " 1353 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n", 1354 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec); 1355 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper)); 1356 __kmp_str_free(&buff); 1357 } 1358 #endif 1359 1360 __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower, 1361 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 1362 1363 KMP_MB(); /* is this necessary? */ 1364 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting " 1365 "ordered_bumped to zero\n", 1366 gtid)); 1367 pr->ordered_bumped = 0; 1368 //!!!!! TODO check if the inc should be unsigned, or signed??? 1369 #ifdef KMP_DEBUG 1370 { 1371 const char *buff; 1372 // create format specifiers before the debug output 1373 buff = __kmp_str_format( 1374 "__kmp_dispatch_finish_chunk: T#%%d after wait: " 1375 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n", 1376 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec, 1377 traits_t<UT>::spec); 1378 KD_TRACE(1000, 1379 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper)); 1380 __kmp_str_free(&buff); 1381 } 1382 #endif 1383 1384 test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc); 1385 } 1386 // } 1387 } 1388 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid)); 1389 } 1390 1391 #endif /* KMP_GOMP_COMPAT */ 1392 1393 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more 1394 work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini() 1395 is not called. */ 1396 #if OMPT_SUPPORT && OMPT_TRACE 1397 #define OMPT_LOOP_END \ 1398 if (status == 0) { \ 1399 if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_end)) { \ 1400 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \ 1401 ompt_task_info_t *task_info = __ompt_get_taskinfo(0); \ 1402 ompt_callbacks.ompt_callback(ompt_event_loop_end)( \ 1403 team_info->parallel_id, task_info->task_id); \ 1404 } \ 1405 } 1406 #else 1407 #define OMPT_LOOP_END // no-op 1408 #endif 1409 1410 template <typename T> 1411 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, 1412 T *p_lb, T *p_ub, 1413 typename traits_t<T>::signed_t *p_st) { 1414 1415 typedef typename traits_t<T>::unsigned_t UT; 1416 typedef typename traits_t<T>::signed_t ST; 1417 typedef typename traits_t<T>::floating_t DBL; 1418 1419 // This is potentially slightly misleading, schedule(runtime) will appear here 1420 // even if the actual runtme schedule is static. (Which points out a 1421 // disadavantage of schedule(runtime): even when static scheduling is used it 1422 // costs more than a compile time choice to use static scheduling would.) 1423 KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling); 1424 1425 int status; 1426 dispatch_private_info_template<T> *pr; 1427 kmp_info_t *th = __kmp_threads[gtid]; 1428 kmp_team_t *team = th->th.th_team; 1429 1430 KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL 1431 #ifdef KMP_DEBUG 1432 { 1433 const char *buff; 1434 // create format specifiers before the debug output 1435 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d called p_lb:%%%s " 1436 "p_ub:%%%s p_st:%%%s p_last: %%p\n", 1437 traits_t<T>::spec, traits_t<T>::spec, 1438 traits_t<ST>::spec); 1439 KD_TRACE(1000, (buff, gtid, *p_lb, *p_ub, p_st ? 
*p_st : 0, p_last)); 1440 __kmp_str_free(&buff); 1441 } 1442 #endif 1443 1444 if (team->t.t_serialized) { 1445 /* NOTE: serialize this dispatch becase we are not at the active level */ 1446 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 1447 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 1448 KMP_DEBUG_ASSERT(pr); 1449 1450 if ((status = (pr->u.p.tc != 0)) == 0) { 1451 *p_lb = 0; 1452 *p_ub = 0; 1453 // if ( p_last != NULL ) 1454 // *p_last = 0; 1455 if (p_st != NULL) 1456 *p_st = 0; 1457 if (__kmp_env_consistency_check) { 1458 if (pr->pushed_ws != ct_none) { 1459 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 1460 } 1461 } 1462 } else if (pr->nomerge) { 1463 kmp_int32 last; 1464 T start; 1465 UT limit, trip, init; 1466 ST incr; 1467 T chunk = pr->u.p.parm1; 1468 1469 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", 1470 gtid)); 1471 1472 init = chunk * pr->u.p.count++; 1473 trip = pr->u.p.tc - 1; 1474 1475 if ((status = (init <= trip)) == 0) { 1476 *p_lb = 0; 1477 *p_ub = 0; 1478 // if ( p_last != NULL ) 1479 // *p_last = 0; 1480 if (p_st != NULL) 1481 *p_st = 0; 1482 if (__kmp_env_consistency_check) { 1483 if (pr->pushed_ws != ct_none) { 1484 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 1485 } 1486 } 1487 } else { 1488 start = pr->u.p.lb; 1489 limit = chunk + init - 1; 1490 incr = pr->u.p.st; 1491 1492 if ((last = (limit >= trip)) != 0) { 1493 limit = trip; 1494 #if KMP_OS_WINDOWS 1495 pr->u.p.last_upper = pr->u.p.ub; 1496 #endif /* KMP_OS_WINDOWS */ 1497 } 1498 if (p_last != NULL) 1499 *p_last = last; 1500 if (p_st != NULL) 1501 *p_st = incr; 1502 if (incr == 1) { 1503 *p_lb = start + init; 1504 *p_ub = start + limit; 1505 } else { 1506 *p_lb = start + init * incr; 1507 *p_ub = start + limit * incr; 1508 } 1509 1510 if (pr->ordered) { 1511 pr->u.p.ordered_lower = init; 1512 pr->u.p.ordered_upper = limit; 1513 #ifdef KMP_DEBUG 1514 { 1515 const char *buff; 1516 // create format specifiers before the debug output 1517 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 1518 "ordered_lower:%%%s ordered_upper:%%%s\n", 1519 traits_t<UT>::spec, traits_t<UT>::spec); 1520 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 1521 pr->u.p.ordered_upper)); 1522 __kmp_str_free(&buff); 1523 } 1524 #endif 1525 } // if 1526 } // if 1527 } else { 1528 pr->u.p.tc = 0; 1529 *p_lb = pr->u.p.lb; 1530 *p_ub = pr->u.p.ub; 1531 #if KMP_OS_WINDOWS 1532 pr->u.p.last_upper = *p_ub; 1533 #endif /* KMP_OS_WINDOWS */ 1534 if (p_last != NULL) 1535 *p_last = TRUE; 1536 if (p_st != NULL) 1537 *p_st = pr->u.p.st; 1538 } // if 1539 #ifdef KMP_DEBUG 1540 { 1541 const char *buff; 1542 // create format specifiers before the debug output 1543 buff = __kmp_str_format( 1544 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " 1545 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", 1546 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 1547 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status)); 1548 __kmp_str_free(&buff); 1549 } 1550 #endif 1551 #if INCLUDE_SSC_MARKS 1552 SSC_MARK_DISPATCH_NEXT(); 1553 #endif 1554 OMPT_LOOP_END; 1555 return status; 1556 } else { 1557 kmp_int32 last = 0; 1558 dispatch_shared_info_template<UT> *sh; 1559 T start; 1560 ST incr; 1561 UT limit, trip, init; 1562 1563 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1564 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1565 1566 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 1567 
th->th.th_dispatch->th_dispatch_pr_current); 1568 KMP_DEBUG_ASSERT(pr); 1569 sh = reinterpret_cast<dispatch_shared_info_template<UT> *>( 1570 th->th.th_dispatch->th_dispatch_sh_current); 1571 KMP_DEBUG_ASSERT(sh); 1572 1573 if (pr->u.p.tc == 0) { 1574 // zero trip count 1575 status = 0; 1576 } else { 1577 switch (pr->schedule) { 1578 #if (KMP_STATIC_STEAL_ENABLED) 1579 case kmp_sch_static_steal: { 1580 T chunk = pr->u.p.parm1; 1581 int nproc = th->th.th_team_nproc; 1582 1583 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", 1584 gtid)); 1585 1586 trip = pr->u.p.tc - 1; 1587 1588 if (traits_t<T>::type_size > 4) { 1589 // use lock for 8-byte and CAS for 4-byte induction 1590 // variable. TODO (optional): check and use 16-byte CAS 1591 kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock; 1592 KMP_DEBUG_ASSERT(lck != NULL); 1593 if (pr->u.p.count < (UT)pr->u.p.ub) { 1594 __kmp_acquire_lock(lck, gtid); 1595 // try to get own chunk of iterations 1596 init = (pr->u.p.count)++; 1597 status = (init < (UT)pr->u.p.ub); 1598 __kmp_release_lock(lck, gtid); 1599 } else { 1600 status = 0; // no own chunks 1601 } 1602 if (!status) { // try to steal 1603 kmp_info_t **other_threads = team->t.t_threads; 1604 int while_limit = nproc; // nproc attempts to find a victim 1605 int while_index = 0; 1606 // TODO: algorithm of searching for a victim 1607 // should be cleaned up and measured 1608 while ((!status) && (while_limit != ++while_index)) { 1609 T remaining; 1610 T victimIdx = pr->u.p.parm4; 1611 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1; 1612 dispatch_private_info_template<T> *victim = 1613 reinterpret_cast<dispatch_private_info_template<T> *>( 1614 other_threads[victimIdx] 1615 ->th.th_dispatch->th_dispatch_pr_current); 1616 while ((victim == NULL || victim == pr || 1617 (*(volatile T *)&victim->u.p.static_steal_counter != 1618 *(volatile T *)&pr->u.p.static_steal_counter)) && 1619 oldVictimIdx != victimIdx) { 1620 victimIdx = (victimIdx + 1) % nproc; 1621 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1622 other_threads[victimIdx] 1623 ->th.th_dispatch->th_dispatch_pr_current); 1624 }; 1625 if (!victim || 1626 (*(volatile T *)&victim->u.p.static_steal_counter != 1627 *(volatile T *)&pr->u.p.static_steal_counter)) { 1628 continue; // try once more (nproc attempts in total) 1629 // no victim is ready yet to participate in stealing 1630 // because all victims are still in kmp_init_dispatch 1631 } 1632 if (victim->u.p.count + 2 > (UT)victim->u.p.ub) { 1633 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid 1634 continue; // not enough chunks to steal, goto next victim 1635 } 1636 1637 lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock; 1638 KMP_ASSERT(lck != NULL); 1639 __kmp_acquire_lock(lck, gtid); 1640 limit = victim->u.p.ub; // keep initial ub 1641 if (victim->u.p.count >= limit || 1642 (remaining = limit - victim->u.p.count) < 2) { 1643 __kmp_release_lock(lck, gtid); 1644 pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim 1645 continue; // not enough chunks to steal 1646 } 1647 // stealing succeded, reduce victim's ub by 1/4 of undone chunks 1648 // or by 1 1649 if (remaining > 3) { 1650 KMP_COUNT_VALUE(FOR_static_steal_stolen, remaining >> 2); 1651 init = (victim->u.p.ub -= 1652 (remaining >> 2)); // steal 1/4 of remaining 1653 } else { 1654 KMP_COUNT_VALUE(FOR_static_steal_stolen, 1); 1655 init = 1656 (victim->u.p.ub -= 1); // steal 1 chunk of 2 or 3 remaining 1657 } 1658 __kmp_release_lock(lck, gtid); 1659 1660 
KMP_DEBUG_ASSERT(init + 1 <= limit); 1661 pr->u.p.parm4 = victimIdx; // remember victim to steal from 1662 status = 1; 1663 while_index = 0; 1664 // now update own count and ub with stolen range but init chunk 1665 __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid); 1666 pr->u.p.count = init + 1; 1667 pr->u.p.ub = limit; 1668 __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid); 1669 } // while (search for victim) 1670 } // if (try to find victim and steal) 1671 } else { 1672 // 4-byte induction variable, use 8-byte CAS for pair (count, ub) 1673 typedef union { 1674 struct { 1675 UT count; 1676 T ub; 1677 } p; 1678 kmp_int64 b; 1679 } union_i4; 1680 // All operations on 'count' or 'ub' must be combined atomically 1681 // together. 1682 { 1683 union_i4 vold, vnew; 1684 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 1685 vnew = vold; 1686 vnew.p.count++; 1687 while (!KMP_COMPARE_AND_STORE_ACQ64( 1688 (volatile kmp_int64 *)&pr->u.p.count, 1689 *VOLATILE_CAST(kmp_int64 *) & vold.b, 1690 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 1691 KMP_CPU_PAUSE(); 1692 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 1693 vnew = vold; 1694 vnew.p.count++; 1695 } 1696 vnew = vold; 1697 init = vnew.p.count; 1698 status = (init < (UT)vnew.p.ub); 1699 } 1700 1701 if (!status) { 1702 kmp_info_t **other_threads = team->t.t_threads; 1703 int while_limit = nproc; // nproc attempts to find a victim 1704 int while_index = 0; 1705 1706 // TODO: algorithm of searching for a victim 1707 // should be cleaned up and measured 1708 while ((!status) && (while_limit != ++while_index)) { 1709 union_i4 vold, vnew; 1710 kmp_int32 remaining; 1711 T victimIdx = pr->u.p.parm4; 1712 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1; 1713 dispatch_private_info_template<T> *victim = 1714 reinterpret_cast<dispatch_private_info_template<T> *>( 1715 other_threads[victimIdx] 1716 ->th.th_dispatch->th_dispatch_pr_current); 1717 while ((victim == NULL || victim == pr || 1718 (*(volatile T *)&victim->u.p.static_steal_counter != 1719 *(volatile T *)&pr->u.p.static_steal_counter)) && 1720 oldVictimIdx != victimIdx) { 1721 victimIdx = (victimIdx + 1) % nproc; 1722 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1723 other_threads[victimIdx] 1724 ->th.th_dispatch->th_dispatch_pr_current); 1725 }; 1726 if (!victim || 1727 (*(volatile T *)&victim->u.p.static_steal_counter != 1728 *(volatile T *)&pr->u.p.static_steal_counter)) { 1729 continue; // try once more (nproc attempts in total) 1730 // no victim is ready yet to participate in stealing 1731 // because all victims are still in kmp_init_dispatch 1732 } 1733 pr->u.p.parm4 = victimIdx; // new victim found 1734 while (1) { // CAS loop if victim has enough chunks to steal 1735 vold.b = *(volatile kmp_int64 *)(&victim->u.p.count); 1736 vnew = vold; 1737 1738 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1739 if (vnew.p.count >= (UT)vnew.p.ub || 1740 (remaining = vnew.p.ub - vnew.p.count) < 2) { 1741 pr->u.p.parm4 = 1742 (victimIdx + 1) % nproc; // shift start victim id 1743 break; // not enough chunks to steal, goto next victim 1744 } 1745 if (remaining > 3) { 1746 vnew.p.ub -= (remaining >> 2); // try to steal 1/4 remaining 1747 } else { 1748 vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining 1749 } 1750 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1751 // TODO: Should this be acquire or release? 
1752 if (KMP_COMPARE_AND_STORE_ACQ64( 1753 (volatile kmp_int64 *)&victim->u.p.count, 1754 *VOLATILE_CAST(kmp_int64 *) & vold.b, 1755 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 1756 // stealing succeeded 1757 KMP_COUNT_VALUE(FOR_static_steal_stolen, 1758 vold.p.ub - vnew.p.ub); 1759 status = 1; 1760 while_index = 0; 1761 // now update own count and ub 1762 init = vnew.p.ub; 1763 vold.p.count = init + 1; 1764 #if KMP_ARCH_X86 1765 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), 1766 vold.b); 1767 #else 1768 *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b; 1769 #endif 1770 break; 1771 } // if (check CAS result) 1772 KMP_CPU_PAUSE(); // CAS failed, repeat attempt 1773 } // while (try to steal from particular victim) 1774 } // while (search for victim) 1775 } // if (try to find victim and steal) 1776 } // if (4-byte induction variable) 1777 if (!status) { 1778 *p_lb = 0; 1779 *p_ub = 0; 1780 if (p_st != NULL) 1781 *p_st = 0; 1782 } else { 1783 start = pr->u.p.parm2; 1784 init *= chunk; 1785 limit = chunk + init - 1; 1786 incr = pr->u.p.st; 1787 KMP_COUNT_VALUE(FOR_static_steal_chunks, 1); 1788 1789 KMP_DEBUG_ASSERT(init <= trip); 1790 if ((last = (limit >= trip)) != 0) 1791 limit = trip; 1792 if (p_st != NULL) 1793 *p_st = incr; 1794 1795 if (incr == 1) { 1796 *p_lb = start + init; 1797 *p_ub = start + limit; 1798 } else { 1799 *p_lb = start + init * incr; 1800 *p_ub = start + limit * incr; 1801 } 1802 1803 if (pr->ordered) { 1804 pr->u.p.ordered_lower = init; 1805 pr->u.p.ordered_upper = limit; 1806 #ifdef KMP_DEBUG 1807 { 1808 const char *buff; 1809 // create format specifiers before the debug output 1810 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 1811 "ordered_lower:%%%s ordered_upper:%%%s\n", 1812 traits_t<UT>::spec, traits_t<UT>::spec); 1813 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 1814 pr->u.p.ordered_upper)); 1815 __kmp_str_free(&buff); 1816 } 1817 #endif 1818 } // if 1819 } // if 1820 break; 1821 } // case 1822 #endif // ( KMP_STATIC_STEAL_ENABLED ) 1823 case kmp_sch_static_balanced: { 1824 KD_TRACE( 1825 100, 1826 ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid)); 1827 if ((status = !pr->u.p.count) != 1828 0) { /* check if thread has any iteration to do */ 1829 pr->u.p.count = 1; 1830 *p_lb = pr->u.p.lb; 1831 *p_ub = pr->u.p.ub; 1832 last = pr->u.p.parm1; 1833 if (p_st != NULL) 1834 *p_st = pr->u.p.st; 1835 } else { /* no iterations to do */ 1836 pr->u.p.lb = pr->u.p.ub + pr->u.p.st; 1837 } 1838 if (pr->ordered) { 1839 #ifdef KMP_DEBUG 1840 { 1841 const char *buff; 1842 // create format specifiers before the debug output 1843 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 1844 "ordered_lower:%%%s ordered_upper:%%%s\n", 1845 traits_t<UT>::spec, traits_t<UT>::spec); 1846 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 1847 pr->u.p.ordered_upper)); 1848 __kmp_str_free(&buff); 1849 } 1850 #endif 1851 } // if 1852 } // case 1853 break; 1854 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was 1855 merged here */ 1856 case kmp_sch_static_chunked: { 1857 T parm1; 1858 1859 KD_TRACE(100, ("__kmp_dispatch_next: T#%d " 1860 "kmp_sch_static_[affinity|chunked] case\n", 1861 gtid)); 1862 parm1 = pr->u.p.parm1; 1863 1864 trip = pr->u.p.tc - 1; 1865 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid)); 1866 1867 if ((status = (init <= trip)) != 0) { 1868 start = pr->u.p.lb; 1869 incr = pr->u.p.st; 1870 limit = parm1 + init - 1; 1871 1872 if ((last = (limit >= trip)) != 0) 1873 limit = trip; 1874 1875 if (p_st != 
NULL)
1876 *p_st = incr;
1877
1878 pr->u.p.count += th->th.th_team_nproc;
1879
1880 if (incr == 1) {
1881 *p_lb = start + init;
1882 *p_ub = start + limit;
1883 } else {
1884 *p_lb = start + init * incr;
1885 *p_ub = start + limit * incr;
1886 }
1887
1888 if (pr->ordered) {
1889 pr->u.p.ordered_lower = init;
1890 pr->u.p.ordered_upper = limit;
1891 #ifdef KMP_DEBUG
1892 {
1893 const char *buff;
1894 // create format specifiers before the debug output
1895 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1896 "ordered_lower:%%%s ordered_upper:%%%s\n",
1897 traits_t<UT>::spec, traits_t<UT>::spec);
1898 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1899 pr->u.p.ordered_upper));
1900 __kmp_str_free(&buff);
1901 }
1902 #endif
1903 } // if
1904 } // if
1905 } // case
1906 break;
1907
1908 case kmp_sch_dynamic_chunked: {
1909 T chunk = pr->u.p.parm1;
1910
1911 KD_TRACE(
1912 100,
1913 ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid));
1914
1915 init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1916 trip = pr->u.p.tc - 1;
1917
1918 if ((status = (init <= trip)) == 0) {
1919 *p_lb = 0;
1920 *p_ub = 0;
1921 if (p_st != NULL)
1922 *p_st = 0;
1923 } else {
1924 start = pr->u.p.lb;
1925 limit = chunk + init - 1;
1926 incr = pr->u.p.st;
1927
1928 if ((last = (limit >= trip)) != 0)
1929 limit = trip;
1930
1931 if (p_st != NULL)
1932 *p_st = incr;
1933
1934 if (incr == 1) {
1935 *p_lb = start + init;
1936 *p_ub = start + limit;
1937 } else {
1938 *p_lb = start + init * incr;
1939 *p_ub = start + limit * incr;
1940 }
1941
1942 if (pr->ordered) {
1943 pr->u.p.ordered_lower = init;
1944 pr->u.p.ordered_upper = limit;
1945 #ifdef KMP_DEBUG
1946 {
1947 const char *buff;
1948 // create format specifiers before the debug output
1949 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1950 "ordered_lower:%%%s ordered_upper:%%%s\n",
1951 traits_t<UT>::spec, traits_t<UT>::spec);
1952 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1953 pr->u.p.ordered_upper));
1954 __kmp_str_free(&buff);
1955 }
1956 #endif
1957 } // if
1958 } // if
1959 } // case
1960 break;
1961
1962 case kmp_sch_guided_iterative_chunked: {
1963 T chunkspec = pr->u.p.parm1;
1964 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked "
1965 "iterative case\n",
1966 gtid));
1967 trip = pr->u.p.tc;
1968 // Start atomic part of calculations
1969 while (1) {
1970 ST remaining; // signed, because can be < 0
1971 init = sh->u.s.iteration; // shared value
1972 remaining = trip - init;
1973 if (remaining <= 0) { // AC: need to compare with 0 first
1974 // nothing to do, don't try atomic op
1975 status = 0;
1976 break;
1977 }
1978 if ((T)remaining <
1979 pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1980 // use dynamic-style schedule
1981 // atomically increment iterations, get old value
1982 init = test_then_add<ST>(
1983 RCAST(volatile ST *, &sh->u.s.iteration), (ST)chunkspec);
1984 remaining = trip - init;
1985 if (remaining <= 0) {
1986 status = 0; // all iterations got by other threads
1987 } else { // got some iterations to work on
1988 status = 1;
1989 if ((T)remaining > chunkspec) {
1990 limit = init + chunkspec - 1;
1991 } else {
1992 last = 1; // the last chunk
1993 limit = init + remaining - 1;
1994 } // if
1995 } // if
1996 break;
1997 } // if
1998 limit = init + (UT)(remaining *
1999 *(double *)&pr->u.p.parm3); // divide by K*nproc
2000 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
2001 (ST)init, (ST)limit)) {
2002 // CAS was successful, chunk obtained
2003 status = 1;
2004 --limit;
2005 break;
2006 } // if
2007 } // while
2008 if (status != 0) {
2009 start = pr->u.p.lb;
2010 incr = pr->u.p.st;
2011 if (p_st != NULL)
2012 *p_st = incr;
2013 *p_lb = start + init * incr;
2014 *p_ub = start + limit * incr;
2015 if (pr->ordered) {
2016 pr->u.p.ordered_lower = init;
2017 pr->u.p.ordered_upper = limit;
2018 #ifdef KMP_DEBUG
2019 {
2020 const char *buff;
2021 // create format specifiers before the debug output
2022 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2023 "ordered_lower:%%%s ordered_upper:%%%s\n",
2024 traits_t<UT>::spec, traits_t<UT>::spec);
2025 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2026 pr->u.p.ordered_upper));
2027 __kmp_str_free(&buff);
2028 }
2029 #endif
2030 } // if
2031 } else {
2032 *p_lb = 0;
2033 *p_ub = 0;
2034 if (p_st != NULL)
2035 *p_st = 0;
2036 } // if
2037 } // case
2038 break;
2039
2040 case kmp_sch_guided_simd: {
2041 // same as iterative but curr-chunk adjusted to be multiple of given
2042 // chunk
2043 T chunk = pr->u.p.parm1;
2044 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_simd case\n",
2045 gtid));
2046 trip = pr->u.p.tc;
2047 // Start atomic part of calculations
2048 while (1) {
2049 ST remaining; // signed, because can be < 0
2050 init = sh->u.s.iteration; // shared value
2051 remaining = trip - init;
2052 if (remaining <= 0) { // AC: need to compare with 0 first
2053 status = 0; // nothing to do, don't try atomic op
2054 break;
2055 }
2056 KMP_DEBUG_ASSERT(init % chunk == 0);
2057 // compare with K*nproc*(chunk+1), K=2 by default
2058 if ((T)remaining < pr->u.p.parm2) {
2059 // use dynamic-style schedule
2060 // atomically increment iterations, get old value
2061 init = test_then_add<ST>(
2062 RCAST(volatile ST *, &sh->u.s.iteration), (ST)chunk);
2063 remaining = trip - init;
2064 if (remaining <= 0) {
2065 status = 0; // all iterations got by other threads
2066 } else {
2067 // got some iterations to work on
2068 status = 1;
2069 if ((T)remaining > chunk) {
2070 limit = init + chunk - 1;
2071 } else {
2072 last = 1; // the last chunk
2073 limit = init + remaining - 1;
2074 } // if
2075 } // if
2076 break;
2077 } // if
2078 // divide by K*nproc
2079 UT span = remaining * (*(double *)&pr->u.p.parm3);
2080 UT rem = span % chunk;
2081 if (rem) // adjust so that span%chunk == 0
2082 span += chunk - rem;
2083 limit = init + span;
2084 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
2085 (ST)init, (ST)limit)) {
2086 // CAS was successful, chunk obtained
2087 status = 1;
2088 --limit;
2089 break;
2090 } // if
2091 } // while
2092 if (status != 0) {
2093 start = pr->u.p.lb;
2094 incr = pr->u.p.st;
2095 if (p_st != NULL)
2096 *p_st = incr;
2097 *p_lb = start + init * incr;
2098 *p_ub = start + limit * incr;
2099 if (pr->ordered) {
2100 pr->u.p.ordered_lower = init;
2101 pr->u.p.ordered_upper = limit;
2102 #ifdef KMP_DEBUG
2103 {
2104 const char *buff;
2105 // create format specifiers before the debug output
2106 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2107 "ordered_lower:%%%s ordered_upper:%%%s\n",
2108 traits_t<UT>::spec, traits_t<UT>::spec);
2109 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2110 pr->u.p.ordered_upper));
2111 __kmp_str_free(&buff);
2112 }
2113 #endif
2114 } // if
2115 } else {
2116 *p_lb = 0;
2117 *p_ub = 0;
2118 if (p_st != NULL)
2119 *p_st = 0;
2120 } // if
2121 } // case
2122 break;
2123
2124 case kmp_sch_guided_analytical_chunked: {
2125 T chunkspec = pr->u.p.parm1;
2126 UT chunkIdx;
2127 #if KMP_OS_WINDOWS
&& KMP_ARCH_X86 2128 /* for storing original FPCW value for Windows* OS on 2129 IA-32 architecture 8-byte version */ 2130 unsigned int oldFpcw; 2131 unsigned int fpcwSet = 0; 2132 #endif 2133 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked " 2134 "analytical case\n", 2135 gtid)); 2136 2137 trip = pr->u.p.tc; 2138 2139 KMP_DEBUG_ASSERT(th->th.th_team_nproc > 1); 2140 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)th->th.th_team_nproc < 2141 trip); 2142 2143 while (1) { /* this while loop is a safeguard against unexpected zero 2144 chunk sizes */ 2145 chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 2146 if (chunkIdx >= (UT)pr->u.p.parm2) { 2147 --trip; 2148 /* use dynamic-style scheduling */ 2149 init = chunkIdx * chunkspec + pr->u.p.count; 2150 /* need to verify init > 0 in case of overflow in the above 2151 * calculation */ 2152 if ((status = (init > 0 && init <= trip)) != 0) { 2153 limit = init + chunkspec - 1; 2154 2155 if ((last = (limit >= trip)) != 0) 2156 limit = trip; 2157 } 2158 break; 2159 } else { 2160 /* use exponential-style scheduling */ 2161 /* The following check is to workaround the lack of long double precision on 2162 Windows* OS. 2163 This check works around the possible effect that init != 0 for chunkIdx == 0. 2164 */ 2165 #if KMP_OS_WINDOWS && KMP_ARCH_X86 2166 /* If we haven't already done so, save original FPCW and set 2167 precision to 64-bit, as Windows* OS on IA-32 architecture 2168 defaults to 53-bit */ 2169 if (!fpcwSet) { 2170 oldFpcw = _control87(0, 0); 2171 _control87(_PC_64, _MCW_PC); 2172 fpcwSet = 0x30000; 2173 } 2174 #endif 2175 if (chunkIdx) { 2176 init = __kmp_dispatch_guided_remaining<T>( 2177 trip, *(DBL *)&pr->u.p.parm3, chunkIdx); 2178 KMP_DEBUG_ASSERT(init); 2179 init = trip - init; 2180 } else 2181 init = 0; 2182 limit = trip - __kmp_dispatch_guided_remaining<T>( 2183 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1); 2184 KMP_ASSERT(init <= limit); 2185 if (init < limit) { 2186 KMP_DEBUG_ASSERT(limit <= trip); 2187 --limit; 2188 status = 1; 2189 break; 2190 } // if 2191 } // if 2192 } // while (1) 2193 #if KMP_OS_WINDOWS && KMP_ARCH_X86 2194 /* restore FPCW if necessary 2195 AC: check fpcwSet flag first because oldFpcw can be uninitialized 2196 here */ 2197 if (fpcwSet && (oldFpcw & fpcwSet)) 2198 _control87(oldFpcw, _MCW_PC); 2199 #endif 2200 if (status != 0) { 2201 start = pr->u.p.lb; 2202 incr = pr->u.p.st; 2203 if (p_st != NULL) 2204 *p_st = incr; 2205 *p_lb = start + init * incr; 2206 *p_ub = start + limit * incr; 2207 if (pr->ordered) { 2208 pr->u.p.ordered_lower = init; 2209 pr->u.p.ordered_upper = limit; 2210 #ifdef KMP_DEBUG 2211 { 2212 const char *buff; 2213 // create format specifiers before the debug output 2214 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 2215 "ordered_lower:%%%s ordered_upper:%%%s\n", 2216 traits_t<UT>::spec, traits_t<UT>::spec); 2217 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 2218 pr->u.p.ordered_upper)); 2219 __kmp_str_free(&buff); 2220 } 2221 #endif 2222 } 2223 } else { 2224 *p_lb = 0; 2225 *p_ub = 0; 2226 if (p_st != NULL) 2227 *p_st = 0; 2228 } 2229 } // case 2230 break; 2231 2232 case kmp_sch_trapezoidal: { 2233 UT index; 2234 T parm2 = pr->u.p.parm2; 2235 T parm3 = pr->u.p.parm3; 2236 T parm4 = pr->u.p.parm4; 2237 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n", 2238 gtid)); 2239 2240 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration); 2241 2242 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2; 2243 trip = pr->u.p.tc - 1; 
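// Annotation (not in the original source): as the formulas here imply, the
// trapezoidal schedule hands out chunks whose sizes shrink linearly: parm2 is
// the first chunk size, parm4 the per-chunk decrement, and parm3 bounds the
// number of chunks; init above is the running sum of the first 'index' chunk
// sizes. Illustrative numbers: with parm2=10 and parm4=2 the chunk sizes are
// 10, 8, 6, ..., so for index=2 init = (2*(20 - 1*2))/2 = 18 and the limit
// computed below is (3*(20 - 2*2))/2 - 1 = 23, i.e. this call covers
// iterations 18..23 of the zero-based space before the lb/st scaling.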
2244 2245 if ((status = ((T)index < parm3 && init <= trip)) == 0) { 2246 *p_lb = 0; 2247 *p_ub = 0; 2248 if (p_st != NULL) 2249 *p_st = 0; 2250 } else { 2251 start = pr->u.p.lb; 2252 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1; 2253 incr = pr->u.p.st; 2254 2255 if ((last = (limit >= trip)) != 0) 2256 limit = trip; 2257 2258 if (p_st != NULL) 2259 *p_st = incr; 2260 2261 if (incr == 1) { 2262 *p_lb = start + init; 2263 *p_ub = start + limit; 2264 } else { 2265 *p_lb = start + init * incr; 2266 *p_ub = start + limit * incr; 2267 } 2268 2269 if (pr->ordered) { 2270 pr->u.p.ordered_lower = init; 2271 pr->u.p.ordered_upper = limit; 2272 #ifdef KMP_DEBUG 2273 { 2274 const char *buff; 2275 // create format specifiers before the debug output 2276 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 2277 "ordered_lower:%%%s ordered_upper:%%%s\n", 2278 traits_t<UT>::spec, traits_t<UT>::spec); 2279 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 2280 pr->u.p.ordered_upper)); 2281 __kmp_str_free(&buff); 2282 } 2283 #endif 2284 } // if 2285 } // if 2286 } // case 2287 break; 2288 default: { 2289 status = 0; // to avoid complaints on uninitialized variable use 2290 __kmp_msg(kmp_ms_fatal, // Severity 2291 KMP_MSG(UnknownSchedTypeDetected), // Primary message 2292 KMP_HNT(GetNewerLibrary), // Hint 2293 __kmp_msg_null // Variadic argument list terminator 2294 ); 2295 } break; 2296 } // switch 2297 } // if tc == 0; 2298 2299 if (status == 0) { 2300 UT num_done; 2301 2302 num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done); 2303 #ifdef KMP_DEBUG 2304 { 2305 const char *buff; 2306 // create format specifiers before the debug output 2307 buff = __kmp_str_format( 2308 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", 2309 traits_t<UT>::spec); 2310 KD_TRACE(100, (buff, gtid, sh->u.s.num_done)); 2311 __kmp_str_free(&buff); 2312 } 2313 #endif 2314 2315 if ((ST)num_done == th->th.th_team_nproc - 1) { 2316 #if (KMP_STATIC_STEAL_ENABLED) 2317 if (pr->schedule == kmp_sch_static_steal && 2318 traits_t<T>::type_size > 4) { 2319 int i; 2320 kmp_info_t **other_threads = team->t.t_threads; 2321 // loop complete, safe to destroy locks used for stealing 2322 for (i = 0; i < th->th.th_team_nproc; ++i) { 2323 kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock; 2324 KMP_ASSERT(lck != NULL); 2325 __kmp_destroy_lock(lck); 2326 __kmp_free(lck); 2327 other_threads[i]->th.th_dispatch->th_steal_lock = NULL; 2328 } 2329 } 2330 #endif 2331 /* NOTE: release this buffer to be reused */ 2332 2333 KMP_MB(); /* Flush all pending memory write invalidates. */ 2334 2335 sh->u.s.num_done = 0; 2336 sh->u.s.iteration = 0; 2337 2338 /* TODO replace with general release procedure? */ 2339 if (pr->ordered) { 2340 sh->u.s.ordered_iteration = 0; 2341 } 2342 2343 KMP_MB(); /* Flush all pending memory write invalidates. */ 2344 2345 sh->buffer_index += __kmp_dispatch_num_buffers; 2346 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", 2347 gtid, sh->buffer_index)); 2348 2349 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 2350 2351 } // if 2352 if (__kmp_env_consistency_check) { 2353 if (pr->pushed_ws != ct_none) { 2354 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 2355 } 2356 } 2357 2358 th->th.th_dispatch->th_deo_fcn = NULL; 2359 th->th.th_dispatch->th_dxo_fcn = NULL; 2360 th->th.th_dispatch->th_dispatch_sh_current = NULL; 2361 th->th.th_dispatch->th_dispatch_pr_current = NULL; 2362 } // if (status == 0) 2363 #if KMP_OS_WINDOWS 2364 else if (last) { 2365 pr->u.p.last_upper = pr->u.p.ub; 2366 } 2367 #endif /* KMP_OS_WINDOWS */ 2368 if (p_last != NULL && status != 0) 2369 *p_last = last; 2370 } // if 2371 2372 #ifdef KMP_DEBUG 2373 { 2374 const char *buff; 2375 // create format specifiers before the debug output 2376 buff = __kmp_str_format( 2377 "__kmp_dispatch_next: T#%%d normal case: " 2378 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n", 2379 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 2380 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status)); 2381 __kmp_str_free(&buff); 2382 } 2383 #endif 2384 #if INCLUDE_SSC_MARKS 2385 SSC_MARK_DISPATCH_NEXT(); 2386 #endif 2387 OMPT_LOOP_END; 2388 return status; 2389 } 2390 2391 template <typename T> 2392 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid, 2393 kmp_int32 *plastiter, T *plower, T *pupper, 2394 typename traits_t<T>::signed_t incr) { 2395 typedef typename traits_t<T>::unsigned_t UT; 2396 typedef typename traits_t<T>::signed_t ST; 2397 kmp_uint32 team_id; 2398 kmp_uint32 nteams; 2399 UT trip_count; 2400 kmp_team_t *team; 2401 kmp_info_t *th; 2402 2403 KMP_DEBUG_ASSERT(plastiter && plower && pupper); 2404 KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid)); 2405 #ifdef KMP_DEBUG 2406 { 2407 const char *buff; 2408 // create format specifiers before the debug output 2409 buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d " 2410 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n", 2411 traits_t<T>::spec, traits_t<T>::spec, 2412 traits_t<ST>::spec, traits_t<T>::spec); 2413 KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr)); 2414 __kmp_str_free(&buff); 2415 } 2416 #endif 2417 2418 if (__kmp_env_consistency_check) { 2419 if (incr == 0) { 2420 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, 2421 loc); 2422 } 2423 if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) { 2424 // The loop is illegal. 
2425 // Some zero-trip loops maintained by compiler, e.g.: 2426 // for(i=10;i<0;++i) // lower >= upper - run-time check 2427 // for(i=0;i>10;--i) // lower <= upper - run-time check 2428 // for(i=0;i>10;++i) // incr > 0 - compile-time check 2429 // for(i=10;i<0;--i) // incr < 0 - compile-time check 2430 // Compiler does not check the following illegal loops: 2431 // for(i=0;i<10;i+=incr) // where incr<0 2432 // for(i=10;i>0;i-=incr) // where incr<0 2433 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc); 2434 } 2435 } 2436 th = __kmp_threads[gtid]; 2437 team = th->th.th_team; 2438 #if OMP_40_ENABLED 2439 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct 2440 nteams = th->th.th_teams_size.nteams; 2441 #endif 2442 team_id = team->t.t_master_tid; 2443 KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc); 2444 2445 // compute global trip count 2446 if (incr == 1) { 2447 trip_count = *pupper - *plower + 1; 2448 } else if (incr == -1) { 2449 trip_count = *plower - *pupper + 1; 2450 } else if (incr > 0) { 2451 // upper-lower can exceed the limit of signed type 2452 trip_count = (UT)(*pupper - *plower) / incr + 1; 2453 } else { 2454 trip_count = (UT)(*plower - *pupper) / (-incr) + 1; 2455 } 2456 2457 if (trip_count <= nteams) { 2458 KMP_DEBUG_ASSERT( 2459 __kmp_static == kmp_sch_static_greedy || 2460 __kmp_static == 2461 kmp_sch_static_balanced); // Unknown static scheduling type. 2462 // only some teams get single iteration, others get nothing 2463 if (team_id < trip_count) { 2464 *pupper = *plower = *plower + team_id * incr; 2465 } else { 2466 *plower = *pupper + incr; // zero-trip loop 2467 } 2468 if (plastiter != NULL) 2469 *plastiter = (team_id == trip_count - 1); 2470 } else { 2471 if (__kmp_static == kmp_sch_static_balanced) { 2472 UT chunk = trip_count / nteams; 2473 UT extras = trip_count % nteams; 2474 *plower += 2475 incr * (team_id * chunk + (team_id < extras ? team_id : extras)); 2476 *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr); 2477 if (plastiter != NULL) 2478 *plastiter = (team_id == nteams - 1); 2479 } else { 2480 T chunk_inc_count = 2481 (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr; 2482 T upper = *pupper; 2483 KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy); 2484 // Unknown static scheduling type. 2485 *plower += team_id * chunk_inc_count; 2486 *pupper = *plower + chunk_inc_count - incr; 2487 // Check/correct bounds if needed 2488 if (incr > 0) { 2489 if (*pupper < *plower) 2490 *pupper = traits_t<T>::max_value; 2491 if (plastiter != NULL) 2492 *plastiter = *plower <= upper && *pupper > upper - incr; 2493 if (*pupper > upper) 2494 *pupper = upper; // tracker C73258 2495 } else { 2496 if (*pupper > *plower) 2497 *pupper = traits_t<T>::min_value; 2498 if (plastiter != NULL) 2499 *plastiter = *plower >= upper && *pupper < upper - incr; 2500 if (*pupper < upper) 2501 *pupper = upper; // tracker C73258 2502 } 2503 } 2504 } 2505 } 2506 2507 //----------------------------------------------------------------------------- 2508 // Dispatch routines 2509 // Transfer call to template< type T > 2510 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule, 2511 // T lb, T ub, ST st, ST chunk ) 2512 extern "C" { 2513 2514 /*! 
2515 @ingroup WORK_SHARING
2516 @{
2517 @param loc Source location
2518 @param gtid Global thread id
2519 @param schedule Schedule type
2520 @param lb Lower bound
2521 @param ub Upper bound
2522 @param st Step (or increment if you prefer)
2523 @param chunk The chunk size to block with
2524
2525 This function prepares the runtime to start a dynamically scheduled for loop,
2526 saving the loop arguments.
2527 These functions are all identical apart from the types of the arguments.
2528 */
2529
2530 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2531 enum sched_type schedule, kmp_int32 lb,
2532 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2533 KMP_DEBUG_ASSERT(__kmp_init_serial);
2534 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2535 }
2536 /*!
2537 See @ref __kmpc_dispatch_init_4
2538 */
2539 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2540 enum sched_type schedule, kmp_uint32 lb,
2541 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2542 KMP_DEBUG_ASSERT(__kmp_init_serial);
2543 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2544 }
2545
2546 /*!
2547 See @ref __kmpc_dispatch_init_4
2548 */
2549 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2550 enum sched_type schedule, kmp_int64 lb,
2551 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2552 KMP_DEBUG_ASSERT(__kmp_init_serial);
2553 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2554 }
2555
2556 /*!
2557 See @ref __kmpc_dispatch_init_4
2558 */
2559 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2560 enum sched_type schedule, kmp_uint64 lb,
2561 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2562 KMP_DEBUG_ASSERT(__kmp_init_serial);
2563 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2564 }
2565
2566 /*!
2567 See @ref __kmpc_dispatch_init_4
2568
2569 These functions differ from the __kmpc_dispatch_init set in that they are
2570 called for the composite distribute parallel for construct. Thus, before
2571 dispatching the regular iterations we need to compute the per-team iteration space.
2572
2573 These functions are all identical apart from the types of the arguments.
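For example (illustrative numbers, not from the original text): with a loop of
100 iterations, unit stride and 4 teams, and assuming the balanced static split
is selected, __kmp_dist_get_bounds gives team 0 iterations 0-24, team 1
iterations 25-49, and so on; each team then dispatches only its own
25-iteration slice with the requested schedule and chunk.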
2574 */ 2575 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2576 enum sched_type schedule, kmp_int32 *p_last, 2577 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, 2578 kmp_int32 chunk) { 2579 KMP_DEBUG_ASSERT(__kmp_init_serial); 2580 __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st); 2581 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2582 } 2583 2584 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2585 enum sched_type schedule, kmp_int32 *p_last, 2586 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, 2587 kmp_int32 chunk) { 2588 KMP_DEBUG_ASSERT(__kmp_init_serial); 2589 __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st); 2590 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2591 } 2592 2593 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2594 enum sched_type schedule, kmp_int32 *p_last, 2595 kmp_int64 lb, kmp_int64 ub, kmp_int64 st, 2596 kmp_int64 chunk) { 2597 KMP_DEBUG_ASSERT(__kmp_init_serial); 2598 __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st); 2599 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2600 } 2601 2602 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2603 enum sched_type schedule, kmp_int32 *p_last, 2604 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, 2605 kmp_int64 chunk) { 2606 KMP_DEBUG_ASSERT(__kmp_init_serial); 2607 __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st); 2608 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2609 } 2610 2611 /*! 2612 @param loc Source code location 2613 @param gtid Global thread id 2614 @param p_last Pointer to a flag set to one if this is the last chunk or zero 2615 otherwise 2616 @param p_lb Pointer to the lower bound for the next chunk of work 2617 @param p_ub Pointer to the upper bound for the next chunk of work 2618 @param p_st Pointer to the stride for the next chunk of work 2619 @return one if there is work to be done, zero otherwise 2620 2621 Get the next dynamically allocated chunk of work for this thread. 2622 If there is no more work, then the lb,ub and stride need not be modified. 2623 */ 2624 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2625 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) { 2626 return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st); 2627 } 2628 2629 /*! 2630 See @ref __kmpc_dispatch_next_4 2631 */ 2632 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2633 kmp_uint32 *p_lb, kmp_uint32 *p_ub, 2634 kmp_int32 *p_st) { 2635 return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st); 2636 } 2637 2638 /*! 2639 See @ref __kmpc_dispatch_next_4 2640 */ 2641 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2642 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) { 2643 return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st); 2644 } 2645 2646 /*! 2647 See @ref __kmpc_dispatch_next_4 2648 */ 2649 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2650 kmp_uint64 *p_lb, kmp_uint64 *p_ub, 2651 kmp_int64 *p_st) { 2652 return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st); 2653 } 2654 2655 /*! 2656 @param loc Source code location 2657 @param gtid Global thread id 2658 2659 Mark the end of a dynamic loop. 
2660 */ 2661 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) { 2662 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2663 } 2664 2665 /*! 2666 See @ref __kmpc_dispatch_fini_4 2667 */ 2668 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) { 2669 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2670 } 2671 2672 /*! 2673 See @ref __kmpc_dispatch_fini_4 2674 */ 2675 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) { 2676 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2677 } 2678 2679 /*! 2680 See @ref __kmpc_dispatch_fini_4 2681 */ 2682 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) { 2683 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2684 } 2685 /*! @} */ 2686 2687 //----------------------------------------------------------------------------- 2688 // Non-template routines from kmp_dispatch.cpp used in other sources 2689 2690 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) { 2691 return value == checker; 2692 } 2693 2694 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) { 2695 return value != checker; 2696 } 2697 2698 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) { 2699 return value < checker; 2700 } 2701 2702 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) { 2703 return value >= checker; 2704 } 2705 2706 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) { 2707 return value <= checker; 2708 } 2709 2710 kmp_uint32 2711 __kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker, 2712 kmp_uint32 (*pred)(kmp_uint32, kmp_uint32), 2713 void *obj // Higher-level synchronization object, or NULL. 2714 ) { 2715 // note: we may not belong to a team at this point 2716 volatile kmp_uint32 *spin = spinner; 2717 kmp_uint32 check = checker; 2718 kmp_uint32 spins; 2719 kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred; 2720 kmp_uint32 r; 2721 2722 KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin)); 2723 KMP_INIT_YIELD(spins); 2724 // main wait spin loop 2725 while (!f(r = TCR_4(*spin), check)) { 2726 KMP_FSYNC_SPIN_PREPARE(obj); 2727 /* GEH - remove this since it was accidentally introduced when kmp_wait was 2728 split. It causes problems with infinite recursion because of exit lock */ 2729 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 2730 __kmp_abort_thread(); */ 2731 2732 /* if we have waited a bit, or are oversubscribed, yield */ 2733 /* pause is in the following code */ 2734 KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc); 2735 KMP_YIELD_SPIN(spins); 2736 } 2737 KMP_FSYNC_SPIN_ACQUIRED(obj); 2738 return r; 2739 } 2740 2741 void __kmp_wait_yield_4_ptr( 2742 void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32), 2743 void *obj // Higher-level synchronization object, or NULL. 
2744 ) { 2745 // note: we may not belong to a team at this point 2746 void *spin = spinner; 2747 kmp_uint32 check = checker; 2748 kmp_uint32 spins; 2749 kmp_uint32 (*f)(void *, kmp_uint32) = pred; 2750 2751 KMP_FSYNC_SPIN_INIT(obj, spin); 2752 KMP_INIT_YIELD(spins); 2753 // main wait spin loop 2754 while (!f(spin, check)) { 2755 KMP_FSYNC_SPIN_PREPARE(obj); 2756 /* if we have waited a bit, or are oversubscribed, yield */ 2757 /* pause is in the following code */ 2758 KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc); 2759 KMP_YIELD_SPIN(spins); 2760 } 2761 KMP_FSYNC_SPIN_ACQUIRED(obj); 2762 } 2763 2764 } // extern "C" 2765 2766 #ifdef KMP_GOMP_COMPAT 2767 2768 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2769 enum sched_type schedule, kmp_int32 lb, 2770 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk, 2771 int push_ws) { 2772 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, 2773 push_ws); 2774 } 2775 2776 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2777 enum sched_type schedule, kmp_uint32 lb, 2778 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk, 2779 int push_ws) { 2780 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, 2781 push_ws); 2782 } 2783 2784 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2785 enum sched_type schedule, kmp_int64 lb, 2786 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk, 2787 int push_ws) { 2788 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, 2789 push_ws); 2790 } 2791 2792 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2793 enum sched_type schedule, kmp_uint64 lb, 2794 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk, 2795 int push_ws) { 2796 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, 2797 push_ws); 2798 } 2799 2800 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) { 2801 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc); 2802 } 2803 2804 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) { 2805 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc); 2806 } 2807 2808 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) { 2809 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc); 2810 } 2811 2812 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) { 2813 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc); 2814 } 2815 2816 #endif /* KMP_GOMP_COMPAT */ 2817 2818 /* ------------------------------------------------------------------------ */ 2819
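/* Annotation (not part of the original file): an illustrative sketch of how a
   compiler-generated caller might drive the dispatch entry points above for a
   loop such as "for (int i = 0; i < n; ++i)" with schedule(dynamic, chunk).
   The function and variable names below are made up for illustration; the
   exact code emitted by a compiler differs.

   void example_caller(ident_t *loc, kmp_int32 gtid, kmp_int32 n,
                       kmp_int32 chunk) {
     kmp_int32 lb, ub, st, last;
     // Register the loop bounds and the requested schedule with the runtime.
     __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked,
                            0, n - 1, 1, chunk);
     // Keep asking for chunks until __kmpc_dispatch_next_4 returns 0;
     // each successful call fills in an inclusive [lb, ub] range and stride.
     while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
       for (kmp_int32 i = lb; i <= ub; i += st) {
         // ... loop body for iteration i ...
       }
     }
   }
*/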