/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 */

//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//

/* Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
 *       it may change values between parallel regions.  __kmp_max_nth
 *       is the largest value __kmp_nth may take, 1 is the smallest.
 */

// Need to raise Win version from XP to Vista here for support of
// InterlockedExchange64
#if defined(_WIN32_WINNT) && defined(_M_IX86)
#undef _WIN32_WINNT
#define _WIN32_WINNT 0x0502
#endif

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
#include <float.h>
#endif

#if OMPT_SUPPORT
#include "ompt-internal.h"
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */

#if KMP_STATIC_STEAL_ENABLED

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  UT count; // unsigned
  T ub;
  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
  T lb;
  ST st; // signed
  UT tc; // unsigned
  T static_steal_counter; // for static_steal only; maybe better to put after ub

  /* parm[1-4] are used in different ways by different scheduling algorithms */

  // KMP_ALIGN(32) ensures (if the KMP_ALIGN macro is turned on)
  //    a) parm3 is properly aligned and
  //    b) all parm1-4 are on the same cache line.
  // Because parm1-4 are used together, performance seems to be better
  // if they are on the same cache line (not measured though).
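  //
  // A summary of how the code below uses parm1-4 (added commentary): parm1
  // typically holds the chunk size; static_steal keeps the original lower
  // bound in parm2 and the next victim id in parm4; the guided schedules keep
  // a switch-over point in parm2 and a floating-point multiplier overlaid on
  // parm3 (which may spill into parm4); trapezoidal uses all four.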

  struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
    T parm1;
    T parm2;
    T parm3;
    T parm4;
  };

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};

#else /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  T lb;
  T ub;
  ST st; // signed
  UT tc; // unsigned

  T parm1;
  T parm2;
  T parm3;
  T parm4;

  UT count; // unsigned

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};

#endif /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info structure and dispatch_private_info_t type
template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
  // duplicate alignment here, otherwise size of structure is not correct in
  // our compiler
  union KMP_ALIGN_CACHE private_info_tmpl {
    dispatch_private_infoXX_template<T> p;
    dispatch_private_info64_t p64;
  } u;
  enum sched_type schedule; /* scheduling algorithm */
  kmp_uint32 ordered; /* ordered clause specified */
  kmp_uint32 ordered_bumped;
  // To retain the structure size after making ordered_iteration scalar
  kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
  dispatch_private_info *next; /* stack of buffers for nest of serial regions */
  kmp_uint32 nomerge; /* don't merge iters if serialized */
  kmp_uint32 type_size;
  enum cons_type pushed_ws;
};

// replaces dispatch_shared_info{32,64} structures and
// dispatch_shared_info{32,64}_t types
template <typename UT> struct dispatch_shared_infoXX_template {
  /* chunk index under dynamic, number of idle threads under static-steal;
     iteration index otherwise */
  volatile UT iteration;
  volatile UT num_done;
  volatile UT ordered_iteration;
  // to retain the structure size making ordered_iteration scalar
  UT ordered_dummy[KMP_MAX_ORDERED - 3];
};

// replaces dispatch_shared_info structure and dispatch_shared_info_t type
template <typename UT> struct dispatch_shared_info_template {
  // we need union here to keep the structure size
  union shared_info_tmpl {
    dispatch_shared_infoXX_template<UT> s;
    dispatch_shared_info64_t s64;
  } u;
  volatile kmp_uint32 buffer_index;
#if OMP_45_ENABLED
  volatile kmp_int32 doacross_buf_idx; // teamwise index
  kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
  kmp_int32 doacross_num_done; // count finished threads
#endif
#if KMP_USE_HWLOC
  // When linking with libhwloc, the ORDERED EPCC test slows down on big
  // machines (> 48 cores). Performance analysis showed that a cache thrash
  // was occurring and this padding helps alleviate the problem.
  char padding[64];
#endif
};

/* ------------------------------------------------------------------------ */

#undef USE_TEST_LOCKS

// test_then_add template (general template should NOT be used)
template <typename T> static __forceinline T test_then_add(volatile T *p, T d);

template <>
__forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
                                                 kmp_int32 d) {
  kmp_int32 r;
  r = KMP_TEST_THEN_ADD32(p, d);
  return r;
}

template <>
__forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
                                                 kmp_int64 d) {
  kmp_int64 r;
  r = KMP_TEST_THEN_ADD64(p, d);
  return r;
}

// test_then_inc_acq template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC_ACQ32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC_ACQ64(p);
  return r;
}

// test_then_inc template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC64(p);
  return r;
}

// compare_and_swap template (general template should NOT be used)
template <typename T>
static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
                                                    kmp_int32 c, kmp_int32 s) {
  return KMP_COMPARE_AND_STORE_REL32(p, c, s);
}

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
                                                    kmp_int64 c, kmp_int64 s) {
  return KMP_COMPARE_AND_STORE_REL64(p, c, s);
}

/* Spin wait loop that first does pause, then yield.
   Waits until function returns non-zero when called with *spinner and check.
   Does NOT put threads to sleep.
#if USE_ITT_BUILD
   Arguments:
   obj -- is higher-level synchronization object to report to ittnotify.
          It is used to report locks consistently. For example, if lock is
          acquired immediately, its address is reported to ittnotify via
          KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
          immediately and the lock routine calls KMP_WAIT_YIELD(), the latter
          should report the same address, not the address of the low-level
          spinner.
#endif // USE_ITT_BUILD
*/
template <typename UT>
// ToDo: make inline function (move to header file for icl)
static UT // unsigned 4- or 8-byte type
    __kmp_wait_yield(
        volatile UT *spinner, UT checker,
        kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(
            void *obj) // Higher-level synchronization object, or NULL.
        ) {
  // note: we may not belong to a team at this point
  volatile UT *spin = spinner;
  UT check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(UT, UT) = pred;
  UT r;

  KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(r = *spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* GEH - remove this since it was accidentally introduced when kmp_wait was
       split. It causes problems with infinite recursion because of exit lock */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
        __kmp_abort_thread(); */

    // if we are oversubscribed, or have waited a bit (and
    // KMP_LIBRARY=throughput), then yield. pause is in the following code
    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
    KMP_YIELD_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}

template <typename UT> static kmp_uint32 __kmp_eq(UT value, UT checker) {
  return value == checker;
}

template <typename UT> static kmp_uint32 __kmp_neq(UT value, UT checker) {
  return value != checker;
}

template <typename UT> static kmp_uint32 __kmp_lt(UT value, UT checker) {
  return value < checker;
}

template <typename UT> static kmp_uint32 __kmp_ge(UT value, UT checker) {
  return value >= checker;
}

template <typename UT> static kmp_uint32 __kmp_le(UT value, UT checker) {
  return value <= checker;
}

/* ------------------------------------------------------------------------ */

static void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref,
                                     ident_t *loc_ref) {
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(gtid_ref);

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_root->r.r_active &&
        (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }
}

template <typename UT>
static void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  typedef typename traits_t<UT>::signed_t ST;
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    UT lower;

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }
    lower = pr->u.p.ordered_lower;

#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB();
#ifdef KMP_DEBUG
    {
      const char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif

    __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
                         __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
    KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
    {
      const char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
  }
  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
}

static void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref,
                                     ident_t *loc_ref) {
  kmp_info_t *th;

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
      __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
    }
  }
}

template <typename UT>
static void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  typedef typename traits_t<UT>::signed_t ST;
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
      __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }

    KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped != 0) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        /* How to test it? - OM */
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB(); /* Flush all pending memory write invalidates. */

    pr->ordered_bumped += 1;

    KD_TRACE(1000,
             ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
              gtid, pr->ordered_bumped));

    KMP_MB(); /* Flush all pending memory write invalidates. */

    /* TODO use general release procedure? */
    test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);

    KMP_MB(); /* Flush all pending memory write invalidates. */
  }
  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
}

// Computes and returns x to the power of y, where y must be a non-negative
// integer
template <typename UT>
static __forceinline long double __kmp_pow(long double x, UT y) {
  long double s = 1.0L;

  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
  // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
  while (y) {
    if (y & 1)
      s *= x;
    x *= x;
    y >>= 1;
  }
  return s;
}

/* Computes and returns the number of unassigned iterations after idx chunks
   have been assigned (the total number of unassigned iterations in chunks with
   index greater than or equal to idx). __forceinline seems to be broken, so
   if we __forceinline this function the behavior is wrong
   (one of the unit tests, sch_guided_analytical_basic.cpp, fails). */
template <typename T>
static __inline typename traits_t<T>::unsigned_t
__kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
                                typename traits_t<T>::unsigned_t idx) {
  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at least for
     ICL 8.1, long double arithmetic may not really have long double precision,
     even with /Qlong_double. Currently, we work around that in the caller
     code, by manipulating the FPCW for Windows* OS on IA-32 architecture. The
     lack of precision is not expected to be a correctness issue, though. */
  typedef typename traits_t<T>::unsigned_t UT;

  long double x = tc * __kmp_pow<UT>(base, idx);
  UT r = (UT)x;
  if (x == r)
    return r;
  return r + 1;
}

// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
//   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// by default n = 2. For example, with n = 3 the chunk distribution will be
// more flat.
// With n = 1 the first chunk is the same as for a static schedule, i.e.
// trip / nproc.
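// Illustrative example (added commentary, not from the original source): with
// the default n = 2, nproc = 4 and chunk = 7, p2 = 2 * 4 * (7 + 1) = 64 and
// p3 = 1 / (2 * 4) = 0.125, so each thread grabs roughly 1/8 of the remaining
// iterations until fewer than 64 remain, after which the schedule degenerates
// to plain dynamic with chunk 7 (see kmp_sch_guided_iterative_chunked in
// __kmp_dispatch_next below).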
static int guided_int_param = 2;
static double guided_flt_param = 0.5; // = 1.0 / guided_int_param;

// UT - unsigned flavor of T, ST - signed flavor of T,
// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
template <typename T>
static void
__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
                    T ub, typename traits_t<T>::signed_t st,
                    typename traits_t<T>::signed_t chunk, int push_ws) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::floating_t DBL;

  int active;
  T tc;
  kmp_info_t *th;
  kmp_team_t *team;
  kmp_uint32 my_buffer_index;
  dispatch_private_info_template<T> *pr;
  dispatch_shared_info_template<UT> volatile *sh;

  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
                   sizeof(dispatch_private_info));
  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
                   sizeof(dispatch_shared_info));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_INIT();
#endif
#ifdef KMP_DEBUG
  {
    const char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
                            "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
                            traits_t<ST>::spec, traits_t<T>::spec,
                            traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;
  th->th.th_ident = loc;

#if USE_ITT_BUILD
  kmp_uint64 cur_chunk = chunk;
  int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
                                    __kmp_forkjoin_frames_mode == 3 &&
                                    KMP_MASTER_GTID(gtid) &&
#if OMP_40_ENABLED
                                    th->th.th_teams_microtask == NULL &&
#endif
                                    team->t.t_active_level == 1;
#endif
  if (!active) {
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
  } else {
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    my_buffer_index = th->th.th_dispatch->th_disp_index++;

    /* What happens when the number of threads changes? Do we need to resize
       the buffer? */
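    /* Note (added commentary): the buffer index wraps modulo
       __kmp_dispatch_num_buffers below, so the per-thread and per-team
       dispatch buffers are reused round-robin; later in this routine the
       thread waits until sh->buffer_index reaches my_buffer_index, which is
       presumably what keeps a buffer from being reused while an earlier loop
       still needs it. */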
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        &th->th.th_dispatch
             ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    sh = reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  }

#if (KMP_STATIC_STEAL_ENABLED)
  if (SCHEDULE_HAS_NONMONOTONIC(schedule))
    // AC: we now have only one implementation of stealing, so use it
    schedule = kmp_sch_static_steal;
  else
#endif
    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

  /* Pick up the nomerge/ordered bits from the scheduling type */
  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
    pr->nomerge = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
  } else {
    pr->nomerge = FALSE;
  }
  pr->type_size = traits_t<T>::type_size; // remember the size of variables
  if (kmp_ord_lower & schedule) {
    pr->ordered = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
  } else {
    pr->ordered = FALSE;
  }

  if (schedule == kmp_sch_static) {
    schedule = __kmp_static;
  } else {
    if (schedule == kmp_sch_runtime) {
      // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
      // not specified)
      schedule = team->t.t_sched.r_sched_type;
      // Detail the schedule if needed (global controls are differentiated
      // appropriately)
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      } else if (schedule == kmp_sch_static) {
        schedule = __kmp_static;
      }
      // Use the chunk size specified by OMP_SCHEDULE (or default if not
      // specified)
      chunk = team->t.t_sched.chunk;
#if USE_ITT_BUILD
      cur_chunk = chunk;
#endif
#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    } else {
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      }
      if (chunk <= 0) {
        chunk = KMP_DEFAULT_CHUNK;
      }
    }

    if (schedule == kmp_sch_auto) {
      // mapping and differentiation: in the __kmp_do_serial_initialize()
      schedule = __kmp_auto;
#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_init: kmp_sch_auto: T#%%d new: "
                                "schedule:%%d chunk:%%%s\n",
                                traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }

    /* guided analytical not safe for too many threads */
    if (schedule == kmp_sch_guided_analytical_chunked &&
        th->th.th_team_nproc > 1 << 20) {
      schedule = kmp_sch_guided_iterative_chunked;
      KMP_WARNING(DispatchManyThreads);
    }
    if (schedule == kmp_sch_runtime_simd) {
      // compiler provides simd_width in the chunk parameter
      schedule = team->t.t_sched.r_sched_type;
      // Detail the schedule if needed (global controls are differentiated
      // appropriately)
      if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
          schedule == __kmp_static) {
        schedule = kmp_sch_static_balanced_chunked;
      } else {
        if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
          schedule = kmp_sch_guided_simd;
        }
        chunk = team->t.t_sched.chunk * chunk;
      }
#if USE_ITT_BUILD
      cur_chunk = chunk;
#endif
#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d"
                                " chunk:%%%s\n",
                                traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }
    pr->u.p.parm1 = chunk;
  }
  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
              "unknown scheduling type");

  pr->u.p.count = 0;

  if (__kmp_env_consistency_check) {
    if (st == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
                            (pr->ordered ? ct_pdo_ordered : ct_pdo), loc);
    }
  }
  // compute trip count
  if (st == 1) { // most common case
    if (ub >= lb) {
      tc = ub - lb + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  } else if (st < 0) {
    if (lb >= ub) {
      // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(lb - ub) / (-st) + 1;
    } else { // lb < ub
      tc = 0; // zero-trip
    }
  } else { // st > 0
    if (ub >= lb) {
      // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(ub - lb) / st + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  }

  // Any half-decent optimizer will remove this test when the blocks are empty
  // since the macros expand to nothing when statistics are disabled.
  if (schedule == __kmp_static) {
    KMP_COUNT_BLOCK(OMP_FOR_static);
    KMP_COUNT_VALUE(FOR_static_iterations, tc);
  } else {
    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
    KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
  }

  pr->u.p.lb = lb;
  pr->u.p.ub = ub;
  pr->u.p.st = st;
  pr->u.p.tc = tc;

#if KMP_OS_WINDOWS
  pr->u.p.last_upper = ub + st;
#endif /* KMP_OS_WINDOWS */

  /* NOTE: only the active parallel region(s) have active ordered sections */

  if (active) {
    if (pr->ordered == 0) {
      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
    } else {
      pr->ordered_bumped = 0;

      pr->u.p.ordered_lower = 1;
      pr->u.p.ordered_upper = 0;

      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
    }
  }

  if (__kmp_env_consistency_check) {
    enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
    if (push_ws) {
      __kmp_push_workshare(gtid, ws, loc);
      pr->pushed_ws = ws;
    } else {
      __kmp_check_workshare(gtid, ws, loc);
      pr->pushed_ws = ct_none;
    }
  }

  switch (schedule) {
#if (KMP_STATIC_STEAL_ENABLED)
  case kmp_sch_static_steal: {
    T nproc = th->th.th_team_nproc;
    T ntc, init;

    KD_TRACE(100,
             ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid));

    ntc = (tc % chunk ? 1 : 0) + tc / chunk;
    if (nproc > 1 && ntc >= nproc) {
      KMP_COUNT_BLOCK(OMP_FOR_static_steal);
      T id = __kmp_tid_from_gtid(gtid);
      T small_chunk, extras;

      small_chunk = ntc / nproc;
      extras = ntc % nproc;

      init = id * small_chunk + (id < extras ? id : extras);
      pr->u.p.count = init;
      pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
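      // Illustrative example (added commentary, not from the original
      // source): with tc = 100, chunk = 10 and nproc = 4, ntc = 10 chunks are
      // split as small_chunk = 2 with extras = 2, so threads 0..3 initially
      // own chunk-index ranges [0,3), [3,6), [6,8) and [8,10); count is the
      // next chunk to run and ub is the (exclusive) end of the owned range.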

      pr->u.p.parm2 = lb;
      // pr->pfields.parm3 = 0; // it's not used in static_steal
      pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
      pr->u.p.st = st;
      if (traits_t<T>::type_size > 4) {
        // AC: TODO: check if 16-byte CAS available and use it to
        // improve performance (probably wait for explicit request
        // before spending time on this).
        // For now use dynamically allocated per-thread lock,
        // free memory in __kmp_dispatch_next when status==0.
        KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
        th->th.th_dispatch->th_steal_lock =
            (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
        __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
      }
      break;
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
                     "kmp_sch_static_balanced\n",
                     gtid));
      schedule = kmp_sch_static_balanced;
      /* too few iterations: fall-through to kmp_sch_static_balanced */
    } // if
    /* FALL-THROUGH to static balanced */
  } // case
#endif
  case kmp_sch_static_balanced: {
    T nproc = th->th.th_team_nproc;
    T init, limit;

    KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
                   gtid));

    if (nproc > 1) {
      T id = __kmp_tid_from_gtid(gtid);

      if (tc < nproc) {
        if (id < tc) {
          init = id;
          limit = id;
          pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
        } else {
          pr->u.p.count = 1; /* means no more chunks to execute */
          pr->u.p.parm1 = FALSE;
          break;
        }
      } else {
        T small_chunk = tc / nproc;
        T extras = tc % nproc;
        init = id * small_chunk + (id < extras ? id : extras);
        limit = init + small_chunk - (id < extras ? 0 : 1);
        pr->u.p.parm1 = (id == nproc - 1);
      }
    } else {
      if (tc > 0) {
        init = 0;
        limit = tc - 1;
        pr->u.p.parm1 = TRUE;
      } else { // zero trip count
        pr->u.p.count = 1; /* means no more chunks to execute */
        pr->u.p.parm1 = FALSE;
        break;
      }
    }
#if USE_ITT_BUILD
    // Calculate chunk for metadata report
    if (itt_need_metadata_reporting)
      cur_chunk = limit - init + 1;
#endif
    if (st == 1) {
      pr->u.p.lb = lb + init;
      pr->u.p.ub = lb + limit;
    } else {
      // calculated upper bound, "ub" is user-defined upper bound
      T ub_tmp = lb + limit * st;
      pr->u.p.lb = lb + init * st;
      // adjust upper bound to "ub" if needed, so that MS lastprivate will
      // match it exactly
      if (st > 0) {
        pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
      } else {
        pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
      }
    }
    if (pr->ordered) {
      pr->u.p.ordered_lower = init;
      pr->u.p.ordered_upper = limit;
    }
    break;
  } // case
  case kmp_sch_static_balanced_chunked: {
    // similar to balanced, but chunk adjusted to multiple of simd width
    T nth = th->th.th_team_nproc;
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d runtime(simd:static)"
                   " -> falling-through to static_greedy\n",
                   gtid));
    schedule = kmp_sch_static_greedy;
    if (nth > 1)
      pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
    else
      pr->u.p.parm1 = tc;
    break;
  } // case
  case kmp_sch_guided_iterative_chunked:
  case kmp_sch_guided_simd: {
    T nproc = th->th.th_team_nproc;
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked"
                   " case\n",
                   gtid));

    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
      } else {
        // when remaining iters become less than parm2 - switch to dynamic
        pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
        *(double *)&pr->u.p.parm3 =
            guided_flt_param / nproc; // may occupy parm3 and parm4
      }
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",
                     gtid));
      pr->u.p.parm1 = tc;
    } // if
  } // case
  break;
  case kmp_sch_guided_analytical_chunked: {
    T nproc = th->th.th_team_nproc;
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked"
                   " case\n",
                   gtid));
    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
      } else {
        /* commonly used term: (2 nproc - 1)/(2 nproc) */
        DBL x;

#if KMP_OS_WINDOWS && KMP_ARCH_X86
        /* Linux* OS already has 64-bit computation by default for long double,
           and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
           Windows* OS on IA-32 architecture, we need to set precision to
           64-bit instead of the default 53-bit. Even though long double
           doesn't work on Windows* OS on Intel(R) 64, the resulting lack of
           precision is not expected to impact the correctness of the
           algorithm, but this has not been mathematically proven. */
        // save original FPCW and set precision to 64-bit, as
        // Windows* OS on IA-32 architecture defaults to 53-bit
        unsigned int oldFpcw = _control87(0, 0);
        _control87(_PC_64, _MCW_PC); // 0,0x30000
#endif
        /* value used for comparison in solver for cross-over point */
        long double target = ((long double)chunk * 2 + 1) * nproc / tc;

        /* crossover point--chunk indexes equal to or greater than
           this point switch to dynamic-style scheduling */
        UT cross;

        /* commonly used term: (2 nproc - 1)/(2 nproc) */
        x = (long double)1.0 - (long double)0.5 / nproc;

#ifdef KMP_DEBUG
        { // test natural alignment
          struct _test_a {
            char a;
            union {
              char b;
              DBL d;
            };
          } t;
          ptrdiff_t natural_alignment =
              (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
          //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
          // long)natural_alignment );
          KMP_DEBUG_ASSERT(
              (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
        }
#endif // KMP_DEBUG

        /* save the term in thread private dispatch structure */
        *(DBL *)&pr->u.p.parm3 = x;

        /* solve for the crossover point to the nearest integer i for which C_i
           <= chunk */
        {
          UT left, right, mid;
          long double p;

          /* estimate initial upper and lower bound */

          /* doesn't matter what value right is as long as it is positive, but
             it affects performance of the solver */
          right = 229;
          p = __kmp_pow<UT>(x, right);
          if (p > target) {
            do {
              p *= p;
              right <<= 1;
            } while (p > target && right < (1 << 27));
            /* lower bound is previous (failed) estimate of upper bound */
            left = right >> 1;
          } else {
            left = 0;
          }

          /* bisection root-finding method */
          while (left + 1 < right) {
            mid = (left + right) / 2;
            if (__kmp_pow<UT>(x, mid) > target) {
              left = mid;
            } else {
              right = mid;
            }
          } // while
          cross = right;
        }
        /* assert sanity of computed crossover point */
        KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
                   __kmp_pow<UT>(x, cross) <= target);

        /* save the crossover point in thread private dispatch structure */
        pr->u.p.parm2 = cross;

        // C75803
#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
#else
#define GUIDED_ANALYTICAL_WORKAROUND (x)
#endif
        /* dynamic-style scheduling offset */
        pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
                                 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
                        cross * chunk;
#if KMP_OS_WINDOWS && KMP_ARCH_X86
        // restore FPCW
        _control87(oldFpcw, _MCW_PC);
#endif
      } // if
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
      pr->u.p.parm1 = tc;
    } // if
  } // case
  break;
  case kmp_sch_static_greedy:
    KD_TRACE(100,
             ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n", gtid));
    pr->u.p.parm1 = (th->th.th_team_nproc > 1)
                        ? (tc + th->th.th_team_nproc - 1) / th->th.th_team_nproc
                        : tc;
    break;
  case kmp_sch_static_chunked:
  case kmp_sch_dynamic_chunked:
    if (pr->u.p.parm1 <= 0) {
      pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
    }
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d "
                   "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
                   gtid));
    break;
  case kmp_sch_trapezoidal: {
    /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */

    T parm1, parm2, parm3, parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid));

    parm1 = chunk;

    /* F : size of the first cycle */
    parm2 = (tc / (2 * th->th.th_team_nproc));

    if (parm2 < 1) {
      parm2 = 1;
    }

    /* L : size of the last cycle.  Make sure the last cycle is not larger
       than the first cycle. */
    if (parm1 < 1) {
      parm1 = 1;
    } else if (parm1 > parm2) {
      parm1 = parm2;
    }

    /* N : number of cycles */
    parm3 = (parm2 + parm1);
    parm3 = (2 * tc + parm3 - 1) / parm3;

    if (parm3 < 2) {
      parm3 = 2;
    }

    /* sigma : decreasing incr of the trapezoid */
    parm4 = (parm3 - 1);
    parm4 = (parm2 - parm1) / parm4;

    // pointless check, because parm4 >= 0 always
    // if ( parm4 < 0 ) {
    //   parm4 = 0;
    // }

    pr->u.p.parm1 = parm1;
    pr->u.p.parm2 = parm2;
    pr->u.p.parm3 = parm3;
    pr->u.p.parm4 = parm4;
  } // case
  break;

  default: {
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                KMP_HNT(GetNewerLibrary), // Hint
                __kmp_msg_null // Variadic argument list terminator
                );
  } break;
  } // switch
  pr->schedule = schedule;
  if (active) {
    /* The name of this buffer should be my_buffer_index when it's free to use
     * it */

    KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));
    __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index,
                                 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and
    // my_buffer_index are *always* 32-bit integers.
    KMP_MB(); /* is this necessary? */
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));

    th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
    th->th.th_dispatch->th_dispatch_sh_current =
        CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
#if USE_ITT_BUILD
    if (pr->ordered) {
      __kmp_itt_ordered_init(gtid);
    }
    // Report loop metadata
    if (itt_need_metadata_reporting) {
      // Only report metadata by master of active team at level 1
      kmp_uint64 schedtype = 0;
      switch (schedule) {
      case kmp_sch_static_chunked:
      case kmp_sch_static_balanced: // Chunk is calculated in the switch above
        break;
      case kmp_sch_static_greedy:
        cur_chunk = pr->u.p.parm1;
        break;
      case kmp_sch_dynamic_chunked:
        schedtype = 1;
        break;
      case kmp_sch_guided_iterative_chunked:
      case kmp_sch_guided_analytical_chunked:
      case kmp_sch_guided_simd:
        schedtype = 2;
        break;
      default:
        // Should we put this case under "static"?
        // case kmp_sch_static_steal:
        schedtype = 3;
        break;
      }
      __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
    }
#endif /* USE_ITT_BUILD */
  }

#ifdef KMP_DEBUG
  {
    const char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
        "lb:%%%s ub:%%%s"
        " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
        " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
        traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
        traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
        traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
        traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
                  pr->u.p.st, pr->u.p.tc, pr->u.p.count, pr->u.p.ordered_lower,
                  pr->u.p.ordered_upper, pr->u.p.parm1, pr->u.p.parm2,
                  pr->u.p.parm3, pr->u.p.parm4));
    __kmp_str_free(&buff);
  }
#endif
#if (KMP_STATIC_STEAL_ENABLED)
  // It cannot be guaranteed that after execution of a loop with some other
  // schedule kind all the parm3 variables will contain the same value. Even if
  // all parm3 were the same, there would still be a bad case, like using 0 and
  // 1 rather than a program-lifetime increment. So a dedicated variable is
  // required; the 'static_steal_counter' is used.
  if (schedule == kmp_sch_static_steal) {
    // Other threads will inspect this variable when searching for a victim.
    // This is a flag showing that other threads may steal from this thread
    // since then.
    volatile T *p = &pr->u.p.static_steal_counter;
    *p = *p + 1;
  }
#endif // ( KMP_STATIC_STEAL_ENABLED )

#if OMPT_SUPPORT && OMPT_TRACE
  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
    ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
        team_info->parallel_id, task_info->task_id, team_info->microtask);
  }
#endif
}

/* For ordered loops, either __kmp_dispatch_finish() should be called after
 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
 * every chunk of iterations.  If the ordered section(s) were not executed
 * for this iteration (or every iteration in this chunk), we need to set the
 * ordered iteration counters so that the next thread can proceed. */
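/* Illustrative example (added commentary, not from the original source): if a
 * chunk covered ordered iterations 8..11 but the thread never entered the
 * ordered construct for them, __kmp_dispatch_finish_chunk() below waits until
 * the shared ordered_iteration reaches 8 and then adds the outstanding count
 * (4 minus whatever was already bumped), so threads waiting on later ordered
 * iterations are not blocked. */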
template <typename UT>
static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {

    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    if (pr->ordered_bumped) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      UT lower = pr->u.p.ordered_lower;

#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
                           __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
      KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
    } // if
  } // if
  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
}

#ifdef KMP_GOMP_COMPAT

template <typename UT>
static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {
    // int cid;
    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
    UT lower = pr->u.p.ordered_lower;
    UT upper = pr->u.p.ordered_upper;
    UT inc = upper - lower + 1;

    if (pr->ordered_bumped == inc) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      inc -= pr->ordered_bumped;

#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d before wait: "
            "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
                           __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));

      KMP_MB(); /* is this necessary? */
      KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
                      "ordered_bumped to zero\n",
                      gtid));
      pr->ordered_bumped = 0;
      //!!!!! TODO check if the inc should be unsigned, or signed???
#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d after wait: "
            "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
            traits_t<UT>::spec);
        KD_TRACE(1000,
                 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
    }
    // }
  }
  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
}

#endif /* KMP_GOMP_COMPAT */

/* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
   work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
   is not called. */
#if OMPT_SUPPORT && OMPT_TRACE
#define OMPT_LOOP_END                                                          \
  if (status == 0) {                                                           \
    if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_end)) {   \
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
      ompt_task_info_t *task_info = __ompt_get_taskinfo(0);                    \
      ompt_callbacks.ompt_callback(ompt_event_loop_end)(                       \
          team_info->parallel_id, task_info->task_id);                         \
    }                                                                          \
  }
#else
#define OMPT_LOOP_END // no-op
#endif

template <typename T>
static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
                               T *p_lb, T *p_ub,
                               typename traits_t<T>::signed_t *p_st) {

  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::floating_t DBL;

  // This is potentially slightly misleading, schedule(runtime) will appear
  // here even if the actual runtime schedule is static. (Which points out a
  // disadvantage of schedule(runtime): even when static scheduling is used it
  // costs more than a compile-time choice to use static scheduling would.)
  KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling);

  int status;
  dispatch_private_info_template<T> *pr;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
#ifdef KMP_DEBUG
  {
    const char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_next: T#%%d called p_lb:%%%s "
                            "p_ub:%%%s p_st:%%%s p_last: %%p\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec);
    KD_TRACE(1000, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last));
    __kmp_str_free(&buff);
  }
#endif

  if (team->t.t_serialized) {
    /* NOTE: serialize this dispatch because we are not at the active level */
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
    KMP_DEBUG_ASSERT(pr);

    if ((status = (pr->u.p.tc != 0)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      // if ( p_last != NULL )
      //   *p_last = 0;
      if (p_st != NULL)
        *p_st = 0;
      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        }
      }
    } else if (pr->nomerge) {
      kmp_int32 last;
      T start;
      UT limit, trip, init;
      ST incr;
      T chunk = pr->u.p.parm1;

      KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
                     gtid));

      init = chunk * pr->u.p.count++;
      trip = pr->u.p.tc - 1;

      if ((status = (init <= trip)) == 0) {
        *p_lb = 0;
        *p_ub = 0;
        // if ( p_last != NULL )
        //   *p_last = 0;
        if (p_st != NULL)
          *p_st = 0;
        if (__kmp_env_consistency_check) {
          if (pr->pushed_ws != ct_none) {
            pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
          }
        }
      } else {
        start = pr->u.p.lb;
        limit = chunk + init - 1;
        incr = pr->u.p.st;

        if ((last = (limit >= trip)) != 0) {
          limit = trip;
#if KMP_OS_WINDOWS
          pr->u.p.last_upper = pr->u.p.ub;
#endif /* KMP_OS_WINDOWS */
        }
        if (p_last != NULL)
          *p_last = last;
        if (p_st != NULL)
          *p_st = incr;
        if (incr == 1) {
          *p_lb = start + init;
          *p_ub = start + limit;
        } else {
          *p_lb = start + init * incr;
          *p_ub = start + limit * incr;
        }

        if (pr->ordered) {
          pr->u.p.ordered_lower = init;
          pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
          {
            const char *buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
                                    "ordered_lower:%%%s ordered_upper:%%%s\n",
                                    traits_t<UT>::spec, traits_t<UT>::spec);
            KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
                            pr->u.p.ordered_upper));
            __kmp_str_free(&buff);
          }
#endif
        } // if
      } // if
    } else {
      pr->u.p.tc = 0;
      *p_lb = pr->u.p.lb;
      *p_ub = pr->u.p.ub;
#if KMP_OS_WINDOWS
      pr->u.p.last_upper = *p_ub;
#endif /* KMP_OS_WINDOWS */
      if (p_last != NULL)
        *p_last = TRUE;
      if (p_st != NULL)
        *p_st = pr->u.p.st;
    } // if
#ifdef KMP_DEBUG
    {
      const char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format(
          "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
          "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
          traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
      KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
      __kmp_str_free(&buff);
    }
#endif
#if INCLUDE_SSC_MARKS
    SSC_MARK_DISPATCH_NEXT();
#endif
    OMPT_LOOP_END;
    return status;
  } else {
    kmp_int32 last = 0;
    dispatch_shared_info_template<UT> *sh;
    T start;
    ST incr;
    UT limit, trip, init;

    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    KMP_DEBUG_ASSERT(pr);
    sh = reinterpret_cast<dispatch_shared_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(sh);

    if (pr->u.p.tc == 0) {
      // zero trip count
      status = 0;
    } else {
      switch (pr->schedule) {
#if (KMP_STATIC_STEAL_ENABLED)
      case kmp_sch_static_steal: {
        T chunk = pr->u.p.parm1;
        int nproc = th->th.th_team_nproc;

        KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n",
                       gtid));

        trip = pr->u.p.tc - 1;

        if (traits_t<T>::type_size > 4) {
          // use lock for 8-byte and CAS for 4-byte induction
          // variable. TODO (optional): check and use 16-byte CAS
          kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
          KMP_DEBUG_ASSERT(lck != NULL);
          if (pr->u.p.count < (UT)pr->u.p.ub) {
            __kmp_acquire_lock(lck, gtid);
            // try to get own chunk of iterations
            init = (pr->u.p.count)++;
            status = (init < (UT)pr->u.p.ub);
            __kmp_release_lock(lck, gtid);
          } else {
            status = 0; // no own chunks
          }
          if (!status) { // try to steal
            kmp_info_t **other_threads = team->t.t_threads;
            int while_limit = nproc; // nproc attempts to find a victim
            int while_index = 0;
            // TODO: algorithm of searching for a victim
            // should be cleaned up and measured
            while ((!status) && (while_limit != ++while_index)) {
              T remaining;
              T victimIdx = pr->u.p.parm4;
              T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
              dispatch_private_info_template<T> *victim =
                  reinterpret_cast<dispatch_private_info_template<T> *>(
                      other_threads[victimIdx]
                          ->th.th_dispatch->th_dispatch_pr_current);
              while ((victim == NULL || victim == pr ||
                      (*(volatile T *)&victim->u.p.static_steal_counter !=
                       *(volatile T *)&pr->u.p.static_steal_counter)) &&
                     oldVictimIdx != victimIdx) {
                victimIdx = (victimIdx + 1) % nproc;
                victim = reinterpret_cast<dispatch_private_info_template<T> *>(
                    other_threads[victimIdx]
                        ->th.th_dispatch->th_dispatch_pr_current);
              }
              if (!victim ||
                  (*(volatile T *)&victim->u.p.static_steal_counter !=
                   *(volatile T *)&pr->u.p.static_steal_counter)) {
                continue; // try once more (nproc attempts in total)
                // no victim is ready yet to participate in stealing
                // because all victims are still in kmp_init_dispatch
              }
              if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
                pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
                continue; // not enough chunks to steal, goto next victim
              }

              lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
              KMP_ASSERT(lck != NULL);
              __kmp_acquire_lock(lck, gtid);
              limit = victim->u.p.ub; // keep initial ub
              if (victim->u.p.count >= limit ||
                  (remaining = limit - victim->u.p.count) < 2) {
                __kmp_release_lock(lck, gtid);
                pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
                continue; // not enough chunks to steal
              }
              // stealing succeeded, reduce victim's ub by 1/4 of undone chunks
              // or by 1
              if (remaining > 3) {
                KMP_COUNT_VALUE(FOR_static_steal_stolen, remaining >> 2);
                init = (victim->u.p.ub -=
                        (remaining >> 2)); // steal 1/4 of remaining
              } else {
                KMP_COUNT_VALUE(FOR_static_steal_stolen, 1);
                init =
                    (victim->u.p.ub -= 1); // steal 1 chunk of 2 or 3 remaining
              }
              __kmp_release_lock(lck, gtid);

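              // Illustrative note (added commentary, not from the original
              // source): at this point 'init' is the victim's new (reduced) ub
              // and 'limit' is its old ub, so the thief now owns chunk indices
              // [init, limit); it runs chunk 'init' immediately and records
              // count = init + 1, ub = limit below for subsequent
              // __kmp_dispatch_next calls. E.g. with 16 chunks remaining at
              // the victim, remaining >> 2 = 4 of them are stolen.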
              KMP_DEBUG_ASSERT(init + 1 <= limit);
              pr->u.p.parm4 = victimIdx; // remember victim to steal from
              status = 1;
              while_index = 0;
              // now update own count and ub with stolen range but init chunk
              __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
              pr->u.p.count = init + 1;
              pr->u.p.ub = limit;
              __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
            } // while (search for victim)
          } // if (try to find victim and steal)
        } else {
          // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
          typedef union {
            struct {
              UT count;
              T ub;
            } p;
            kmp_int64 b;
          } union_i4;
          // All operations on 'count' or 'ub' must be combined atomically
          // together.
          {
            union_i4 vold, vnew;
            vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
            vnew = vold;
            vnew.p.count++;
            while (!KMP_COMPARE_AND_STORE_ACQ64(
                (volatile kmp_int64 *)&pr->u.p.count,
                *VOLATILE_CAST(kmp_int64 *) & vold.b,
                *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
              KMP_CPU_PAUSE();
              vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
              vnew = vold;
              vnew.p.count++;
            }
            vnew = vold;
            init = vnew.p.count;
            status = (init < (UT)vnew.p.ub);
          }

          if (!status) {
            kmp_info_t **other_threads = team->t.t_threads;
            int while_limit = nproc; // nproc attempts to find a victim
            int while_index = 0;

            // TODO: algorithm of searching for a victim
            // should be cleaned up and measured
            while ((!status) && (while_limit != ++while_index)) {
              union_i4 vold, vnew;
              kmp_int32 remaining;
              T victimIdx = pr->u.p.parm4;
              T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
              dispatch_private_info_template<T> *victim =
                  reinterpret_cast<dispatch_private_info_template<T> *>(
                      other_threads[victimIdx]
                          ->th.th_dispatch->th_dispatch_pr_current);
              while ((victim == NULL || victim == pr ||
                      (*(volatile T *)&victim->u.p.static_steal_counter !=
                       *(volatile T *)&pr->u.p.static_steal_counter)) &&
                     oldVictimIdx != victimIdx) {
                victimIdx = (victimIdx + 1) % nproc;
                victim = reinterpret_cast<dispatch_private_info_template<T> *>(
                    other_threads[victimIdx]
                        ->th.th_dispatch->th_dispatch_pr_current);
              }
              if (!victim ||
                  (*(volatile T *)&victim->u.p.static_steal_counter !=
                   *(volatile T *)&pr->u.p.static_steal_counter)) {
                continue; // try once more (nproc attempts in total)
                // no victim is ready yet to participate in stealing
                // because all victims are still in kmp_init_dispatch
              }
              pr->u.p.parm4 = victimIdx; // new victim found
              while (1) { // CAS loop if victim has enough chunks to steal
                vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
                vnew = vold;

                KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
                if (vnew.p.count >= (UT)vnew.p.ub ||
                    (remaining = vnew.p.ub - vnew.p.count) < 2) {
                  pr->u.p.parm4 =
                      (victimIdx + 1) % nproc; // shift start victim id
                  break; // not enough chunks to steal, goto next victim
                }
                if (remaining > 3) {
                  vnew.p.ub -= (remaining >> 2); // try to steal 1/4 remaining
                } else {
                  vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
                }
                KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
                // TODO: Should this be acquire or release?
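                // Note (added commentary, not from the original source):
                // because count and ub of a 4-byte induction variable fit
                // together in one 64-bit word (union_i4), this single CAS
                // lowers the victim's ub atomically with respect to the
                // victim's own (count, ub) updates; if either field changed
                // since vold was read, the CAS fails and the attempt is
                // retried.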
                if (KMP_COMPARE_AND_STORE_ACQ64(
                        (volatile kmp_int64 *)&victim->u.p.count,
                        *VOLATILE_CAST(kmp_int64 *) & vold.b,
                        *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
                  // stealing succeeded
                  KMP_COUNT_VALUE(FOR_static_steal_stolen,
                                  vold.p.ub - vnew.p.ub);
                  status = 1;
                  while_index = 0;
                  // now update own count and ub
                  init = vnew.p.ub;
                  vold.p.count = init + 1;
#if KMP_ARCH_X86
                  KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count),
                                   vold.b);
#else
                  *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
#endif
                  break;
                } // if (check CAS result)
                KMP_CPU_PAUSE(); // CAS failed, repeat attempt
              } // while (try to steal from particular victim)
            } // while (search for victim)
          } // if (try to find victim and steal)
        } // if (4-byte induction variable)
        if (!status) {
          *p_lb = 0;
          *p_ub = 0;
          if (p_st != NULL)
            *p_st = 0;
        } else {
          start = pr->u.p.parm2;
          init *= chunk;
          limit = chunk + init - 1;
          incr = pr->u.p.st;
          KMP_COUNT_VALUE(FOR_static_steal_chunks, 1);

          KMP_DEBUG_ASSERT(init <= trip);
          if ((last = (limit >= trip)) != 0)
            limit = trip;
          if (p_st != NULL)
            *p_st = incr;

          if (incr == 1) {
            *p_lb = start + init;
            *p_ub = start + limit;
          } else {
            *p_lb = start + init * incr;
            *p_ub = start + limit * incr;
          }

          if (pr->ordered) {
            pr->u.p.ordered_lower = init;
            pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
            {
              const char *buff;
              // create format specifiers before the debug output
              buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
                                      "ordered_lower:%%%s ordered_upper:%%%s\n",
                                      traits_t<UT>::spec, traits_t<UT>::spec);
              KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
                              pr->u.p.ordered_upper));
              __kmp_str_free(&buff);
            }
#endif
          } // if
        } // if
        break;
      } // case
#endif // ( KMP_STATIC_STEAL_ENABLED )
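      // Illustrative sketch (not part of the runtime): the 4-byte static_steal
      // path above packs the per-thread (count, ub) pair into one 64-bit word
      // (union_i4) so that both fields can be read and advanced with a single
      // 64-bit CAS, and a thief can shrink a victim's ub atomically with
      // respect to the victim's own count increments. The standalone fragment
      // below shows the same idea with std::atomic; Packed and claim_chunk are
      // hypothetical names, and the union type-punning mirrors what the
      // runtime itself does.
      //
      //   #include <atomic>
      //   #include <cstdint>
      //
      //   union Packed {
      //     struct { uint32_t count, ub; } p; // next chunk index, chunk limit
      //     uint64_t b;                       // the pair as one CAS-able word
      //   };
      //
      //   static bool claim_chunk(std::atomic<uint64_t> &word,
      //                           uint32_t &chunk_idx) {
      //     Packed vold, vnew;
      //     vold.b = word.load(std::memory_order_acquire);
      //     do {
      //       vnew = vold;
      //       if (vnew.p.count >= vnew.p.ub)
      //         return false;             // nothing left to claim
      //       vnew.p.count++;             // count and ub move together
      //     } while (!word.compare_exchange_weak(vold.b, vnew.b,
      //                                          std::memory_order_acq_rel));
      //     chunk_idx = vnew.p.count - 1; // index of the chunk just claimed
      //     return true;
      //   }
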
      case kmp_sch_static_balanced: {
        KD_TRACE(
            100,
            ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid));
        if ((status = !pr->u.p.count) !=
            0) { /* check if thread has any iteration to do */
          pr->u.p.count = 1;
          *p_lb = pr->u.p.lb;
          *p_ub = pr->u.p.ub;
          last = pr->u.p.parm1;
          if (p_st != NULL)
            *p_st = pr->u.p.st;
        } else { /* no iterations to do */
          pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
        }
        if (pr->ordered) {
#ifdef KMP_DEBUG
          {
            const char *buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
                                    "ordered_lower:%%%s ordered_upper:%%%s\n",
                                    traits_t<UT>::spec, traits_t<UT>::spec);
            KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
                            pr->u.p.ordered_upper));
            __kmp_str_free(&buff);
          }
#endif
        } // if
      } // case
      break;
      case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy
                                     was merged here */
      case kmp_sch_static_chunked: {
        T parm1;

        KD_TRACE(100, ("__kmp_dispatch_next: T#%d "
                       "kmp_sch_static_[affinity|chunked] case\n",
                       gtid));
        parm1 = pr->u.p.parm1;

        trip = pr->u.p.tc - 1;
        init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));

        if ((status = (init <= trip)) != 0) {
          start = pr->u.p.lb;
          incr = pr->u.p.st;
          limit = parm1 + init - 1;

          if ((last = (limit >= trip)) != 0)
            limit = trip;

          if (p_st != NULL)
            *p_st = incr;

          pr->u.p.count += th->th.th_team_nproc;

          if (incr == 1) {
            *p_lb = start + init;
            *p_ub = start + limit;
          } else {
            *p_lb = start + init * incr;
            *p_ub = start + limit * incr;
          }

          if (pr->ordered) {
            pr->u.p.ordered_lower = init;
            pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
            {
              const char *buff;
              // create format specifiers before the debug output
              buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
                                      "ordered_lower:%%%s ordered_upper:%%%s\n",
                                      traits_t<UT>::spec, traits_t<UT>::spec);
              KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
                              pr->u.p.ordered_upper));
              __kmp_str_free(&buff);
            }
#endif
          } // if
        } // if
      } // case
      break;

      case kmp_sch_dynamic_chunked: {
        T chunk = pr->u.p.parm1;

        KD_TRACE(
            100,
            ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid));

        init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
        trip = pr->u.p.tc - 1;

        if ((status = (init <= trip)) == 0) {
          *p_lb = 0;
          *p_ub = 0;
          if (p_st != NULL)
            *p_st = 0;
        } else {
          start = pr->u.p.lb;
          limit = chunk + init - 1;
          incr = pr->u.p.st;

          if ((last = (limit >= trip)) != 0)
            limit = trip;

          if (p_st != NULL)
            *p_st = incr;

          if (incr == 1) {
            *p_lb = start + init;
            *p_ub = start + limit;
          } else {
            *p_lb = start + init * incr;
            *p_ub = start + limit * incr;
          }

          if (pr->ordered) {
            pr->u.p.ordered_lower = init;
            pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
            {
              const char *buff;
              // create format specifiers before the debug output
              buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
                                      "ordered_lower:%%%s ordered_upper:%%%s\n",
                                      traits_t<UT>::spec, traits_t<UT>::spec);
              KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
                              pr->u.p.ordered_upper));
              __kmp_str_free(&buff);
            }
#endif
          } // if
        } // if
      } // case
      break;

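      // Illustrative sketch (not part of the runtime): kmp_sch_dynamic_chunked
      // above hands out fixed-size chunks by atomically bumping the shared
      // chunk counter sh->u.s.iteration and scaling the old value by the chunk
      // size. The fragment below shows the same bookkeeping with std::atomic;
      // next_chunk and its parameters are hypothetical names.
      //
      //   #include <atomic>
      //   #include <cstdint>
      //
      //   // Returns false once the loop is exhausted, else the next [lb, ub].
      //   static bool next_chunk(std::atomic<uint64_t> &chunk_counter,
      //                          uint64_t trip_count, uint64_t chunk,
      //                          uint64_t &lb, uint64_t &ub) {
      //     uint64_t init = chunk * chunk_counter.fetch_add(1);
      //     if (init >= trip_count)
      //       return false;            // all iterations already claimed
      //     lb = init;
      //     ub = init + chunk - 1;
      //     if (ub >= trip_count)
      //       ub = trip_count - 1;     // clip the final, possibly short, chunk
      //     return true;
      //   }
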
      case kmp_sch_guided_iterative_chunked: {
        T chunkspec = pr->u.p.parm1;
        KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked "
                       "iterative case\n",
                       gtid));
        trip = pr->u.p.tc;
        // Start atomic part of calculations
        while (1) {
          ST remaining; // signed, because can be < 0
          init = sh->u.s.iteration; // shared value
          remaining = trip - init;
          if (remaining <= 0) { // AC: need to compare with 0 first
            // nothing to do, don't try atomic op
            status = 0;
            break;
          }
          if ((T)remaining <
              pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
            // use dynamic-style schedule
            // atomically increment iterations, get old value
            init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                     (ST)chunkspec);
            remaining = trip - init;
            if (remaining <= 0) {
              status = 0; // all iterations got by other threads
            } else { // got some iterations to work on
              status = 1;
              if ((T)remaining > chunkspec) {
                limit = init + chunkspec - 1;
              } else {
                last = 1; // the last chunk
                limit = init + remaining - 1;
              } // if
            } // if
            break;
          } // if
          limit = init + (UT)(remaining *
                              *(double *)&pr->u.p.parm3); // divide by K*nproc
          if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                   (ST)init, (ST)limit)) {
            // CAS was successful, chunk obtained
            status = 1;
            --limit;
            break;
          } // if
        } // while
        if (status != 0) {
          start = pr->u.p.lb;
          incr = pr->u.p.st;
          if (p_st != NULL)
            *p_st = incr;
          *p_lb = start + init * incr;
          *p_ub = start + limit * incr;
          if (pr->ordered) {
            pr->u.p.ordered_lower = init;
            pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
            {
              const char *buff;
              // create format specifiers before the debug output
              buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
                                      "ordered_lower:%%%s ordered_upper:%%%s\n",
                                      traits_t<UT>::spec, traits_t<UT>::spec);
              KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
                              pr->u.p.ordered_upper));
              __kmp_str_free(&buff);
            }
#endif
          } // if
        } else {
          *p_lb = 0;
          *p_ub = 0;
          if (p_st != NULL)
            *p_st = 0;
        } // if
      } // case
      break;

      case kmp_sch_guided_simd: {
        // same as iterative but curr-chunk adjusted to be multiple of given
        // chunk
        T chunk = pr->u.p.parm1;
        KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_simd case\n",
                       gtid));
        trip = pr->u.p.tc;
        // Start atomic part of calculations
        while (1) {
          ST remaining; // signed, because can be < 0
          init = sh->u.s.iteration; // shared value
          remaining = trip - init;
          if (remaining <= 0) { // AC: need to compare with 0 first
            status = 0; // nothing to do, don't try atomic op
            break;
          }
          KMP_DEBUG_ASSERT(init % chunk == 0);
          // compare with K*nproc*(chunk+1), K=2 by default
          if ((T)remaining < pr->u.p.parm2) {
            // use dynamic-style schedule
            // atomically increment iterations, get old value
            init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                     (ST)chunk);
            remaining = trip - init;
            if (remaining <= 0) {
              status = 0; // all iterations got by other threads
            } else {
              // got some iterations to work on
              status = 1;
              if ((T)remaining > chunk) {
                limit = init + chunk - 1;
              } else {
                last = 1; // the last chunk
                limit = init + remaining - 1;
              } // if
            } // if
            break;
          } // if
          // divide by K*nproc
          UT span = remaining * (*(double *)&pr->u.p.parm3);
          UT rem = span % chunk;
          if (rem) // adjust so that span%chunk == 0
            span += chunk - rem;
          limit = init + span;
          if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                   (ST)init, (ST)limit)) {
            // CAS was successful, chunk obtained
            status = 1;
            --limit;
            break;
          } // if
        } // while
        if (status != 0) {
          start = pr->u.p.lb;
          incr = pr->u.p.st;
          if (p_st != NULL)
            *p_st = incr;
          *p_lb = start + init * incr;
          *p_ub = start + limit * incr;
          if (pr->ordered) {
            pr->u.p.ordered_lower = init;
            pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
            {
              const char *buff;
              // create format specifiers before the debug output
              buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
                                      "ordered_lower:%%%s ordered_upper:%%%s\n",
                                      traits_t<UT>::spec, traits_t<UT>::spec);
              KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
                              pr->u.p.ordered_upper));
              __kmp_str_free(&buff);
            }
#endif
          } // if
        } else {
          *p_lb = 0;
          *p_ub = 0;
          if (p_st != NULL)
            *p_st = 0;
        } // if
      } // case
      break;

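      // Illustrative sketch (not part of the runtime): both guided cases above
      // carve off roughly remaining/(K*nproc) iterations per request (the
      // factor 1/(K*nproc) is precomputed into parm3 as a double), so chunk
      // sizes decay geometrically, and they fall back to fixed-size chunks
      // once few iterations remain. The hypothetical helper below is a
      // simplified, single-threaded rendering of that rule with K = 2 (the
      // default mentioned in the comments above); it only prints the chunk
      // sequence such a rule would produce.
      //
      //   #include <cstdio>
      //
      //   static void show_guided_chunks(long trip, int nproc, long min_chunk) {
      //     const double factor = 1.0 / (2.0 * nproc); // 1/(K*nproc), K = 2
      //     long handed_out = 0;
      //     while (handed_out < trip) {
      //       long remaining = trip - handed_out;
      //       long chunk = (long)(remaining * factor);
      //       if (chunk < min_chunk) // small remainder: dynamic-style chunks
      //         chunk = remaining < min_chunk ? remaining : min_chunk;
      //       std::printf("chunk of %ld starting at %ld\n", chunk, handed_out);
      //       handed_out += chunk;
      //     }
      //   }
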
      case kmp_sch_guided_analytical_chunked: {
        T chunkspec = pr->u.p.parm1;
        UT chunkIdx;
#if KMP_OS_WINDOWS && KMP_ARCH_X86
        /* for storing original FPCW value for Windows* OS on
           IA-32 architecture 8-byte version */
        unsigned int oldFpcw;
        unsigned int fpcwSet = 0;
#endif
        KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked "
                       "analytical case\n",
                       gtid));

        trip = pr->u.p.tc;

        KMP_DEBUG_ASSERT(th->th.th_team_nproc > 1);
        KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)th->th.th_team_nproc <
                         trip);

        while (1) { /* this while loop is a safeguard against unexpected zero
                       chunk sizes */
          chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
          if (chunkIdx >= (UT)pr->u.p.parm2) {
            --trip;
            /* use dynamic-style scheduling */
            init = chunkIdx * chunkspec + pr->u.p.count;
            /* need to verify init > 0 in case of overflow in the above
             * calculation */
            if ((status = (init > 0 && init <= trip)) != 0) {
              limit = init + chunkspec - 1;

              if ((last = (limit >= trip)) != 0)
                limit = trip;
            }
            break;
          } else {
            /* use exponential-style scheduling */
            /* The following check is to work around the lack of long double
               precision on Windows* OS.
               This check works around the possible effect that init != 0 for
               chunkIdx == 0. */
#if KMP_OS_WINDOWS && KMP_ARCH_X86
            /* If we haven't already done so, save original FPCW and set
               precision to 64-bit, as Windows* OS on IA-32 architecture
               defaults to 53-bit */
            if (!fpcwSet) {
              oldFpcw = _control87(0, 0);
              _control87(_PC_64, _MCW_PC);
              fpcwSet = 0x30000;
            }
#endif
            if (chunkIdx) {
              init = __kmp_dispatch_guided_remaining<T>(
                  trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
              KMP_DEBUG_ASSERT(init);
              init = trip - init;
            } else
              init = 0;
            limit = trip - __kmp_dispatch_guided_remaining<T>(
                               trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
            KMP_ASSERT(init <= limit);
            if (init < limit) {
              KMP_DEBUG_ASSERT(limit <= trip);
              --limit;
              status = 1;
              break;
            } // if
          } // if
        } // while (1)
#if KMP_OS_WINDOWS && KMP_ARCH_X86
        /* restore FPCW if necessary
           AC: check fpcwSet flag first because oldFpcw can be uninitialized
           here */
        if (fpcwSet && (oldFpcw & fpcwSet))
          _control87(oldFpcw, _MCW_PC);
#endif
        if (status != 0) {
          start = pr->u.p.lb;
          incr = pr->u.p.st;
          if (p_st != NULL)
            *p_st = incr;
          *p_lb = start + init * incr;
          *p_ub = start + limit * incr;
          if (pr->ordered) {
            pr->u.p.ordered_lower = init;
            pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
            {
              const char *buff;
              // create format specifiers before the debug output
              buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
                                      "ordered_lower:%%%s ordered_upper:%%%s\n",
                                      traits_t<UT>::spec, traits_t<UT>::spec);
              KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
                              pr->u.p.ordered_upper));
              __kmp_str_free(&buff);
            }
#endif
          }
        } else {
          *p_lb = 0;
          *p_ub = 0;
          if (p_st != NULL)
            *p_st = 0;
        }
      } // case
      break;

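      // Illustrative note (not part of the runtime): in the kmp_sch_trapezoidal
      // case below, chunk sizes shrink linearly. Reading the formulas in the
      // case body, parm2 is the size of the first chunk, parm3 the number of
      // chunks and parm4 the per-chunk decrement, so chunk i has
      // parm2 - i*parm4 iterations and its first iteration is the arithmetic
      // series sum i*(2*parm2 - (i-1)*parm4)/2 -- exactly the expression used
      // for 'init'. The hypothetical check below confirms that closed form
      // against direct summation.
      //
      //   #include <cassert>
      //
      //   static void check_trapezoid_starts(long p2, long p3, long p4) {
      //     long summed = 0; // start of chunk i obtained by summing sizes
      //     for (long i = 0; i < p3; ++i) {
      //       long closed_form = (i * (2 * p2 - (i - 1) * p4)) / 2;
      //       assert(summed == closed_form);
      //       summed += p2 - i * p4; // size of chunk i
      //     }
      //   }
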
      case kmp_sch_trapezoidal: {
        UT index;
        T parm2 = pr->u.p.parm2;
        T parm3 = pr->u.p.parm3;
        T parm4 = pr->u.p.parm4;
        KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
                       gtid));

        index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);

        init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
        trip = pr->u.p.tc - 1;

        if ((status = ((T)index < parm3 && init <= trip)) == 0) {
          *p_lb = 0;
          *p_ub = 0;
          if (p_st != NULL)
            *p_st = 0;
        } else {
          start = pr->u.p.lb;
          limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
          incr = pr->u.p.st;

          if ((last = (limit >= trip)) != 0)
            limit = trip;

          if (p_st != NULL)
            *p_st = incr;

          if (incr == 1) {
            *p_lb = start + init;
            *p_ub = start + limit;
          } else {
            *p_lb = start + init * incr;
            *p_ub = start + limit * incr;
          }

          if (pr->ordered) {
            pr->u.p.ordered_lower = init;
            pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
            {
              const char *buff;
              // create format specifiers before the debug output
              buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
                                      "ordered_lower:%%%s ordered_upper:%%%s\n",
                                      traits_t<UT>::spec, traits_t<UT>::spec);
              KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
                              pr->u.p.ordered_upper));
              __kmp_str_free(&buff);
            }
#endif
          } // if
        } // if
      } // case
      break;
      default: {
        status = 0; // to avoid complaints on uninitialized variable use
        __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                    KMP_HNT(GetNewerLibrary), // Hint
                    __kmp_msg_null // Variadic argument list terminator
                    );
      } break;
      } // switch
    } // if tc == 0;

    if (status == 0) {
      UT num_done;

      num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
            traits_t<UT>::spec);
        KD_TRACE(100, (buff, gtid, sh->u.s.num_done));
        __kmp_str_free(&buff);
      }
#endif

      if ((ST)num_done == th->th.th_team_nproc - 1) {
#if (KMP_STATIC_STEAL_ENABLED)
        if (pr->schedule == kmp_sch_static_steal &&
            traits_t<T>::type_size > 4) {
          int i;
          kmp_info_t **other_threads = team->t.t_threads;
          // loop complete, safe to destroy locks used for stealing
          for (i = 0; i < th->th.th_team_nproc; ++i) {
            kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
            KMP_ASSERT(lck != NULL);
            __kmp_destroy_lock(lck);
            __kmp_free(lck);
            other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
          }
        }
#endif
        /* NOTE: release this buffer to be reused */

        KMP_MB(); /* Flush all pending memory write invalidates. */

        sh->u.s.num_done = 0;
        sh->u.s.iteration = 0;

        /* TODO replace with general release procedure? */
        if (pr->ordered) {
          sh->u.s.ordered_iteration = 0;
        }

        KMP_MB(); /* Flush all pending memory write invalidates. */

        sh->buffer_index += __kmp_dispatch_num_buffers;
        KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
                       gtid, sh->buffer_index));

        KMP_MB(); /* Flush all pending memory write invalidates. */

      } // if
      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        }
      }

      th->th.th_dispatch->th_deo_fcn = NULL;
      th->th.th_dispatch->th_dxo_fcn = NULL;
      th->th.th_dispatch->th_dispatch_sh_current = NULL;
      th->th.th_dispatch->th_dispatch_pr_current = NULL;
    } // if (status == 0)
#if KMP_OS_WINDOWS
    else if (last) {
      pr->u.p.last_upper = pr->u.p.ub;
    }
#endif /* KMP_OS_WINDOWS */
    if (p_last != NULL && status != 0)
      *p_last = last;
  } // if

#ifdef KMP_DEBUG
  {
    const char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_next: T#%%d normal case: "
        "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status));
    __kmp_str_free(&buff);
  }
#endif
#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_NEXT();
#endif
  OMPT_LOOP_END;
  return status;
}

template <typename T>
static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
                                  kmp_int32 *plastiter, T *plower, T *pupper,
                                  typename traits_t<T>::signed_t incr) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  kmp_uint32 team_id;
  kmp_uint32 nteams;
  UT trip_count;
  kmp_team_t *team;
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
#ifdef KMP_DEBUG
  {
    const char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
                            "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<T>::spec);
    KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
    __kmp_str_free(&buff);
  }
#endif

  if (__kmp_env_consistency_check) {
    if (incr == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
                            loc);
    }
    if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
      // The loop is illegal.
      // Some zero-trip loops maintained by compiler, e.g.:
      //   for(i=10;i<0;++i)  // lower >= upper - run-time check
      //   for(i=0;i>10;--i)  // lower <= upper - run-time check
      //   for(i=0;i>10;++i)  // incr > 0      - compile-time check
      //   for(i=10;i<0;--i)  // incr < 0      - compile-time check
      // Compiler does not check the following illegal loops:
      //   for(i=0;i<10;i+=incr)  // where incr<0
      //   for(i=10;i>0;i-=incr)  // where incr<0
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
    }
  }
  th = __kmp_threads[gtid];
  team = th->th.th_team;
#if OMP_40_ENABLED
  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
  nteams = th->th.th_teams_size.nteams;
#endif
  team_id = team->t.t_master_tid;
  KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);

  // compute global trip count
  if (incr == 1) {
    trip_count = *pupper - *plower + 1;
  } else if (incr == -1) {
    trip_count = *plower - *pupper + 1;
  } else if (incr > 0) {
    // upper-lower can exceed the limit of signed type
    trip_count = (UT)(*pupper - *plower) / incr + 1;
  } else {
    trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
  }

  if (trip_count <= nteams) {
    KMP_DEBUG_ASSERT(
        __kmp_static == kmp_sch_static_greedy ||
        __kmp_static ==
            kmp_sch_static_balanced); // Unknown static scheduling type.
    // only some teams get single iteration, others get nothing
    if (team_id < trip_count) {
      *pupper = *plower = *plower + team_id * incr;
    } else {
      *plower = *pupper + incr; // zero-trip loop
    }
    if (plastiter != NULL)
      *plastiter = (team_id == trip_count - 1);
  } else {
    if (__kmp_static == kmp_sch_static_balanced) {
      UT chunk = trip_count / nteams;
      UT extras = trip_count % nteams;
      *plower +=
          incr * (team_id * chunk + (team_id < extras ? team_id : extras));
      *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
      if (plastiter != NULL)
        *plastiter = (team_id == nteams - 1);
    } else {
      T chunk_inc_count =
          (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
      T upper = *pupper;
      KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
      // Unknown static scheduling type.
      *plower += team_id * chunk_inc_count;
      *pupper = *plower + chunk_inc_count - incr;
      // Check/correct bounds if needed
      if (incr > 0) {
        if (*pupper < *plower)
          *pupper = traits_t<T>::max_value;
        if (plastiter != NULL)
          *plastiter = *plower <= upper && *pupper > upper - incr;
        if (*pupper > upper)
          *pupper = upper; // tracker C73258
      } else {
        if (*pupper > *plower)
          *pupper = traits_t<T>::min_value;
        if (plastiter != NULL)
          *plastiter = *plower >= upper && *pupper < upper - incr;
        if (*pupper < upper)
          *pupper = upper; // tracker C73258
      }
    }
  }
}

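// Illustrative sketch (not part of the runtime): for the balanced case above,
// each team receives trip_count/nteams iterations and the first
// trip_count%nteams teams receive one extra, which is what the *plower/*pupper
// updates compute. The hypothetical helper below performs the same split for a
// unit-stride loop over [0, trip_count) and returns the half-open range owned
// by team_id.
//
//   #include <cstdint>
//
//   struct TeamRange { uint64_t begin, end; }; // half-open [begin, end)
//
//   static TeamRange team_bounds(uint64_t trip_count, uint32_t nteams,
//                                uint32_t team_id) {
//     uint64_t chunk = trip_count / nteams;
//     uint64_t extras = trip_count % nteams;
//     uint64_t begin = team_id * chunk + (team_id < extras ? team_id : extras);
//     uint64_t end = begin + chunk + (team_id < extras ? 1 : 0);
//     return {begin, end}; // teams with team_id < extras own one extra iter
//   }
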
//-----------------------------------------------------------------------------
// Dispatch routines
// Transfer call to template< type T >
// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
//                      T lb, T ub, ST st, ST chunk )
extern "C" {

/*!
@ingroup WORK_SHARING
@{
@param loc Source location
@param gtid Global thread id
@param schedule Schedule type
@param lb Lower bound
@param ub Upper bound
@param st Step (or increment if you prefer)
@param chunk The chunk size to block with

This function prepares the runtime to start a dynamically scheduled for loop,
saving the loop arguments.
These functions are all identical apart from the types of the arguments.
*/
void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int32 lb,
                            kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint32 lb,
                             kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int64 lb,
                            kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint64 lb,
                             kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4

These functions differ from the __kmpc_dispatch_init set in that they are
called for the composite distribute parallel for construct, so the per-team
iteration space has to be computed before the regular iterations are
dispatched.

These functions are all identical apart from the types of the arguments.
*/
void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
                                 kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
                                  kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
                                 kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
                                  kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
@param loc Source code location
@param gtid Global thread id
@param p_last Pointer to a flag set to one if this is the last chunk or zero
otherwise
@param p_lb Pointer to the lower bound for the next chunk of work
@param p_ub Pointer to the upper bound for the next chunk of work
@param p_st Pointer to the stride for the next chunk of work
@return one if there is work to be done, zero otherwise

Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then the lb, ub and stride need not be modified.
*/
int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st);
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint32 *p_lb, kmp_uint32 *p_ub,
                            kmp_int32 *p_st) {
  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st);
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st);
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint64 *p_lb, kmp_uint64 *p_ub,
                            kmp_int64 *p_st) {
  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st);
}

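// Illustrative sketch (not part of the runtime): a compiler typically lowers a
// dynamically scheduled worksharing loop into one __kmpc_dispatch_init_* call
// followed by a __kmpc_dispatch_next_* polling loop. The outlined function
// below is hypothetical (loc, gtid and do_work are stand-ins) and only shows
// the intended calling pattern for a loop such as
//   #pragma omp for schedule(dynamic, 4)
//   for (int i = 0; i < 1000; ++i) do_work(i);
//
//   static void outlined_loop(ident_t *loc, kmp_int32 gtid) {
//     kmp_int32 lb, ub, st, last;
//     __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked,
//                            /*lb=*/0, /*ub=*/999, /*st=*/1, /*chunk=*/4);
//     while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
//       for (kmp_int32 i = lb; i <= ub; i += st)
//         do_work(i); // hypothetical loop body
//     }
//   }
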
/*!
@param loc Source code location
@param gtid Global thread id

Mark the end of a dynamic loop.
*/
void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}
/*! @} */

//-----------------------------------------------------------------------------
// Non-template routines from kmp_dispatch.cpp used in other sources

kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value == checker;
}

kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value != checker;
}

kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
  return value < checker;
}

kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
  return value >= checker;
}

kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
  return value <= checker;
}

kmp_uint32
__kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
                   kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
                   void *obj // Higher-level synchronization object, or NULL.
                   ) {
  // note: we may not belong to a team at this point
  volatile kmp_uint32 *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
  kmp_uint32 r;

  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(r = TCR_4(*spin), check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* GEH - remove this since it was accidentally introduced when kmp_wait was
       split. It causes problems with infinite recursion because of exit lock */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
        __kmp_abort_thread(); */

    /* if we have waited a bit, or are oversubscribed, yield */
    /* pause is in the following code */
    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
    KMP_YIELD_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}

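// Illustrative sketch (not part of the runtime): __kmp_wait_yield_4 above
// spins on a 32-bit location until the supplied predicate is satisfied,
// yielding when the machine is oversubscribed. A hypothetical caller that
// blocks until a flag becomes exactly 1 could combine it with the __kmp_eq_4
// predicate defined above:
//
//   volatile kmp_uint32 flag = 0; // published by some other thread
//   ...
//   // returns the observed value once flag == 1; NULL: no ITT sync object
//   kmp_uint32 seen = __kmp_wait_yield_4(&flag, 1, __kmp_eq_4, NULL);
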
void __kmp_wait_yield_4_ptr(
    void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32),
    void *obj // Higher-level synchronization object, or NULL.
    ) {
  // note: we may not belong to a team at this point
  void *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(void *, kmp_uint32) = pred;

  KMP_FSYNC_SPIN_INIT(obj, spin);
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* if we have waited a bit, or are oversubscribed, yield */
    /* pause is in the following code */
    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
    KMP_YIELD_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
}

} // extern "C"

#ifdef KMP_GOMP_COMPAT

void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int32 lb,
                               kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint32 lb,
                                kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int64 lb,
                               kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint64 lb,
                                kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

#endif /* KMP_GOMP_COMPAT */

/* ------------------------------------------------------------------------ */