/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 */

//===----------------------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//

/* Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
 * it may change values between parallel regions. __kmp_max_nth
 * is the largest value __kmp_nth may take, 1 is the smallest.
 */

// Need to raise Win version from XP to Vista here for support of
// InterlockedExchange64
#if defined(_WIN32_WINNT) && defined(_M_IX86)
#undef _WIN32_WINNT
#define _WIN32_WINNT 0x0502
#endif

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
#include <float.h>
#endif

#if OMPT_SUPPORT
#include "ompt-internal.h"
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */

#if KMP_STATIC_STEAL_ENABLED

// Per-thread dispatch bookkeeping, templated on the loop index type T.
// Replaces the fixed-width dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types.
// NOTE(review): field order differs from the non-steal variant below; the
// overall size must stay equal to dispatch_private_info (checked by
// KMP_BUILD_ASSERT in __kmp_dispatch_init).
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT; // unsigned flavor of T
  typedef typename traits_t<T>::signed_t ST; // signed flavor of T
  UT count; // unsigned
  T ub;
  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
  T lb;
  ST st; // signed
  UT tc; // unsigned
  T static_steal_counter; // for static_steal only; maybe better to put after ub

  /* parm[1-4] are used in different ways by different scheduling algorithms */

  // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
  //    a) parm3 is properly aligned and
  //    b) all parm1-4 are in the same cache line.
  // Because parm1-4 are used together, performance seems to be better
  // if they are in the same line (not measured though).

  struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
    T parm1;
    T parm2;
    T parm3;
    T parm4;
  };

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};

#else /* KMP_STATIC_STEAL_ENABLED */

// Non-steal layout of the per-thread dispatch record; replaces
// dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types.
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT; // unsigned flavor of T
  typedef typename traits_t<T>::signed_t ST; // signed flavor of T
  T lb;
  T ub;
  ST st; // signed
  UT tc; // unsigned

  /* parm[1-4] are used in different ways by different scheduling algorithms */
  T parm1;
  T parm2;
  T parm3;
  T parm4;

  UT count; // unsigned

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};

#endif /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info structure and dispatch_private_info_t type
template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
  // duplicate alignment here, otherwise size of structure is not correct in our
  // compiler
  union KMP_ALIGN_CACHE private_info_tmpl {
    dispatch_private_infoXX_template<T> p;
    dispatch_private_info64_t p64; // keeps the union at the 64-bit layout size
  } u;
  enum sched_type schedule; /* scheduling algorithm */
  kmp_uint32 ordered; /* ordered clause specified */
  kmp_uint32 ordered_bumped;
  // To retain the structure size after making ordered_iteration scalar
  kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
  dispatch_private_info *next; /* stack of buffers for nest of serial regions */
  kmp_uint32 nomerge; /* don't merge iters if serialized */
  kmp_uint32 type_size; // size of loop index type, recorded at dispatch init
  enum cons_type pushed_ws; // workshare type pushed for consistency checking
};

// Team-shared dispatch state, templated on the unsigned index type UT.
// Replaces dispatch_shared_info{32,64} structures and
// dispatch_shared_info{32,64}_t types.
template <typename UT> struct dispatch_shared_infoXX_template {
  /* chunk index under dynamic, number of idle threads under static-steal;
     iteration index otherwise */
  volatile UT iteration;
  volatile UT num_done; // number of threads that finished this loop
  volatile UT ordered_iteration; // ordered-section ticket being serviced
  // to retain the structure size making ordered_iteration scalar
  UT ordered_dummy[KMP_MAX_ORDERED - 3];
};

// replaces dispatch_shared_info structure and dispatch_shared_info_t type
template <typename UT> struct dispatch_shared_info_template {
  // we need union here to keep the structure size
  union shared_info_tmpl {
    dispatch_shared_infoXX_template<UT> s;
    dispatch_shared_info64_t s64;
  } u;
  volatile kmp_uint32 buffer_index;
#if OMP_45_ENABLED
  volatile kmp_int32 doacross_buf_idx; // teamwise index
  kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
  kmp_int32 doacross_num_done; // count finished threads
#endif
#if KMP_USE_HWLOC
  // When linking with libhwloc, the ORDERED EPCC test slows down on big
  // machines (> 48 cores). Performance analysis showed that a cache thrash
  // was occurring and this padding helps alleviate the problem.
  char padding[64];
#endif
};

/* ------------------------------------------------------------------------ */

#undef USE_TEST_LOCKS

// test_then_add template (general template should NOT be used);
// atomically adds d to *p via the width-matched KMP_TEST_THEN_ADD primitive
// and returns that primitive's result.
template <typename T> static __forceinline T test_then_add(volatile T *p, T d);

template <>
__forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
                                                 kmp_int32 d) {
  kmp_int32 r;
  r = KMP_TEST_THEN_ADD32(CCAST(kmp_int32 *, p), d);
  return r;
}

template <>
__forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
                                                 kmp_int64 d) {
  kmp_int64 r;
  r = KMP_TEST_THEN_ADD64(CCAST(kmp_int64 *, p), d);
  return r;
}

// test_then_inc_acq template (general template should NOT be used);
// atomic increment with acquire semantics (per the _ACQ primitive name —
// exact ordering is defined by KMP_TEST_THEN_INC_ACQ{32,64} in kmp.h).
template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC_ACQ32(CCAST(kmp_int32 *, p));
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC_ACQ64(CCAST(kmp_int64 *, p));
  return r;
}

// test_then_inc template (general template should NOT be used);
// plain atomic increment via KMP_TEST_THEN_INC{32,64}.
template <typename T> static __forceinline T test_then_inc(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC32(CCAST(kmp_int32 *, p));
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC64(CCAST(kmp_int64 *, p));
  return r;
}

// compare_and_swap template (general template should NOT be used);
// returns nonzero iff *p was equal to c and was replaced by s.
template <typename T>
static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
                                                    kmp_int32 c, kmp_int32 s) {
  return KMP_COMPARE_AND_STORE_REL32(p, c, s);
}

// NOTE: specialized on kmp_int64 operands; the return stays kmp_int32
// (success/failure flag), matching the primary template's signature.
template <>
__forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
                                                    kmp_int64 c, kmp_int64 s) {
  return KMP_COMPARE_AND_STORE_REL64(p, c, s);
}

/* Spin wait loop that first does pause, then yield.
   Waits until function returns non-zero when called with *spinner and check.
   Does NOT put threads to sleep.
#if USE_ITT_BUILD
   Arguments:
       obj -- is higher-level synchronization object to report to ittnotify.
       It is used to report locks consistently. For example, if lock is
       acquired immediately, its address is reported to ittnotify via
       KMP_FSYNC_ACQUIRED(). However, if lock cannot be acquired immediately
       and lock routine calls to KMP_WAIT_YIELD(), the latter should report the
       same address, not an address of low-level spinner.
#endif // USE_ITT_BUILD
*/
template <typename UT>
// ToDo: make inline function (move to header file for icl)
static UT // unsigned 4- or 8-byte type
__kmp_wait_yield(
    volatile UT *spinner, UT checker,
    kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(
        void *obj) // Higher-level synchronization object, or NULL.
) {
  // note: we may not belong to a team at this point
  volatile UT *spin = spinner;
  UT check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(UT, UT) = pred;
  UT r;

  KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
  KMP_INIT_YIELD(spins);
  // main wait spin loop: re-reads *spin each iteration until pred holds
  while (!f(r = *spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* GEH - remove this since it was accidentally introduced when kmp_wait was
       split. It causes problems with infinite recursion because of exit lock */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
        __kmp_abort_thread(); */

    // if we are oversubscribed, or have waited a bit (and
    // KMP_LIBRARY=throughput), then yield. pause is in the following code
    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
    KMP_YIELD_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}

// Comparison predicates usable with __kmp_wait_yield (value vs. checker).

template <typename UT> static kmp_uint32 __kmp_eq(UT value, UT checker) {
  return value == checker;
}

template <typename UT> static kmp_uint32 __kmp_neq(UT value, UT checker) {
  return value != checker;
}

template <typename UT> static kmp_uint32 __kmp_lt(UT value, UT checker) {
  return value < checker;
}

template <typename UT> static kmp_uint32 __kmp_ge(UT value, UT checker) {
  return value >= checker;
}

template <typename UT> static kmp_uint32 __kmp_le(UT value, UT checker) {
  return value <= checker;
}

/* ------------------------------------------------------------------------ */

// Ordered-section entry hook used when the dispatch buffer holds no valid
// ordered state; only performs the consistency-check push, no waiting.
static void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref,
                                     ident_t *loc_ref) {
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(gtid_ref);

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_root->r.r_active &&
        (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }
}

// Ordered-section entry: spin/yield-waits until the team's shared
// ordered_iteration has reached this thread's ordered_lower, so ordered
// blocks execute in iteration order. Serialized teams skip the wait.
template <typename UT>
static void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  typedef typename traits_t<UT>::signed_t ST;
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    UT lower;

    // pr was already loaded above when consistency checking is on
    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }
    lower = pr->u.p.ordered_lower;

#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      // A second entry without an intervening exit means a nested/repeated
      // ordered in the same chunk — report it.
      if (pr->ordered_bumped) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB();
#ifdef KMP_DEBUG
    {
      const char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif

    // Block until ordered_iteration >= our ordered_lower.
    __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
                         __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
    KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
    {
      const char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
  }
  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
}

// Ordered-section exit hook for the error/no-state case; only pops the
// consistency-check record.
static void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref,
                                     ident_t *loc_ref) {
  kmp_info_t *th;

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
      __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
    }
  }
}

// Ordered-section exit: atomically bumps the team's shared ordered_iteration,
// releasing the thread waiting in __kmp_dispatch_deo for the next iteration.
template <typename UT>
static void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  typedef typename traits_t<UT>::signed_t ST;
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
      __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }

    KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped != 0) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        /* How to test it? - OM */
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB(); /* Flush all pending memory write invalidates. */

    pr->ordered_bumped += 1;

    KD_TRACE(1000,
             ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
              gtid, pr->ordered_bumped));

    KMP_MB(); /* Flush all pending memory write invalidates. */

    /* TODO use general release procedure? */
    test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);

    KMP_MB(); /* Flush all pending memory write invalidates. */
  }
  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
}

// Computes and returns x to the power of y, where y must be a non-negative
// integer, via binary exponentiation (O(log y) multiplies).
// NOTE: the assert restricts x to (0, 1) — callers pass a fraction < 1.
template <typename UT>
static __forceinline long double __kmp_pow(long double x, UT y) {
  long double s = 1.0L;

  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
  // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
  while (y) {
    if (y & 1)
      s *= x;
    x *= x;
    y >>= 1;
  }
  return s;
}

/* Computes and returns the number of unassigned iterations after idx chunks
   have been assigned (the total number of unassigned iterations in chunks with
   index greater than or equal to idx), i.e. ceil(tc * base^idx).
   __forceinline seems to be broken so that if we __forceinline this function,
   the behavior is wrong (one of the unit tests,
   sch_guided_analytical_basic.cpp, fails) */
template <typename T>
static __inline typename traits_t<T>::unsigned_t
__kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
                                typename traits_t<T>::unsigned_t idx) {
  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at least for
     ICL 8.1, long double arithmetic may not really have long double precision,
     even with /Qlong_double. Currently, we workaround that in the caller code,
     by manipulating the FPCW for Windows* OS on IA-32 architecture. The lack
     of precision is not expected to be a correctness issue, though. */
  typedef typename traits_t<T>::unsigned_t UT;

  long double x = tc * __kmp_pow<UT>(base, idx);
  UT r = (UT)x;
  // Round up unless x is already an exact integer.
  if (x == r)
    return r;
  return r + 1;
}

// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
//   p3 = 1 / ( n * nproc )         // remaining iterations multiplier
// by default n = 2. For example with n = 3 the chunks distribution will be
// more flat.
// With n = 1 first chunk is the same as for static schedule, e.g. trip / nproc.
519 static int guided_int_param = 2; 520 static double guided_flt_param = 0.5; // = 1.0 / guided_int_param; 521 522 // UT - unsigned flavor of T, ST - signed flavor of T, 523 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8 524 template <typename T> 525 static void 526 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb, 527 T ub, typename traits_t<T>::signed_t st, 528 typename traits_t<T>::signed_t chunk, int push_ws) { 529 typedef typename traits_t<T>::unsigned_t UT; 530 typedef typename traits_t<T>::signed_t ST; 531 typedef typename traits_t<T>::floating_t DBL; 532 533 int active; 534 T tc; 535 kmp_info_t *th; 536 kmp_team_t *team; 537 kmp_uint32 my_buffer_index; 538 dispatch_private_info_template<T> *pr; 539 dispatch_shared_info_template<UT> volatile *sh; 540 541 KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) == 542 sizeof(dispatch_private_info)); 543 KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) == 544 sizeof(dispatch_shared_info)); 545 546 if (!TCR_4(__kmp_init_parallel)) 547 __kmp_parallel_initialize(); 548 549 #if INCLUDE_SSC_MARKS 550 SSC_MARK_DISPATCH_INIT(); 551 #endif 552 #ifdef KMP_DEBUG 553 { 554 const char *buff; 555 // create format specifiers before the debug output 556 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d " 557 "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n", 558 traits_t<ST>::spec, traits_t<T>::spec, 559 traits_t<T>::spec, traits_t<ST>::spec); 560 KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st)); 561 __kmp_str_free(&buff); 562 } 563 #endif 564 /* setup data */ 565 th = __kmp_threads[gtid]; 566 team = th->th.th_team; 567 active = !team->t.t_serialized; 568 th->th.th_ident = loc; 569 570 #if USE_ITT_BUILD 571 kmp_uint64 cur_chunk = chunk; 572 int itt_need_metadata_reporting = __itt_metadata_add_ptr && 573 __kmp_forkjoin_frames_mode == 3 && 574 KMP_MASTER_GTID(gtid) && 575 #if OMP_40_ENABLED 576 th->th.th_teams_microtask == NULL && 577 #endif 578 
team->t.t_active_level == 1; 579 #endif 580 if (!active) { 581 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 582 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 583 } else { 584 KMP_DEBUG_ASSERT(th->th.th_dispatch == 585 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 586 587 my_buffer_index = th->th.th_dispatch->th_disp_index++; 588 589 /* What happens when number of threads changes, need to resize buffer? */ 590 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 591 &th->th.th_dispatch 592 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 593 sh = reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 594 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 595 } 596 597 #if (KMP_STATIC_STEAL_ENABLED) 598 if (SCHEDULE_HAS_NONMONOTONIC(schedule)) 599 // AC: we now have only one implementation of stealing, so use it 600 schedule = kmp_sch_static_steal; 601 else 602 #endif 603 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 604 605 /* Pick up the nomerge/ordered bits from the scheduling type */ 606 if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) { 607 pr->nomerge = TRUE; 608 schedule = 609 (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); 610 } else { 611 pr->nomerge = FALSE; 612 } 613 pr->type_size = traits_t<T>::type_size; // remember the size of variables 614 if (kmp_ord_lower & schedule) { 615 pr->ordered = TRUE; 616 schedule = 617 (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); 618 } else { 619 pr->ordered = FALSE; 620 } 621 622 if (schedule == kmp_sch_static) { 623 schedule = __kmp_static; 624 } else { 625 if (schedule == kmp_sch_runtime) { 626 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if 627 // not specified) 628 schedule = team->t.t_sched.r_sched_type; 629 // Detail the schedule if needed (global controls are differentiated 630 // appropriately) 631 if (schedule == kmp_sch_guided_chunked) { 
632 schedule = __kmp_guided; 633 } else if (schedule == kmp_sch_static) { 634 schedule = __kmp_static; 635 } 636 // Use the chunk size specified by OMP_SCHEDULE (or default if not 637 // specified) 638 chunk = team->t.t_sched.chunk; 639 #if USE_ITT_BUILD 640 cur_chunk = chunk; 641 #endif 642 #ifdef KMP_DEBUG 643 { 644 const char *buff; 645 // create format specifiers before the debug output 646 buff = __kmp_str_format( 647 "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n", 648 traits_t<ST>::spec); 649 KD_TRACE(10, (buff, gtid, schedule, chunk)); 650 __kmp_str_free(&buff); 651 } 652 #endif 653 } else { 654 if (schedule == kmp_sch_guided_chunked) { 655 schedule = __kmp_guided; 656 } 657 if (chunk <= 0) { 658 chunk = KMP_DEFAULT_CHUNK; 659 } 660 } 661 662 if (schedule == kmp_sch_auto) { 663 // mapping and differentiation: in the __kmp_do_serial_initialize() 664 schedule = __kmp_auto; 665 #ifdef KMP_DEBUG 666 { 667 const char *buff; 668 // create format specifiers before the debug output 669 buff = __kmp_str_format("__kmp_dispatch_init: kmp_sch_auto: T#%%d new: " 670 "schedule:%%d chunk:%%%s\n", 671 traits_t<ST>::spec); 672 KD_TRACE(10, (buff, gtid, schedule, chunk)); 673 __kmp_str_free(&buff); 674 } 675 #endif 676 } 677 678 /* guided analytical not safe for too many threads */ 679 if (schedule == kmp_sch_guided_analytical_chunked && 680 th->th.th_team_nproc > 1 << 20) { 681 schedule = kmp_sch_guided_iterative_chunked; 682 KMP_WARNING(DispatchManyThreads); 683 } 684 if (schedule == kmp_sch_runtime_simd) { 685 // compiler provides simd_width in the chunk parameter 686 schedule = team->t.t_sched.r_sched_type; 687 // Detail the schedule if needed (global controls are differentiated 688 // appropriately) 689 if (schedule == kmp_sch_static || schedule == kmp_sch_auto || 690 schedule == __kmp_static) { 691 schedule = kmp_sch_static_balanced_chunked; 692 } else { 693 if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) { 694 schedule = 
kmp_sch_guided_simd; 695 } 696 chunk = team->t.t_sched.chunk * chunk; 697 } 698 #if USE_ITT_BUILD 699 cur_chunk = chunk; 700 #endif 701 #ifdef KMP_DEBUG 702 { 703 const char *buff; 704 // create format specifiers before the debug output 705 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d" 706 " chunk:%%%s\n", 707 traits_t<ST>::spec); 708 KD_TRACE(10, (buff, gtid, schedule, chunk)); 709 __kmp_str_free(&buff); 710 } 711 #endif 712 } 713 pr->u.p.parm1 = chunk; 714 } 715 KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper), 716 "unknown scheduling type"); 717 718 pr->u.p.count = 0; 719 720 if (__kmp_env_consistency_check) { 721 if (st == 0) { 722 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, 723 (pr->ordered ? ct_pdo_ordered : ct_pdo), loc); 724 } 725 } 726 // compute trip count 727 if (st == 1) { // most common case 728 if (ub >= lb) { 729 tc = ub - lb + 1; 730 } else { // ub < lb 731 tc = 0; // zero-trip 732 } 733 } else if (st < 0) { 734 if (lb >= ub) { 735 // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B), 736 // where the division needs to be unsigned regardless of the result type 737 tc = (UT)(lb - ub) / (-st) + 1; 738 } else { // lb < ub 739 tc = 0; // zero-trip 740 } 741 } else { // st > 0 742 if (ub >= lb) { 743 // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B), 744 // where the division needs to be unsigned regardless of the result type 745 tc = (UT)(ub - lb) / st + 1; 746 } else { // ub < lb 747 tc = 0; // zero-trip 748 } 749 } 750 751 // Any half-decent optimizer will remove this test when the blocks are empty 752 // since the macros expand to nothing when statistics are disabled. 
753 if (schedule == __kmp_static) { 754 KMP_COUNT_BLOCK(OMP_FOR_static); 755 KMP_COUNT_VALUE(FOR_static_iterations, tc); 756 } else { 757 KMP_COUNT_BLOCK(OMP_FOR_dynamic); 758 KMP_COUNT_VALUE(FOR_dynamic_iterations, tc); 759 } 760 761 pr->u.p.lb = lb; 762 pr->u.p.ub = ub; 763 pr->u.p.st = st; 764 pr->u.p.tc = tc; 765 766 #if KMP_OS_WINDOWS 767 pr->u.p.last_upper = ub + st; 768 #endif /* KMP_OS_WINDOWS */ 769 770 /* NOTE: only the active parallel region(s) has active ordered sections */ 771 772 if (active) { 773 if (pr->ordered == 0) { 774 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error; 775 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error; 776 } else { 777 pr->ordered_bumped = 0; 778 779 pr->u.p.ordered_lower = 1; 780 pr->u.p.ordered_upper = 0; 781 782 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>; 783 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>; 784 } 785 } 786 787 if (__kmp_env_consistency_check) { 788 enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo; 789 if (push_ws) { 790 __kmp_push_workshare(gtid, ws, loc); 791 pr->pushed_ws = ws; 792 } else { 793 __kmp_check_workshare(gtid, ws, loc); 794 pr->pushed_ws = ct_none; 795 } 796 } 797 798 switch (schedule) { 799 #if (KMP_STATIC_STEAL_ENABLED) 800 case kmp_sch_static_steal: { 801 T nproc = th->th.th_team_nproc; 802 T ntc, init; 803 804 KD_TRACE(100, 805 ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid)); 806 807 ntc = (tc % chunk ? 1 : 0) + tc / chunk; 808 if (nproc > 1 && ntc >= nproc) { 809 KMP_COUNT_BLOCK(OMP_FOR_static_steal); 810 T id = __kmp_tid_from_gtid(gtid); 811 T small_chunk, extras; 812 813 small_chunk = ntc / nproc; 814 extras = ntc % nproc; 815 816 init = id * small_chunk + (id < extras ? id : extras); 817 pr->u.p.count = init; 818 pr->u.p.ub = init + small_chunk + (id < extras ? 
1 : 0); 819 820 pr->u.p.parm2 = lb; 821 // pr->pfields.parm3 = 0; // it's not used in static_steal 822 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid 823 pr->u.p.st = st; 824 if (traits_t<T>::type_size > 4) { 825 // AC: TODO: check if 16-byte CAS available and use it to 826 // improve performance (probably wait for explicit request 827 // before spending time on this). 828 // For now use dynamically allocated per-thread lock, 829 // free memory in __kmp_dispatch_next when status==0. 830 KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL); 831 th->th.th_dispatch->th_steal_lock = 832 (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t)); 833 __kmp_init_lock(th->th.th_dispatch->th_steal_lock); 834 } 835 break; 836 } else { 837 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to " 838 "kmp_sch_static_balanced\n", 839 gtid)); 840 schedule = kmp_sch_static_balanced; 841 /* too few iterations: fall-through to kmp_sch_static_balanced */ 842 } // if 843 /* FALL-THROUGH to static balanced */ 844 } // case 845 #endif 846 case kmp_sch_static_balanced: { 847 T nproc = th->th.th_team_nproc; 848 T init, limit; 849 850 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n", 851 gtid)); 852 853 if (nproc > 1) { 854 T id = __kmp_tid_from_gtid(gtid); 855 856 if (tc < nproc) { 857 if (id < tc) { 858 init = id; 859 limit = id; 860 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ 861 } else { 862 pr->u.p.count = 1; /* means no more chunks to execute */ 863 pr->u.p.parm1 = FALSE; 864 break; 865 } 866 } else { 867 T small_chunk = tc / nproc; 868 T extras = tc % nproc; 869 init = id * small_chunk + (id < extras ? id : extras); 870 limit = init + small_chunk - (id < extras ? 
0 : 1); 871 pr->u.p.parm1 = (id == nproc - 1); 872 } 873 } else { 874 if (tc > 0) { 875 init = 0; 876 limit = tc - 1; 877 pr->u.p.parm1 = TRUE; 878 } else { // zero trip count 879 pr->u.p.count = 1; /* means no more chunks to execute */ 880 pr->u.p.parm1 = FALSE; 881 break; 882 } 883 } 884 #if USE_ITT_BUILD 885 // Calculate chunk for metadata report 886 if (itt_need_metadata_reporting) 887 cur_chunk = limit - init + 1; 888 #endif 889 if (st == 1) { 890 pr->u.p.lb = lb + init; 891 pr->u.p.ub = lb + limit; 892 } else { 893 // calculated upper bound, "ub" is user-defined upper bound 894 T ub_tmp = lb + limit * st; 895 pr->u.p.lb = lb + init * st; 896 // adjust upper bound to "ub" if needed, so that MS lastprivate will match 897 // it exactly 898 if (st > 0) { 899 pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp); 900 } else { 901 pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp); 902 } 903 } 904 if (pr->ordered) { 905 pr->u.p.ordered_lower = init; 906 pr->u.p.ordered_upper = limit; 907 } 908 break; 909 } // case 910 case kmp_sch_static_balanced_chunked: { 911 // similar to balanced, but chunk adjusted to multiple of simd width 912 T nth = th->th.th_team_nproc; 913 KD_TRACE(100, ("__kmp_dispatch_init: T#%d runtime(simd:static)" 914 " -> falling-through to static_greedy\n", 915 gtid)); 916 schedule = kmp_sch_static_greedy; 917 if (nth > 1) 918 pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1); 919 else 920 pr->u.p.parm1 = tc; 921 break; 922 } // case 923 case kmp_sch_guided_iterative_chunked: 924 case kmp_sch_guided_simd: { 925 T nproc = th->th.th_team_nproc; 926 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked" 927 " case\n", 928 gtid)); 929 930 if (nproc > 1) { 931 if ((2L * chunk + 1) * nproc >= tc) { 932 /* chunk size too large, switch to dynamic */ 933 schedule = kmp_sch_dynamic_chunked; 934 } else { 935 // when remaining iters become less than parm2 - switch to dynamic 936 pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1); 
937 *(double *)&pr->u.p.parm3 = 938 guided_flt_param / nproc; // may occupy parm3 and parm4 939 } 940 } else { 941 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to " 942 "kmp_sch_static_greedy\n", 943 gtid)); 944 schedule = kmp_sch_static_greedy; 945 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 946 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n", 947 gtid)); 948 pr->u.p.parm1 = tc; 949 } // if 950 } // case 951 break; 952 case kmp_sch_guided_analytical_chunked: { 953 T nproc = th->th.th_team_nproc; 954 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked" 955 " case\n", 956 gtid)); 957 if (nproc > 1) { 958 if ((2L * chunk + 1) * nproc >= tc) { 959 /* chunk size too large, switch to dynamic */ 960 schedule = kmp_sch_dynamic_chunked; 961 } else { 962 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 963 DBL x; 964 965 #if KMP_OS_WINDOWS && KMP_ARCH_X86 966 /* Linux* OS already has 64-bit computation by default for long double, 967 and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On 968 Windows* OS on IA-32 architecture, we need to set precision to 64-bit 969 instead of the default 53-bit. Even though long double doesn't work 970 on Windows* OS on Intel(R) 64, the resulting lack of precision is not 971 expected to impact the correctness of the algorithm, but this has not 972 been mathematically proven. 
*/ 973 // save original FPCW and set precision to 64-bit, as 974 // Windows* OS on IA-32 architecture defaults to 53-bit 975 unsigned int oldFpcw = _control87(0, 0); 976 _control87(_PC_64, _MCW_PC); // 0,0x30000 977 #endif 978 /* value used for comparison in solver for cross-over point */ 979 long double target = ((long double)chunk * 2 + 1) * nproc / tc; 980 981 /* crossover point--chunk indexes equal to or greater than 982 this point switch to dynamic-style scheduling */ 983 UT cross; 984 985 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 986 x = (long double)1.0 - (long double)0.5 / nproc; 987 988 #ifdef KMP_DEBUG 989 { // test natural alignment 990 struct _test_a { 991 char a; 992 union { 993 char b; 994 DBL d; 995 }; 996 } t; 997 ptrdiff_t natural_alignment = 998 (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; 999 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long 1000 // long)natural_alignment ); 1001 KMP_DEBUG_ASSERT( 1002 (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0); 1003 } 1004 #endif // KMP_DEBUG 1005 1006 /* save the term in thread private dispatch structure */ 1007 *(DBL *)&pr->u.p.parm3 = x; 1008 1009 /* solve for the crossover point to the nearest integer i for which C_i 1010 <= chunk */ 1011 { 1012 UT left, right, mid; 1013 long double p; 1014 1015 /* estimate initial upper and lower bound */ 1016 1017 /* doesn't matter what value right is as long as it is positive, but 1018 it affects performance of the solver */ 1019 right = 229; 1020 p = __kmp_pow<UT>(x, right); 1021 if (p > target) { 1022 do { 1023 p *= p; 1024 right <<= 1; 1025 } while (p > target && right < (1 << 27)); 1026 /* lower bound is previous (failed) estimate of upper bound */ 1027 left = right >> 1; 1028 } else { 1029 left = 0; 1030 } 1031 1032 /* bisection root-finding method */ 1033 while (left + 1 < right) { 1034 mid = (left + right) / 2; 1035 if (__kmp_pow<UT>(x, mid) > target) { 1036 left = mid; 1037 } else { 1038 right = mid; 1039 } 1040 } // 
while 1041 cross = right; 1042 } 1043 /* assert sanity of computed crossover point */ 1044 KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target && 1045 __kmp_pow<UT>(x, cross) <= target); 1046 1047 /* save the crossover point in thread private dispatch structure */ 1048 pr->u.p.parm2 = cross; 1049 1050 // C75803 1051 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8)) 1052 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3) 1053 #else 1054 #define GUIDED_ANALYTICAL_WORKAROUND (x) 1055 #endif 1056 /* dynamic-style scheduling offset */ 1057 pr->u.p.count = tc - __kmp_dispatch_guided_remaining( 1058 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - 1059 cross * chunk; 1060 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1061 // restore FPCW 1062 _control87(oldFpcw, _MCW_PC); 1063 #endif 1064 } // if 1065 } else { 1066 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to " 1067 "kmp_sch_static_greedy\n", 1068 gtid)); 1069 schedule = kmp_sch_static_greedy; 1070 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 1071 pr->u.p.parm1 = tc; 1072 } // if 1073 } // case 1074 break; 1075 case kmp_sch_static_greedy: 1076 KD_TRACE(100, 1077 ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n", gtid)); 1078 pr->u.p.parm1 = (th->th.th_team_nproc > 1) 1079 ? 
(tc + th->th.th_team_nproc - 1) / th->th.th_team_nproc 1080 : tc; 1081 break; 1082 case kmp_sch_static_chunked: 1083 case kmp_sch_dynamic_chunked: 1084 if (pr->u.p.parm1 <= 0) { 1085 pr->u.p.parm1 = KMP_DEFAULT_CHUNK; 1086 } 1087 KD_TRACE(100, ("__kmp_dispatch_init: T#%d " 1088 "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", 1089 gtid)); 1090 break; 1091 case kmp_sch_trapezoidal: { 1092 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ 1093 1094 T parm1, parm2, parm3, parm4; 1095 KD_TRACE(100, 1096 ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid)); 1097 1098 parm1 = chunk; 1099 1100 /* F : size of the first cycle */ 1101 parm2 = (tc / (2 * th->th.th_team_nproc)); 1102 1103 if (parm2 < 1) { 1104 parm2 = 1; 1105 } 1106 1107 /* L : size of the last cycle. Make sure the last cycle is not larger 1108 than the first cycle. */ 1109 if (parm1 < 1) { 1110 parm1 = 1; 1111 } else if (parm1 > parm2) { 1112 parm1 = parm2; 1113 } 1114 1115 /* N : number of cycles */ 1116 parm3 = (parm2 + parm1); 1117 parm3 = (2 * tc + parm3 - 1) / parm3; 1118 1119 if (parm3 < 2) { 1120 parm3 = 2; 1121 } 1122 1123 /* sigma : decreasing incr of the trapezoid */ 1124 parm4 = (parm3 - 1); 1125 parm4 = (parm2 - parm1) / parm4; 1126 1127 // pointless check, because parm4 >= 0 always 1128 // if ( parm4 < 0 ) { 1129 // parm4 = 0; 1130 //} 1131 1132 pr->u.p.parm1 = parm1; 1133 pr->u.p.parm2 = parm2; 1134 pr->u.p.parm3 = parm3; 1135 pr->u.p.parm4 = parm4; 1136 } // case 1137 break; 1138 1139 default: { 1140 __kmp_msg(kmp_ms_fatal, // Severity 1141 KMP_MSG(UnknownSchedTypeDetected), // Primary message 1142 KMP_HNT(GetNewerLibrary), // Hint 1143 __kmp_msg_null // Variadic argument list terminator 1144 ); 1145 } break; 1146 } // switch 1147 pr->schedule = schedule; 1148 if (active) { 1149 /* The name of this buffer should be my_buffer_index when it's free to use 1150 * it */ 1151 1152 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d " 1153 
"sh->buffer_index:%d\n", 1154 gtid, my_buffer_index, sh->buffer_index)); 1155 __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index, 1156 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL)); 1157 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and 1158 // my_buffer_index are *always* 32-bit integers. 1159 KMP_MB(); /* is this necessary? */ 1160 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d " 1161 "sh->buffer_index:%d\n", 1162 gtid, my_buffer_index, sh->buffer_index)); 1163 1164 th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr; 1165 th->th.th_dispatch->th_dispatch_sh_current = 1166 RCAST(dispatch_shared_info_t *, 1167 CCAST(dispatch_shared_info_template<UT> *, sh)); 1168 #if USE_ITT_BUILD 1169 if (pr->ordered) { 1170 __kmp_itt_ordered_init(gtid); 1171 }; // if 1172 // Report loop metadata 1173 if (itt_need_metadata_reporting) { 1174 // Only report metadata by master of active team at level 1 1175 kmp_uint64 schedtype = 0; 1176 switch (schedule) { 1177 case kmp_sch_static_chunked: 1178 case kmp_sch_static_balanced: // Chunk is calculated in the switch above 1179 break; 1180 case kmp_sch_static_greedy: 1181 cur_chunk = pr->u.p.parm1; 1182 break; 1183 case kmp_sch_dynamic_chunked: 1184 schedtype = 1; 1185 break; 1186 case kmp_sch_guided_iterative_chunked: 1187 case kmp_sch_guided_analytical_chunked: 1188 case kmp_sch_guided_simd: 1189 schedtype = 2; 1190 break; 1191 default: 1192 // Should we put this case under "static"? 
1193 // case kmp_sch_static_steal: 1194 schedtype = 3; 1195 break; 1196 } 1197 __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk); 1198 } 1199 #endif /* USE_ITT_BUILD */ 1200 }; // if 1201 1202 #ifdef KMP_DEBUG 1203 { 1204 const char *buff; 1205 // create format specifiers before the debug output 1206 buff = __kmp_str_format( 1207 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s " 1208 "lb:%%%s ub:%%%s" 1209 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" 1210 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n", 1211 traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec, 1212 traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec, 1213 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec, 1214 traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec); 1215 KD_TRACE(10, (buff, gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub, 1216 pr->u.p.st, pr->u.p.tc, pr->u.p.count, pr->u.p.ordered_lower, 1217 pr->u.p.ordered_upper, pr->u.p.parm1, pr->u.p.parm2, 1218 pr->u.p.parm3, pr->u.p.parm4)); 1219 __kmp_str_free(&buff); 1220 } 1221 #endif 1222 #if (KMP_STATIC_STEAL_ENABLED) 1223 // It cannot be guaranteed that after execution of a loop with some other 1224 // schedule kind all the parm3 variables will contain the same value. Even if 1225 // all parm3 will be the same, it still exists a bad case like using 0 and 1 1226 // rather than program life-time increment. So the dedicated variable is 1227 // required. The 'static_steal_counter' is used. 1228 if (schedule == kmp_sch_static_steal) { 1229 // Other threads will inspect this variable when searching for a victim. 1230 // This is a flag showing that other threads may steal from this thread 1231 // since then. 
volatile T *p = &pr->u.p.static_steal_counter;
    *p = *p + 1; // advertise this buffer as ready to be stolen from
  }
#endif // ( KMP_STATIC_STEAL_ENABLED )

#if OMPT_SUPPORT && OMPT_TRACE
  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
    ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
        team_info->parallel_id, task_info->task_id, team_info->microtask);
  }
#endif
}

/* For ordered loops, either __kmp_dispatch_finish() should be called after
 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
 * every chunk of iterations. If the ordered section(s) were not executed
 * for this iteration (or every iteration in this chunk), we need to set the
 * ordered iteration counters so that the next thread can proceed.
 *
 * gtid - global thread id of the calling thread
 * loc  - source location (unused here; kept for interface symmetry)
 */
template <typename UT>
static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
  // Nothing to synchronize when the region is serialized.
  if (!th->th.th_team->t.t_serialized) {

    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    if (pr->ordered_bumped) {
      // The ordered section already bumped the shared counter for this
      // iteration; just clear the flag instead of bumping again.
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      UT lower = pr->u.p.ordered_lower;

#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      // Wait until the shared ordered counter reaches this thread's lower
      // bound, i.e. all preceding ordered iterations have completed.
      __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
                           __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
      KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      // Bump the shared counter so the next thread in ordered sequence may
      // proceed.
      test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
    } // if
  } // if
  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
}

#ifdef KMP_GOMP_COMPAT

/* Chunk-granularity variant of __kmp_dispatch_finish() used for GOMP
 * compatibility: advances the shared ordered counter by however many
 * iterations of the current chunk have not yet been accounted for. */
template <typename UT>
static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {
    // int cid;
    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
    UT lower = pr->u.p.ordered_lower;
    UT upper = pr->u.p.ordered_upper;
    UT inc = upper - lower + 1; // iteration count of the current chunk

    if (pr->ordered_bumped == inc) {
      // Every iteration of the chunk already bumped the counter via its
      // ordered section; nothing left to add.
      // NOTE(review): trace text says "__kmp_dispatch_finish" although this
      // is the _chunk variant.
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      // Subtract the iterations already accounted for by ordered sections.
      inc -= pr->ordered_bumped;

#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d before wait: "
            "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      // Wait until all ordered iterations preceding this chunk are done.
      __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
                           __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));

      KMP_MB(); /* is this necessary? */
      KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
                      "ordered_bumped to zero\n",
                      gtid));
      pr->ordered_bumped = 0;
//!!!!! TODO check if the inc should be unsigned, or signed???
#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d after wait: "
            "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
            traits_t<UT>::spec);
        KD_TRACE(1000,
                 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      // Advance the shared counter past the whole chunk in one shot.
      test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
    }
    // }
  }
  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
}

#endif /* KMP_GOMP_COMPAT */

/* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
   work), then tell OMPT the loop is over.
In some cases kmp_dispatch_fini()
   is not called. */
#if OMPT_SUPPORT && OMPT_TRACE
#define OMPT_LOOP_END                                                          \
  if (status == 0) {                                                           \
    if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_end)) {   \
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
      ompt_task_info_t *task_info = __ompt_get_taskinfo(0);                    \
      ompt_callbacks.ompt_callback(ompt_event_loop_end)(                       \
          team_info->parallel_id, task_info->task_id);                         \
    }                                                                          \
  }
#else
#define OMPT_LOOP_END // no-op
#endif

/* Hand out the next chunk of iterations for the loop being dispatched.
 * Returns nonzero and fills *p_lb/*p_ub (and *p_st if non-NULL) when work
 * remains; returns 0 when the loop is exhausted. *p_last is set when the
 * returned chunk is the last one. */
template <typename T>
static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
                               T *p_lb, T *p_ub,
                               typename traits_t<T>::signed_t *p_st) {

  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::floating_t DBL;

  // This is potentially slightly misleading, schedule(runtime) will appear
  // here even if the actual runtime schedule is static. (Which points out a
  // disadvantage of schedule(runtime): even when static scheduling is used it
  // costs more than a compile time choice to use static scheduling would.)
  KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling);

  int status;
  dispatch_private_info_template<T> *pr;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
#ifdef KMP_DEBUG
  {
    const char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_next: T#%%d called p_lb:%%%s "
                            "p_ub:%%%s p_st:%%%s p_last: %%p\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec);
    KD_TRACE(1000, (buff, gtid, *p_lb, *p_ub, p_st ?
*p_st : 0, p_last)); 1441 __kmp_str_free(&buff); 1442 } 1443 #endif 1444 1445 if (team->t.t_serialized) { 1446 /* NOTE: serialize this dispatch becase we are not at the active level */ 1447 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 1448 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 1449 KMP_DEBUG_ASSERT(pr); 1450 1451 if ((status = (pr->u.p.tc != 0)) == 0) { 1452 *p_lb = 0; 1453 *p_ub = 0; 1454 // if ( p_last != NULL ) 1455 // *p_last = 0; 1456 if (p_st != NULL) 1457 *p_st = 0; 1458 if (__kmp_env_consistency_check) { 1459 if (pr->pushed_ws != ct_none) { 1460 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 1461 } 1462 } 1463 } else if (pr->nomerge) { 1464 kmp_int32 last; 1465 T start; 1466 UT limit, trip, init; 1467 ST incr; 1468 T chunk = pr->u.p.parm1; 1469 1470 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", 1471 gtid)); 1472 1473 init = chunk * pr->u.p.count++; 1474 trip = pr->u.p.tc - 1; 1475 1476 if ((status = (init <= trip)) == 0) { 1477 *p_lb = 0; 1478 *p_ub = 0; 1479 // if ( p_last != NULL ) 1480 // *p_last = 0; 1481 if (p_st != NULL) 1482 *p_st = 0; 1483 if (__kmp_env_consistency_check) { 1484 if (pr->pushed_ws != ct_none) { 1485 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 1486 } 1487 } 1488 } else { 1489 start = pr->u.p.lb; 1490 limit = chunk + init - 1; 1491 incr = pr->u.p.st; 1492 1493 if ((last = (limit >= trip)) != 0) { 1494 limit = trip; 1495 #if KMP_OS_WINDOWS 1496 pr->u.p.last_upper = pr->u.p.ub; 1497 #endif /* KMP_OS_WINDOWS */ 1498 } 1499 if (p_last != NULL) 1500 *p_last = last; 1501 if (p_st != NULL) 1502 *p_st = incr; 1503 if (incr == 1) { 1504 *p_lb = start + init; 1505 *p_ub = start + limit; 1506 } else { 1507 *p_lb = start + init * incr; 1508 *p_ub = start + limit * incr; 1509 } 1510 1511 if (pr->ordered) { 1512 pr->u.p.ordered_lower = init; 1513 pr->u.p.ordered_upper = limit; 1514 #ifdef KMP_DEBUG 1515 { 1516 const char *buff; 1517 // create format 
specifiers before the debug output 1518 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 1519 "ordered_lower:%%%s ordered_upper:%%%s\n", 1520 traits_t<UT>::spec, traits_t<UT>::spec); 1521 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 1522 pr->u.p.ordered_upper)); 1523 __kmp_str_free(&buff); 1524 } 1525 #endif 1526 } // if 1527 } // if 1528 } else { 1529 pr->u.p.tc = 0; 1530 *p_lb = pr->u.p.lb; 1531 *p_ub = pr->u.p.ub; 1532 #if KMP_OS_WINDOWS 1533 pr->u.p.last_upper = *p_ub; 1534 #endif /* KMP_OS_WINDOWS */ 1535 if (p_last != NULL) 1536 *p_last = TRUE; 1537 if (p_st != NULL) 1538 *p_st = pr->u.p.st; 1539 } // if 1540 #ifdef KMP_DEBUG 1541 { 1542 const char *buff; 1543 // create format specifiers before the debug output 1544 buff = __kmp_str_format( 1545 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " 1546 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", 1547 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 1548 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status)); 1549 __kmp_str_free(&buff); 1550 } 1551 #endif 1552 #if INCLUDE_SSC_MARKS 1553 SSC_MARK_DISPATCH_NEXT(); 1554 #endif 1555 OMPT_LOOP_END; 1556 return status; 1557 } else { 1558 kmp_int32 last = 0; 1559 dispatch_shared_info_template<UT> *sh; 1560 T start; 1561 ST incr; 1562 UT limit, trip, init; 1563 1564 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1565 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1566 1567 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 1568 th->th.th_dispatch->th_dispatch_pr_current); 1569 KMP_DEBUG_ASSERT(pr); 1570 sh = reinterpret_cast<dispatch_shared_info_template<UT> *>( 1571 th->th.th_dispatch->th_dispatch_sh_current); 1572 KMP_DEBUG_ASSERT(sh); 1573 1574 if (pr->u.p.tc == 0) { 1575 // zero trip count 1576 status = 0; 1577 } else { 1578 switch (pr->schedule) { 1579 #if (KMP_STATIC_STEAL_ENABLED) 1580 case kmp_sch_static_steal: { 1581 T chunk = pr->u.p.parm1; 1582 int nproc = th->th.th_team_nproc; 1583 1584 
KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", 1585 gtid)); 1586 1587 trip = pr->u.p.tc - 1; 1588 1589 if (traits_t<T>::type_size > 4) { 1590 // use lock for 8-byte and CAS for 4-byte induction 1591 // variable. TODO (optional): check and use 16-byte CAS 1592 kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock; 1593 KMP_DEBUG_ASSERT(lck != NULL); 1594 if (pr->u.p.count < (UT)pr->u.p.ub) { 1595 __kmp_acquire_lock(lck, gtid); 1596 // try to get own chunk of iterations 1597 init = (pr->u.p.count)++; 1598 status = (init < (UT)pr->u.p.ub); 1599 __kmp_release_lock(lck, gtid); 1600 } else { 1601 status = 0; // no own chunks 1602 } 1603 if (!status) { // try to steal 1604 kmp_info_t **other_threads = team->t.t_threads; 1605 int while_limit = nproc; // nproc attempts to find a victim 1606 int while_index = 0; 1607 // TODO: algorithm of searching for a victim 1608 // should be cleaned up and measured 1609 while ((!status) && (while_limit != ++while_index)) { 1610 T remaining; 1611 T victimIdx = pr->u.p.parm4; 1612 T oldVictimIdx = victimIdx ? 
victimIdx - 1 : nproc - 1; 1613 dispatch_private_info_template<T> *victim = 1614 reinterpret_cast<dispatch_private_info_template<T> *>( 1615 other_threads[victimIdx] 1616 ->th.th_dispatch->th_dispatch_pr_current); 1617 while ((victim == NULL || victim == pr || 1618 (*(volatile T *)&victim->u.p.static_steal_counter != 1619 *(volatile T *)&pr->u.p.static_steal_counter)) && 1620 oldVictimIdx != victimIdx) { 1621 victimIdx = (victimIdx + 1) % nproc; 1622 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1623 other_threads[victimIdx] 1624 ->th.th_dispatch->th_dispatch_pr_current); 1625 }; 1626 if (!victim || 1627 (*(volatile T *)&victim->u.p.static_steal_counter != 1628 *(volatile T *)&pr->u.p.static_steal_counter)) { 1629 continue; // try once more (nproc attempts in total) 1630 // no victim is ready yet to participate in stealing 1631 // because all victims are still in kmp_init_dispatch 1632 } 1633 if (victim->u.p.count + 2 > (UT)victim->u.p.ub) { 1634 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid 1635 continue; // not enough chunks to steal, goto next victim 1636 } 1637 1638 lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock; 1639 KMP_ASSERT(lck != NULL); 1640 __kmp_acquire_lock(lck, gtid); 1641 limit = victim->u.p.ub; // keep initial ub 1642 if (victim->u.p.count >= limit || 1643 (remaining = limit - victim->u.p.count) < 2) { 1644 __kmp_release_lock(lck, gtid); 1645 pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim 1646 continue; // not enough chunks to steal 1647 } 1648 // stealing succeded, reduce victim's ub by 1/4 of undone chunks 1649 // or by 1 1650 if (remaining > 3) { 1651 KMP_COUNT_VALUE(FOR_static_steal_stolen, remaining >> 2); 1652 init = (victim->u.p.ub -= 1653 (remaining >> 2)); // steal 1/4 of remaining 1654 } else { 1655 KMP_COUNT_VALUE(FOR_static_steal_stolen, 1); 1656 init = 1657 (victim->u.p.ub -= 1); // steal 1 chunk of 2 or 3 remaining 1658 } 1659 __kmp_release_lock(lck, gtid); 1660 1661 
KMP_DEBUG_ASSERT(init + 1 <= limit); 1662 pr->u.p.parm4 = victimIdx; // remember victim to steal from 1663 status = 1; 1664 while_index = 0; 1665 // now update own count and ub with stolen range but init chunk 1666 __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid); 1667 pr->u.p.count = init + 1; 1668 pr->u.p.ub = limit; 1669 __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid); 1670 } // while (search for victim) 1671 } // if (try to find victim and steal) 1672 } else { 1673 // 4-byte induction variable, use 8-byte CAS for pair (count, ub) 1674 typedef union { 1675 struct { 1676 UT count; 1677 T ub; 1678 } p; 1679 kmp_int64 b; 1680 } union_i4; 1681 // All operations on 'count' or 'ub' must be combined atomically 1682 // together. 1683 { 1684 union_i4 vold, vnew; 1685 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 1686 vnew = vold; 1687 vnew.p.count++; 1688 while (!KMP_COMPARE_AND_STORE_ACQ64( 1689 (volatile kmp_int64 *)&pr->u.p.count, 1690 *VOLATILE_CAST(kmp_int64 *) & vold.b, 1691 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 1692 KMP_CPU_PAUSE(); 1693 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 1694 vnew = vold; 1695 vnew.p.count++; 1696 } 1697 vnew = vold; 1698 init = vnew.p.count; 1699 status = (init < (UT)vnew.p.ub); 1700 } 1701 1702 if (!status) { 1703 kmp_info_t **other_threads = team->t.t_threads; 1704 int while_limit = nproc; // nproc attempts to find a victim 1705 int while_index = 0; 1706 1707 // TODO: algorithm of searching for a victim 1708 // should be cleaned up and measured 1709 while ((!status) && (while_limit != ++while_index)) { 1710 union_i4 vold, vnew; 1711 kmp_int32 remaining; 1712 T victimIdx = pr->u.p.parm4; 1713 T oldVictimIdx = victimIdx ? 
victimIdx - 1 : nproc - 1; 1714 dispatch_private_info_template<T> *victim = 1715 reinterpret_cast<dispatch_private_info_template<T> *>( 1716 other_threads[victimIdx] 1717 ->th.th_dispatch->th_dispatch_pr_current); 1718 while ((victim == NULL || victim == pr || 1719 (*(volatile T *)&victim->u.p.static_steal_counter != 1720 *(volatile T *)&pr->u.p.static_steal_counter)) && 1721 oldVictimIdx != victimIdx) { 1722 victimIdx = (victimIdx + 1) % nproc; 1723 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1724 other_threads[victimIdx] 1725 ->th.th_dispatch->th_dispatch_pr_current); 1726 }; 1727 if (!victim || 1728 (*(volatile T *)&victim->u.p.static_steal_counter != 1729 *(volatile T *)&pr->u.p.static_steal_counter)) { 1730 continue; // try once more (nproc attempts in total) 1731 // no victim is ready yet to participate in stealing 1732 // because all victims are still in kmp_init_dispatch 1733 } 1734 pr->u.p.parm4 = victimIdx; // new victim found 1735 while (1) { // CAS loop if victim has enough chunks to steal 1736 vold.b = *(volatile kmp_int64 *)(&victim->u.p.count); 1737 vnew = vold; 1738 1739 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1740 if (vnew.p.count >= (UT)vnew.p.ub || 1741 (remaining = vnew.p.ub - vnew.p.count) < 2) { 1742 pr->u.p.parm4 = 1743 (victimIdx + 1) % nproc; // shift start victim id 1744 break; // not enough chunks to steal, goto next victim 1745 } 1746 if (remaining > 3) { 1747 vnew.p.ub -= (remaining >> 2); // try to steal 1/4 remaining 1748 } else { 1749 vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining 1750 } 1751 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1752 // TODO: Should this be acquire or release? 
1753 if (KMP_COMPARE_AND_STORE_ACQ64( 1754 (volatile kmp_int64 *)&victim->u.p.count, 1755 *VOLATILE_CAST(kmp_int64 *) & vold.b, 1756 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 1757 // stealing succeeded 1758 KMP_COUNT_VALUE(FOR_static_steal_stolen, 1759 vold.p.ub - vnew.p.ub); 1760 status = 1; 1761 while_index = 0; 1762 // now update own count and ub 1763 init = vnew.p.ub; 1764 vold.p.count = init + 1; 1765 #if KMP_ARCH_X86 1766 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), 1767 vold.b); 1768 #else 1769 *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b; 1770 #endif 1771 break; 1772 } // if (check CAS result) 1773 KMP_CPU_PAUSE(); // CAS failed, repeat attempt 1774 } // while (try to steal from particular victim) 1775 } // while (search for victim) 1776 } // if (try to find victim and steal) 1777 } // if (4-byte induction variable) 1778 if (!status) { 1779 *p_lb = 0; 1780 *p_ub = 0; 1781 if (p_st != NULL) 1782 *p_st = 0; 1783 } else { 1784 start = pr->u.p.parm2; 1785 init *= chunk; 1786 limit = chunk + init - 1; 1787 incr = pr->u.p.st; 1788 KMP_COUNT_VALUE(FOR_static_steal_chunks, 1); 1789 1790 KMP_DEBUG_ASSERT(init <= trip); 1791 if ((last = (limit >= trip)) != 0) 1792 limit = trip; 1793 if (p_st != NULL) 1794 *p_st = incr; 1795 1796 if (incr == 1) { 1797 *p_lb = start + init; 1798 *p_ub = start + limit; 1799 } else { 1800 *p_lb = start + init * incr; 1801 *p_ub = start + limit * incr; 1802 } 1803 1804 if (pr->ordered) { 1805 pr->u.p.ordered_lower = init; 1806 pr->u.p.ordered_upper = limit; 1807 #ifdef KMP_DEBUG 1808 { 1809 const char *buff; 1810 // create format specifiers before the debug output 1811 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 1812 "ordered_lower:%%%s ordered_upper:%%%s\n", 1813 traits_t<UT>::spec, traits_t<UT>::spec); 1814 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 1815 pr->u.p.ordered_upper)); 1816 __kmp_str_free(&buff); 1817 } 1818 #endif 1819 } // if 1820 } // if 1821 break; 1822 } // case 1823 #endif // ( 
KMP_STATIC_STEAL_ENABLED ) 1824 case kmp_sch_static_balanced: { 1825 KD_TRACE( 1826 100, 1827 ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid)); 1828 if ((status = !pr->u.p.count) != 1829 0) { /* check if thread has any iteration to do */ 1830 pr->u.p.count = 1; 1831 *p_lb = pr->u.p.lb; 1832 *p_ub = pr->u.p.ub; 1833 last = pr->u.p.parm1; 1834 if (p_st != NULL) 1835 *p_st = pr->u.p.st; 1836 } else { /* no iterations to do */ 1837 pr->u.p.lb = pr->u.p.ub + pr->u.p.st; 1838 } 1839 if (pr->ordered) { 1840 #ifdef KMP_DEBUG 1841 { 1842 const char *buff; 1843 // create format specifiers before the debug output 1844 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 1845 "ordered_lower:%%%s ordered_upper:%%%s\n", 1846 traits_t<UT>::spec, traits_t<UT>::spec); 1847 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 1848 pr->u.p.ordered_upper)); 1849 __kmp_str_free(&buff); 1850 } 1851 #endif 1852 } // if 1853 } // case 1854 break; 1855 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was 1856 merged here */ 1857 case kmp_sch_static_chunked: { 1858 T parm1; 1859 1860 KD_TRACE(100, ("__kmp_dispatch_next: T#%d " 1861 "kmp_sch_static_[affinity|chunked] case\n", 1862 gtid)); 1863 parm1 = pr->u.p.parm1; 1864 1865 trip = pr->u.p.tc - 1; 1866 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid)); 1867 1868 if ((status = (init <= trip)) != 0) { 1869 start = pr->u.p.lb; 1870 incr = pr->u.p.st; 1871 limit = parm1 + init - 1; 1872 1873 if ((last = (limit >= trip)) != 0) 1874 limit = trip; 1875 1876 if (p_st != NULL) 1877 *p_st = incr; 1878 1879 pr->u.p.count += th->th.th_team_nproc; 1880 1881 if (incr == 1) { 1882 *p_lb = start + init; 1883 *p_ub = start + limit; 1884 } else { 1885 *p_lb = start + init * incr; 1886 *p_ub = start + limit * incr; 1887 } 1888 1889 if (pr->ordered) { 1890 pr->u.p.ordered_lower = init; 1891 pr->u.p.ordered_upper = limit; 1892 #ifdef KMP_DEBUG 1893 { 1894 const char *buff; 1895 // create format specifiers before 
the debug output 1896 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 1897 "ordered_lower:%%%s ordered_upper:%%%s\n", 1898 traits_t<UT>::spec, traits_t<UT>::spec); 1899 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 1900 pr->u.p.ordered_upper)); 1901 __kmp_str_free(&buff); 1902 } 1903 #endif 1904 } // if 1905 } // if 1906 } // case 1907 break; 1908 1909 case kmp_sch_dynamic_chunked: { 1910 T chunk = pr->u.p.parm1; 1911 1912 KD_TRACE( 1913 100, 1914 ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid)); 1915 1916 init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 1917 trip = pr->u.p.tc - 1; 1918 1919 if ((status = (init <= trip)) == 0) { 1920 *p_lb = 0; 1921 *p_ub = 0; 1922 if (p_st != NULL) 1923 *p_st = 0; 1924 } else { 1925 start = pr->u.p.lb; 1926 limit = chunk + init - 1; 1927 incr = pr->u.p.st; 1928 1929 if ((last = (limit >= trip)) != 0) 1930 limit = trip; 1931 1932 if (p_st != NULL) 1933 *p_st = incr; 1934 1935 if (incr == 1) { 1936 *p_lb = start + init; 1937 *p_ub = start + limit; 1938 } else { 1939 *p_lb = start + init * incr; 1940 *p_ub = start + limit * incr; 1941 } 1942 1943 if (pr->ordered) { 1944 pr->u.p.ordered_lower = init; 1945 pr->u.p.ordered_upper = limit; 1946 #ifdef KMP_DEBUG 1947 { 1948 const char *buff; 1949 // create format specifiers before the debug output 1950 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 1951 "ordered_lower:%%%s ordered_upper:%%%s\n", 1952 traits_t<UT>::spec, traits_t<UT>::spec); 1953 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 1954 pr->u.p.ordered_upper)); 1955 __kmp_str_free(&buff); 1956 } 1957 #endif 1958 } // if 1959 } // if 1960 } // case 1961 break; 1962 1963 case kmp_sch_guided_iterative_chunked: { 1964 T chunkspec = pr->u.p.parm1; 1965 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked " 1966 "iterative case\n", 1967 gtid)); 1968 trip = pr->u.p.tc; 1969 // Start atomic part of calculations 1970 while (1) { 1971 ST remaining; // signed, 
because can be < 0 1972 init = sh->u.s.iteration; // shared value 1973 remaining = trip - init; 1974 if (remaining <= 0) { // AC: need to compare with 0 first 1975 // nothing to do, don't try atomic op 1976 status = 0; 1977 break; 1978 } 1979 if ((T)remaining < 1980 pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default 1981 // use dynamic-style shcedule 1982 // atomically inrement iterations, get old value 1983 init = test_then_add<ST>( 1984 RCAST(ST *, CCAST(UT *, &sh->u.s.iteration)), (ST)chunkspec); 1985 remaining = trip - init; 1986 if (remaining <= 0) { 1987 status = 0; // all iterations got by other threads 1988 } else { // got some iterations to work on 1989 status = 1; 1990 if ((T)remaining > chunkspec) { 1991 limit = init + chunkspec - 1; 1992 } else { 1993 last = 1; // the last chunk 1994 limit = init + remaining - 1; 1995 } // if 1996 } // if 1997 break; 1998 } // if 1999 limit = init + (UT)(remaining * 2000 *(double *)&pr->u.p.parm3); // divide by K*nproc 2001 if (compare_and_swap<ST>(RCAST(ST *, CCAST(UT *, &sh->u.s.iteration)), 2002 (ST)init, (ST)limit)) { 2003 // CAS was successful, chunk obtained 2004 status = 1; 2005 --limit; 2006 break; 2007 } // if 2008 } // while 2009 if (status != 0) { 2010 start = pr->u.p.lb; 2011 incr = pr->u.p.st; 2012 if (p_st != NULL) 2013 *p_st = incr; 2014 *p_lb = start + init * incr; 2015 *p_ub = start + limit * incr; 2016 if (pr->ordered) { 2017 pr->u.p.ordered_lower = init; 2018 pr->u.p.ordered_upper = limit; 2019 #ifdef KMP_DEBUG 2020 { 2021 const char *buff; 2022 // create format specifiers before the debug output 2023 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 2024 "ordered_lower:%%%s ordered_upper:%%%s\n", 2025 traits_t<UT>::spec, traits_t<UT>::spec); 2026 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 2027 pr->u.p.ordered_upper)); 2028 __kmp_str_free(&buff); 2029 } 2030 #endif 2031 } // if 2032 } else { 2033 *p_lb = 0; 2034 *p_ub = 0; 2035 if (p_st != NULL) 2036 *p_st = 0; 2037 } // if 
2038 } // case 2039 break; 2040 2041 case kmp_sch_guided_simd: { 2042 // same as iterative but curr-chunk adjusted to be multiple of given 2043 // chunk 2044 T chunk = pr->u.p.parm1; 2045 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_simd case\n", 2046 gtid)); 2047 trip = pr->u.p.tc; 2048 // Start atomic part of calculations 2049 while (1) { 2050 ST remaining; // signed, because can be < 0 2051 init = sh->u.s.iteration; // shared value 2052 remaining = trip - init; 2053 if (remaining <= 0) { // AC: need to compare with 0 first 2054 status = 0; // nothing to do, don't try atomic op 2055 break; 2056 } 2057 KMP_DEBUG_ASSERT(init % chunk == 0); 2058 // compare with K*nproc*(chunk+1), K=2 by default 2059 if ((T)remaining < pr->u.p.parm2) { 2060 // use dynamic-style shcedule 2061 // atomically inrement iterations, get old value 2062 init = test_then_add<ST>( 2063 RCAST(ST *, CCAST(UT *, &sh->u.s.iteration)), (ST)chunk); 2064 remaining = trip - init; 2065 if (remaining <= 0) { 2066 status = 0; // all iterations got by other threads 2067 } else { 2068 // got some iterations to work on 2069 status = 1; 2070 if ((T)remaining > chunk) { 2071 limit = init + chunk - 1; 2072 } else { 2073 last = 1; // the last chunk 2074 limit = init + remaining - 1; 2075 } // if 2076 } // if 2077 break; 2078 } // if 2079 // divide by K*nproc 2080 UT span = remaining * (*(double *)&pr->u.p.parm3); 2081 UT rem = span % chunk; 2082 if (rem) // adjust so that span%chunk == 0 2083 span += chunk - rem; 2084 limit = init + span; 2085 if (compare_and_swap<ST>(RCAST(ST *, CCAST(UT *, &sh->u.s.iteration)), 2086 (ST)init, (ST)limit)) { 2087 // CAS was successful, chunk obtained 2088 status = 1; 2089 --limit; 2090 break; 2091 } // if 2092 } // while 2093 if (status != 0) { 2094 start = pr->u.p.lb; 2095 incr = pr->u.p.st; 2096 if (p_st != NULL) 2097 *p_st = incr; 2098 *p_lb = start + init * incr; 2099 *p_ub = start + limit * incr; 2100 if (pr->ordered) { 2101 pr->u.p.ordered_lower = init; 2102 
pr->u.p.ordered_upper = limit; 2103 #ifdef KMP_DEBUG 2104 { 2105 const char *buff; 2106 // create format specifiers before the debug output 2107 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 2108 "ordered_lower:%%%s ordered_upper:%%%s\n", 2109 traits_t<UT>::spec, traits_t<UT>::spec); 2110 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 2111 pr->u.p.ordered_upper)); 2112 __kmp_str_free(&buff); 2113 } 2114 #endif 2115 } // if 2116 } else { 2117 *p_lb = 0; 2118 *p_ub = 0; 2119 if (p_st != NULL) 2120 *p_st = 0; 2121 } // if 2122 } // case 2123 break; 2124 2125 case kmp_sch_guided_analytical_chunked: { 2126 T chunkspec = pr->u.p.parm1; 2127 UT chunkIdx; 2128 #if KMP_OS_WINDOWS && KMP_ARCH_X86 2129 /* for storing original FPCW value for Windows* OS on 2130 IA-32 architecture 8-byte version */ 2131 unsigned int oldFpcw; 2132 unsigned int fpcwSet = 0; 2133 #endif 2134 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked " 2135 "analytical case\n", 2136 gtid)); 2137 2138 trip = pr->u.p.tc; 2139 2140 KMP_DEBUG_ASSERT(th->th.th_team_nproc > 1); 2141 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)th->th.th_team_nproc < 2142 trip); 2143 2144 while (1) { /* this while loop is a safeguard against unexpected zero 2145 chunk sizes */ 2146 chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 2147 if (chunkIdx >= (UT)pr->u.p.parm2) { 2148 --trip; 2149 /* use dynamic-style scheduling */ 2150 init = chunkIdx * chunkspec + pr->u.p.count; 2151 /* need to verify init > 0 in case of overflow in the above 2152 * calculation */ 2153 if ((status = (init > 0 && init <= trip)) != 0) { 2154 limit = init + chunkspec - 1; 2155 2156 if ((last = (limit >= trip)) != 0) 2157 limit = trip; 2158 } 2159 break; 2160 } else { 2161 /* use exponential-style scheduling */ 2162 /* The following check is to workaround the lack of long double precision on 2163 Windows* OS. 2164 This check works around the possible effect that init != 0 for chunkIdx == 0. 
2165 */ 2166 #if KMP_OS_WINDOWS && KMP_ARCH_X86 2167 /* If we haven't already done so, save original FPCW and set 2168 precision to 64-bit, as Windows* OS on IA-32 architecture 2169 defaults to 53-bit */ 2170 if (!fpcwSet) { 2171 oldFpcw = _control87(0, 0); 2172 _control87(_PC_64, _MCW_PC); 2173 fpcwSet = 0x30000; 2174 } 2175 #endif 2176 if (chunkIdx) { 2177 init = __kmp_dispatch_guided_remaining<T>( 2178 trip, *(DBL *)&pr->u.p.parm3, chunkIdx); 2179 KMP_DEBUG_ASSERT(init); 2180 init = trip - init; 2181 } else 2182 init = 0; 2183 limit = trip - __kmp_dispatch_guided_remaining<T>( 2184 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1); 2185 KMP_ASSERT(init <= limit); 2186 if (init < limit) { 2187 KMP_DEBUG_ASSERT(limit <= trip); 2188 --limit; 2189 status = 1; 2190 break; 2191 } // if 2192 } // if 2193 } // while (1) 2194 #if KMP_OS_WINDOWS && KMP_ARCH_X86 2195 /* restore FPCW if necessary 2196 AC: check fpcwSet flag first because oldFpcw can be uninitialized 2197 here */ 2198 if (fpcwSet && (oldFpcw & fpcwSet)) 2199 _control87(oldFpcw, _MCW_PC); 2200 #endif 2201 if (status != 0) { 2202 start = pr->u.p.lb; 2203 incr = pr->u.p.st; 2204 if (p_st != NULL) 2205 *p_st = incr; 2206 *p_lb = start + init * incr; 2207 *p_ub = start + limit * incr; 2208 if (pr->ordered) { 2209 pr->u.p.ordered_lower = init; 2210 pr->u.p.ordered_upper = limit; 2211 #ifdef KMP_DEBUG 2212 { 2213 const char *buff; 2214 // create format specifiers before the debug output 2215 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 2216 "ordered_lower:%%%s ordered_upper:%%%s\n", 2217 traits_t<UT>::spec, traits_t<UT>::spec); 2218 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 2219 pr->u.p.ordered_upper)); 2220 __kmp_str_free(&buff); 2221 } 2222 #endif 2223 } 2224 } else { 2225 *p_lb = 0; 2226 *p_ub = 0; 2227 if (p_st != NULL) 2228 *p_st = 0; 2229 } 2230 } // case 2231 break; 2232 2233 case kmp_sch_trapezoidal: { 2234 UT index; 2235 T parm2 = pr->u.p.parm2; 2236 T parm3 = pr->u.p.parm3; 2237 T parm4 = 
pr->u.p.parm4; 2238 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n", 2239 gtid)); 2240 2241 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration); 2242 2243 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2; 2244 trip = pr->u.p.tc - 1; 2245 2246 if ((status = ((T)index < parm3 && init <= trip)) == 0) { 2247 *p_lb = 0; 2248 *p_ub = 0; 2249 if (p_st != NULL) 2250 *p_st = 0; 2251 } else { 2252 start = pr->u.p.lb; 2253 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1; 2254 incr = pr->u.p.st; 2255 2256 if ((last = (limit >= trip)) != 0) 2257 limit = trip; 2258 2259 if (p_st != NULL) 2260 *p_st = incr; 2261 2262 if (incr == 1) { 2263 *p_lb = start + init; 2264 *p_ub = start + limit; 2265 } else { 2266 *p_lb = start + init * incr; 2267 *p_ub = start + limit * incr; 2268 } 2269 2270 if (pr->ordered) { 2271 pr->u.p.ordered_lower = init; 2272 pr->u.p.ordered_upper = limit; 2273 #ifdef KMP_DEBUG 2274 { 2275 const char *buff; 2276 // create format specifiers before the debug output 2277 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 2278 "ordered_lower:%%%s ordered_upper:%%%s\n", 2279 traits_t<UT>::spec, traits_t<UT>::spec); 2280 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 2281 pr->u.p.ordered_upper)); 2282 __kmp_str_free(&buff); 2283 } 2284 #endif 2285 } // if 2286 } // if 2287 } // case 2288 break; 2289 default: { 2290 status = 0; // to avoid complaints on uninitialized variable use 2291 __kmp_msg(kmp_ms_fatal, // Severity 2292 KMP_MSG(UnknownSchedTypeDetected), // Primary message 2293 KMP_HNT(GetNewerLibrary), // Hint 2294 __kmp_msg_null // Variadic argument list terminator 2295 ); 2296 } break; 2297 } // switch 2298 } // if tc == 0; 2299 2300 if (status == 0) { 2301 UT num_done; 2302 2303 num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done); 2304 #ifdef KMP_DEBUG 2305 { 2306 const char *buff; 2307 // create format specifiers before the debug output 2308 buff = __kmp_str_format( 2309 
"__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", 2310 traits_t<UT>::spec); 2311 KD_TRACE(100, (buff, gtid, sh->u.s.num_done)); 2312 __kmp_str_free(&buff); 2313 } 2314 #endif 2315 2316 if ((ST)num_done == th->th.th_team_nproc - 1) { 2317 #if (KMP_STATIC_STEAL_ENABLED) 2318 if (pr->schedule == kmp_sch_static_steal && 2319 traits_t<T>::type_size > 4) { 2320 int i; 2321 kmp_info_t **other_threads = team->t.t_threads; 2322 // loop complete, safe to destroy locks used for stealing 2323 for (i = 0; i < th->th.th_team_nproc; ++i) { 2324 kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock; 2325 KMP_ASSERT(lck != NULL); 2326 __kmp_destroy_lock(lck); 2327 __kmp_free(lck); 2328 other_threads[i]->th.th_dispatch->th_steal_lock = NULL; 2329 } 2330 } 2331 #endif 2332 /* NOTE: release this buffer to be reused */ 2333 2334 KMP_MB(); /* Flush all pending memory write invalidates. */ 2335 2336 sh->u.s.num_done = 0; 2337 sh->u.s.iteration = 0; 2338 2339 /* TODO replace with general release procedure? */ 2340 if (pr->ordered) { 2341 sh->u.s.ordered_iteration = 0; 2342 } 2343 2344 KMP_MB(); /* Flush all pending memory write invalidates. */ 2345 2346 sh->buffer_index += __kmp_dispatch_num_buffers; 2347 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", 2348 gtid, sh->buffer_index)); 2349 2350 KMP_MB(); /* Flush all pending memory write invalidates. 
*/

    } // if
    // Pop the workshare from the consistency-check stack, if it was pushed.
    if (__kmp_env_consistency_check) {
      if (pr->pushed_ws != ct_none) {
        pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
      }
    }

    // Loop is finished for this thread: detach the ordered enter/exit hooks
    // and the current shared/private dispatch buffers.
    th->th.th_dispatch->th_deo_fcn = NULL;
    th->th.th_dispatch->th_dxo_fcn = NULL;
    th->th.th_dispatch->th_dispatch_sh_current = NULL;
    th->th.th_dispatch->th_dispatch_pr_current = NULL;
  } // if (status == 0)
#if KMP_OS_WINDOWS
  else if (last) {
    // Remember the upper bound of the last chunk handed out (Windows only).
    pr->u.p.last_upper = pr->u.p.ub;
  }
#endif /* KMP_OS_WINDOWS */
  if (p_last != NULL && status != 0)
    *p_last = last;
} // if

#ifdef KMP_DEBUG
  {
    const char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_next: T#%%d normal case: "
        "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status));
    __kmp_str_free(&buff);
  }
#endif
#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_NEXT();
#endif
  OMPT_LOOP_END;
  return status;
}

// Compute the per-team iteration sub-space for a composite "distribute
// parallel for" construct: narrows [*plower, *pupper] to the chunk owned by
// the calling team and sets *plastiter if this team owns the last iteration.
// Called by the __kmpc_dist_dispatch_init_* entry points below, before the
// regular __kmp_dispatch_init.
//   loc       - source location (used for consistency-check diagnostics)
//   gtid      - global thread id
//   plastiter - out: nonzero if this team executes the last iteration
//   plower/pupper - in/out: global loop bounds, replaced by team-local bounds
//   incr      - loop increment (must be nonzero)
template <typename T>
static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
                                  kmp_int32 *plastiter, T *plower, T *pupper,
                                  typename traits_t<T>::signed_t incr) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  kmp_uint32 team_id;
  kmp_uint32 nteams;
  UT trip_count;
  kmp_team_t *team;
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
#ifdef KMP_DEBUG
  {
    const char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
                            "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<T>::spec);
    KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
    __kmp_str_free(&buff);
  }
#endif

  if (__kmp_env_consistency_check) {
    if (incr == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
                            loc);
    }
    if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
      // The loop is illegal.
      // Some zero-trip loops maintained by compiler, e.g.:
      //   for(i=10;i<0;++i) // lower >= upper - run-time check
      //   for(i=0;i>10;--i) // lower <= upper - run-time check
      //   for(i=0;i>10;++i) // incr > 0     - compile-time check
      //   for(i=10;i<0;--i) // incr < 0     - compile-time check
      // Compiler does not check the following illegal loops:
      //   for(i=0;i<10;i+=incr)  // where incr<0
      //   for(i=10;i>0;i-=incr)  // where incr<0
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
    }
  }
  th = __kmp_threads[gtid];
  team = th->th.th_team;
#if OMP_40_ENABLED
  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
  nteams = th->th.th_teams_size.nteams;
#endif
  // NOTE(review): nteams is assigned only under OMP_40_ENABLED; presumably
  // this function is only reachable in OMP 4.0+ builds (teams construct) --
  // confirm, otherwise nteams is read uninitialized below.
  team_id = team->t.t_master_tid;
  KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);

  // compute global trip count
  if (incr == 1) {
    trip_count = *pupper - *plower + 1;
  } else if (incr == -1) {
    trip_count = *plower - *pupper + 1;
  } else if (incr > 0) {
    // upper-lower can exceed the limit of signed type
    trip_count = (UT)(*pupper - *plower) / incr + 1;
  } else {
    trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
  }

  if (trip_count <= nteams) {
    KMP_DEBUG_ASSERT(
        __kmp_static == kmp_sch_static_greedy ||
        __kmp_static ==
            kmp_sch_static_balanced); // Unknown static scheduling type.
    // only some teams get single iteration, others get nothing
    if (team_id < trip_count) {
      *pupper = *plower = *plower + team_id * incr;
    } else {
      *plower = *pupper + incr; // zero-trip loop
    }
    if (plastiter != NULL)
      *plastiter = (team_id == trip_count - 1);
  } else {
    if (__kmp_static == kmp_sch_static_balanced) {
      // Balanced split: first `extras` teams get (chunk + 1) iterations.
      UT chunk = trip_count / nteams;
      UT extras = trip_count % nteams;
      *plower +=
          incr * (team_id * chunk + (team_id < extras ? team_id : extras));
      *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
      if (plastiter != NULL)
        *plastiter = (team_id == nteams - 1);
    } else {
      // Greedy split: every team gets ceil(trip_count / nteams) iterations;
      // trailing teams may land past the real bounds and are clipped below.
      T chunk_inc_count =
          (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
      T upper = *pupper;
      KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
      // Unknown static scheduling type.
      *plower += team_id * chunk_inc_count;
      *pupper = *plower + chunk_inc_count - incr;
      // Check/correct bounds if needed
      if (incr > 0) {
        if (*pupper < *plower)
          *pupper = traits_t<T>::max_value; // overflow wrapped; saturate
        if (plastiter != NULL)
          *plastiter = *plower <= upper && *pupper > upper - incr;
        if (*pupper > upper)
          *pupper = upper; // tracker C73258
      } else {
        if (*pupper > *plower)
          *pupper = traits_t<T>::min_value; // underflow wrapped; saturate
        if (plastiter != NULL)
          *plastiter = *plower >= upper && *pupper < upper - incr;
        if (*pupper < upper)
          *pupper = upper; // tracker C73258
      }
    }
  }
}

//-----------------------------------------------------------------------------
// Dispatch routines
// Transfer call to template< type T >
// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
//                      T lb, T ub, ST st, ST chunk )
extern "C" {

/*!
@ingroup WORK_SHARING
@{
@param loc Source location
@param gtid Global thread id
@param schedule Schedule type
@param lb Lower bound
@param ub Upper bound
@param st Step (or increment if you prefer)
@param chunk The chunk size to block with

This function prepares the runtime to start a dynamically scheduled for loop,
saving the loop arguments.
These functions are all identical apart from the types of the arguments.
*/

// 32-bit signed variant.
void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int32 lb,
                            kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
/*!
See @ref __kmpc_dispatch_init_4
*/
// 32-bit unsigned variant.
void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint32 lb,
                             kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4
*/
// 64-bit signed variant.
void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int64 lb,
                            kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4
*/
// 64-bit unsigned variant.
void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint64 lb,
                             kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4

Difference from __kmpc_dispatch_init set of functions is these functions
are called for composite distribute parallel for construct. Thus before
regular iterations dispatching we need to calc per-team iteration space.

These functions are all identical apart from the types of the arguments.
*/
void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
                                 kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  // First narrow the global bounds to this team's sub-space, then do the
  // regular dispatch initialization on the narrowed range.
  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
                                  kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
                                 kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
                                  kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
@param loc Source code location
@param gtid Global thread id
@param p_last Pointer to a flag set to one if this is the last chunk or zero
otherwise
@param p_lb   Pointer to the lower bound for the next chunk of work
@param p_ub   Pointer to the upper bound for the next chunk of work
@param p_st   Pointer to the stride for the next chunk of work
@return one if there is work to be done, zero otherwise

Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then the lb,ub and stride need not be modified.
*/
int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st);
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint32 *p_lb, kmp_uint32 *p_ub,
                            kmp_int32 *p_st) {
  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st);
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st);
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint64 *p_lb, kmp_uint64 *p_ub,
                            kmp_int64 *p_st) {
  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st);
}

/*!
@param loc Source code location
@param gtid Global thread id

Mark the end of a dynamic loop.
*/
void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
  // Iteration size (kmp_uint32) matches the _4 dispatch variants.
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}
/*! @} */

//-----------------------------------------------------------------------------
// Non-template routines from kmp_dispatch.cpp used in other sources

// Simple 32-bit unsigned comparison predicates; suitable as the `pred`
// callback of __kmp_wait_yield_4 below.
kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value == checker;
}

kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value != checker;
}

kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
  return value < checker;
}

kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
  return value >= checker;
}

kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
  return value <= checker;
}

// Spin until pred(*spinner, checker) becomes true, yielding according to the
// KMP_YIELD* policy while waiting; returns the observed value of *spinner
// that satisfied the predicate.
kmp_uint32
__kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
                   kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
                   void *obj // Higher-level synchronization object, or NULL.
                   ) {
  // note: we may not belong to a team at this point
  volatile kmp_uint32 *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
  kmp_uint32 r;

  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(r = TCR_4(*spin), check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* GEH - remove this since it was accidentally introduced when kmp_wait was
       split.
       It causes problems with infinite recursion because of exit lock */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
        __kmp_abort_thread(); */

    /* if we have waited a bit, or are oversubscribed, yield */
    /* pause is in the following code */
    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
    KMP_YIELD_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}

// Same spin-wait as __kmp_wait_yield_4, but the predicate receives the
// spinner pointer itself rather than its dereferenced value; no value is
// returned.
void __kmp_wait_yield_4_ptr(
    void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32),
    void *obj // Higher-level synchronization object, or NULL.
    ) {
  // note: we may not belong to a team at this point
  void *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(void *, kmp_uint32) = pred;

  KMP_FSYNC_SPIN_INIT(obj, spin);
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* if we have waited a bit, or are oversubscribed, yield */
    /* pause is in the following code */
    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
    KMP_YIELD_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
}

} // extern "C"

#ifdef KMP_GOMP_COMPAT

// GOMP-compatibility entry points: identical to the __kmpc_dispatch_init_*
// wrappers above except that the caller controls the push_ws flag passed to
// __kmp_dispatch_init.
void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int32 lb,
                               kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint32 lb,
                                kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int64 lb,
                               kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint64 lb,
                                kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

// GOMP-compatibility chunk-finish entry points; iteration size parameter
// matches the corresponding _4/_8 dispatch variants.
void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

#endif /* KMP_GOMP_COMPAT */

/* ------------------------------------------------------------------------ */