/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 */

//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//

/* Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
 *       it may change values between parallel regions.  __kmp_max_nth
 *       is the largest value __kmp_nth may take, 1 is the smallest.
 */

// Need to raise Win version from XP to Vista here for support of
// InterlockedExchange64
#if defined(_WIN32_WINNT) && defined(_M_IX86)
#undef _WIN32_WINNT
#define _WIN32_WINNT 0x0502
#endif

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
#include <float.h>
#endif

#if OMPT_SUPPORT
#include "ompt-internal.h"
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */

#if KMP_STATIC_STEAL_ENABLED

// Per-thread loop-dispatch bookkeeping (steal-enabled layout).
// Replaces the dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types.
// NOTE(review): field order differs from the non-steal variant below on
// purpose (count/ub first); the layouts are size-compatible via the union in
// dispatch_private_info_template, so do not reorder fields casually.
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  UT count; // unsigned; chunk/iteration progress counter
  T ub; // loop upper bound
  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
  T lb; // loop lower bound
  ST st; // signed; loop stride
  UT tc; // unsigned; trip count
  T static_steal_counter; // for static_steal only; maybe better to put after ub

  /* parm[1-4] are used in different ways by different scheduling algorithms */

  // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
  //    a) parm3 is properly aligned and
  //    b) all parm1-4 are in the same cache line.
  // Because parm1-4 are used together, performance seems to be better
  // if they are in the same line (not measured though).

  struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
    T parm1;
    T parm2;
    T parm3;
    T parm4;
  };

  UT ordered_lower; // unsigned; lower bound of the current ordered window
  UT ordered_upper; // unsigned; upper bound of the current ordered window
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};

#else /* KMP_STATIC_STEAL_ENABLED */

// Per-thread loop-dispatch bookkeeping (layout used when static stealing is
// compiled out). Replaces the dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types.
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  T lb; // loop lower bound
  T ub; // loop upper bound
  ST st; // signed; loop stride
  UT tc; // unsigned; trip count

  /* parm[1-4] are used in different ways by different scheduling algorithms */
  T parm1;
  T parm2;
  T parm3;
  T parm4;

  UT count; // unsigned; chunk/iteration progress counter

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};

#endif /* KMP_STATIC_STEAL_ENABLED */

// Typed view of a per-thread dispatch buffer.
// Replaces the dispatch_private_info structure and dispatch_private_info_t
// type. The union with the plain 64-bit struct keeps the template's size
// identical to the non-template original (checked by KMP_BUILD_ASSERT in
// __kmp_dispatch_init).
template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
  // duplicate alignment here, otherwise size of structure is not correct in our
  // compiler
  union KMP_ALIGN_CACHE private_info_tmpl {
    dispatch_private_infoXX_template<T> p;
    dispatch_private_info64_t p64;
  } u;
  enum sched_type schedule; /* scheduling algorithm */
  kmp_uint32 ordered; /* ordered clause specified */
  kmp_uint32 ordered_bumped; /* nonzero once this thread signalled its turn */
  // To retain the structure size after making ordered_iteration scalar
  kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
  dispatch_private_info *next; /* stack of buffers for nest of serial regions */
  kmp_uint32 nomerge; /* don't merge iters if serialized */
  kmp_uint32
type_size; /* size in bytes of the loop induction type
                            (set from traits_t<T>::type_size) */
  enum cons_type pushed_ws; /* workshare construct pushed for consistency
                               checking, or ct_none */
};

// Team-shared loop-dispatch state. Replaces the dispatch_shared_info{32,64}
// structures and dispatch_shared_info{32,64}_t types.
template <typename UT> struct dispatch_shared_infoXX_template {
  /* chunk index under dynamic, number of idle threads under static-steal;
     iteration index otherwise */
  volatile UT iteration;
  volatile UT num_done; // number of threads that finished this loop
  volatile UT ordered_iteration; // ticket counter driving ordered sections
  // to retain the structure size making ordered_iteration scalar
  UT ordered_dummy[KMP_MAX_ORDERED - 3];
};

// Typed view of a team dispatch buffer.
// Replaces the dispatch_shared_info structure and dispatch_shared_info_t type.
template <typename UT> struct dispatch_shared_info_template {
  // we need union here to keep the structure size
  union shared_info_tmpl {
    dispatch_shared_infoXX_template<UT> s;
    dispatch_shared_info64_t s64;
  } u;
  volatile kmp_uint32 buffer_index; // which buffer generation is current;
                                    // threads spin on it in __kmp_dispatch_init
#if OMP_45_ENABLED
  volatile kmp_int32 doacross_buf_idx; // teamwise index
  kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
  kmp_int32 doacross_num_done; // count finished threads
#endif
#if KMP_USE_HWLOC
  // When linking with libhwloc, the ORDERED EPCC test slows down on big
  // machines (> 48 cores). Performance analysis showed that a cache thrash
  // was occurring and this padding helps alleviate the problem.
  char padding[64];
#endif
};

/* ------------------------------------------------------------------------ */

#undef USE_TEST_LOCKS

// Type-dispatched wrappers over the KMP_TEST_THEN_* / compare-and-store
// atomic primitives. Only the explicit 32/64-bit specializations below may be
// used; the general templates are deliberately left undefined so that any
// other instantiation fails at link time.

// test_then_add template (general template should NOT be used)
template <typename T> static __forceinline T test_then_add(volatile T *p, T d);

template <>
__forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
                                                 kmp_int32 d) {
  kmp_int32 r;
  r = KMP_TEST_THEN_ADD32(p, d);
  return r;
}

template <>
__forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
                                                 kmp_int64 d) {
  kmp_int64 r;
  r = KMP_TEST_THEN_ADD64(p, d);
  return r;
}

// test_then_inc_acq template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC_ACQ32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC_ACQ64(p);
  return r;
}

// test_then_inc template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC64(p);
  return r;
}

// compare_and_swap template (general template should NOT be used)
template <typename T>
static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
                                                    kmp_int32 c, kmp_int32 s) {
return KMP_COMPARE_AND_STORE_REL32(p, c, s);
}

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
                                                    kmp_int64 c, kmp_int64 s) {
  return KMP_COMPARE_AND_STORE_REL64(p, c, s);
}

/* Spin wait loop that first does pause, then yield.
   Waits until function returns non-zero when called with *spinner and check.
   Does NOT put threads to sleep.
#if USE_ITT_BUILD
   Arguments:
   obj -- is higher-level synchronization object to report to ittnotify.
   It is used to report locks consistently. For example, if lock is
   acquired immediately, its address is reported to ittnotify via
   KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired immediately
   and the lock routine calls KMP_WAIT_YIELD(), the latter should report the
   same address, not an address of low-level spinner.
#endif // USE_ITT_BUILD
*/
template <typename UT>
// ToDo: make inline function (move to header file for icl)
static UT // unsigned 4- or 8-byte type
    __kmp_wait_yield(
        volatile UT *spinner, UT checker,
        kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(
            void *obj) // Higher-level synchronization object, or NULL.
        ) {
  // note: we may not belong to a team at this point
  // NOTE(review): 'register' is deprecated since C++11 and removed in C++17;
  // it is a no-op hint here and could be dropped without behavior change.
  register volatile UT *spin = spinner;
  register UT check = checker;
  register kmp_uint32 spins;
  register kmp_uint32 (*f)(UT, UT) = pred;
  register UT r;

  KMP_FSYNC_SPIN_INIT(obj, (void *)spin);
  KMP_INIT_YIELD(spins);
  // main wait spin loop: re-read *spin each pass and test it with pred
  while (!f(r = *spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* GEH - remove this since it was accidentally introduced when kmp_wait was
       split. It causes problems with infinite recursion because of exit lock */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
        __kmp_abort_thread(); */

    // if we are oversubscribed, or have waited a bit (and
    // KMP_LIBRARY=throughput), then yield. pause is in the following code
    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
    KMP_YIELD_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r; // last value read from *spinner (the one that satisfied pred)
}

// Comparison predicates for use with __kmp_wait_yield (e.g. __kmp_ge in
// __kmp_dispatch_deo, __kmp_eq in __kmp_dispatch_init).
template <typename UT> static kmp_uint32 __kmp_eq(UT value, UT checker) {
  return value == checker;
}

template <typename UT> static kmp_uint32 __kmp_neq(UT value, UT checker) {
  return value != checker;
}

template <typename UT> static kmp_uint32 __kmp_lt(UT value, UT checker) {
  return value < checker;
}

template <typename UT> static kmp_uint32 __kmp_ge(UT value, UT checker) {
  return value >= checker;
}

template <typename UT> static kmp_uint32 __kmp_le(UT value, UT checker) {
  return value <= checker;
}

/* ------------------------------------------------------------------------ */

// Ordered-entry callback installed when the loop has no active ordered
// section (see __kmp_dispatch_init): only performs the consistency-check
// bookkeeping, pushing the ordered-in-pdo construct when checking is enabled.
static void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref,
                                     ident_t *loc_ref) {
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(gtid_ref);

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_root->r.r_active &&
        (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }
}

// Ordered-section entry: blocks the calling thread until the team-shared
// ordered_iteration ticket reaches this thread's ordered_lower bound.
template <typename UT>
static void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  typedef typename traits_t<UT>::signed_t ST;
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    UT lower;

    if (!__kmp_env_consistency_check) {
      // pr was not fetched above when checking is off; fetch it now
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }
    lower = pr->u.p.ordered_lower;

#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      // A second entry without an intervening dxo means a nested/duplicate
      // ordered section -- report it.
      if (pr->ordered_bumped) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB();
#ifdef KMP_DEBUG
    {
      const char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif

    // Spin until all earlier ordered iterations have signalled completion,
    // i.e. the shared ticket counter reaches our lower bound.
    __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
                         __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
    KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
    {
      const char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
  }
  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
}

// Ordered-exit callback installed when the loop has no active ordered
// section: only pops the consistency-checking construct, if any.
static void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref,
                                     ident_t *loc_ref) {
  kmp_info_t *th;

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
      __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
    }
  }
}

// Ordered-section exit: marks this thread's turn as taken (ordered_bumped)
// and atomically advances the team-shared ordered_iteration ticket so the
// next iteration's owner can proceed.
template <typename UT>
static void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  typedef typename traits_t<UT>::signed_t ST;
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
      __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }

    KMP_FSYNC_RELEASING(&sh->u.s.ordered_iteration);
#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped != 0) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        /* How to test it? - OM */
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB(); /* Flush all pending memory write invalidates. */

    pr->ordered_bumped += 1;

    KD_TRACE(1000,
             ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
              gtid, pr->ordered_bumped));

    KMP_MB(); /* Flush all pending memory write invalidates. */

    /* TODO use general release procedure? */
    test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);

    KMP_MB(); /* Flush all pending memory write invalidates. */
  }
  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
}

// Computes and returns x to the power of y (binary exponentiation), where y
// must be a non-negative integer. The assert restricts x to (0, 1), which is
// the range used by the guided-scheduling callers below.
template <typename UT>
static __forceinline long double __kmp_pow(long double x, UT y) {
  long double s = 1.0L;

  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
  // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
  while (y) {
    if (y & 1)
      s *= x;
    x *= x;
    y >>= 1;
  }
  return s;
}

/* Computes and returns the number of unassigned iterations after idx chunks
   have been assigned (the total number of unassigned iterations in chunks with
   index greater than or equal to idx). __forceinline seems to be broken so
   that if we __forceinline this function, the behavior is wrong
   (one of the unit tests, sch_guided_analytical_basic.cpp, fails) */
template <typename T>
static __inline typename traits_t<T>::unsigned_t
__kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
                                typename traits_t<T>::unsigned_t idx) {
  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at least for
     ICL 8.1, long double arithmetic may not really have long double precision,
     even with /Qlong_double.
Currently, we workaround that in the caller code, 502 by manipulating the FPCW for Windows* OS on IA-32 architecture. The lack 503 of precision is not expected to be a correctness issue, though. */ 504 typedef typename traits_t<T>::unsigned_t UT; 505 506 long double x = tc * __kmp_pow<UT>(base, idx); 507 UT r = (UT)x; 508 if (x == r) 509 return r; 510 return r + 1; 511 } 512 513 // Parameters of the guided-iterative algorithm: 514 // p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic 515 // p3 = 1 / ( n * nproc ) // remaining iterations multiplier 516 // by default n = 2. For example with n = 3 the chunks distribution will be more 517 // flat. 518 // With n = 1 first chunk is the same as for static schedule, e.g. trip / nproc. 519 static int guided_int_param = 2; 520 static double guided_flt_param = 0.5; // = 1.0 / guided_int_param; 521 522 // UT - unsigned flavor of T, ST - signed flavor of T, 523 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8 524 template <typename T> 525 static void 526 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb, 527 T ub, typename traits_t<T>::signed_t st, 528 typename traits_t<T>::signed_t chunk, int push_ws) { 529 typedef typename traits_t<T>::unsigned_t UT; 530 typedef typename traits_t<T>::signed_t ST; 531 typedef typename traits_t<T>::floating_t DBL; 532 533 int active; 534 T tc; 535 kmp_info_t *th; 536 kmp_team_t *team; 537 kmp_uint32 my_buffer_index; 538 dispatch_private_info_template<T> *pr; 539 dispatch_shared_info_template<UT> volatile *sh; 540 541 KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) == 542 sizeof(dispatch_private_info)); 543 KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) == 544 sizeof(dispatch_shared_info)); 545 546 if (!TCR_4(__kmp_init_parallel)) 547 __kmp_parallel_initialize(); 548 549 #if INCLUDE_SSC_MARKS 550 SSC_MARK_DISPATCH_INIT(); 551 #endif 552 #ifdef KMP_DEBUG 553 { 554 const char *buff; 555 // create format specifiers before 
the debug output 556 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d " 557 "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n", 558 traits_t<ST>::spec, traits_t<T>::spec, 559 traits_t<T>::spec, traits_t<ST>::spec); 560 KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st)); 561 __kmp_str_free(&buff); 562 } 563 #endif 564 /* setup data */ 565 th = __kmp_threads[gtid]; 566 team = th->th.th_team; 567 active = !team->t.t_serialized; 568 th->th.th_ident = loc; 569 570 #if USE_ITT_BUILD 571 kmp_uint64 cur_chunk = chunk; 572 int itt_need_metadata_reporting = __itt_metadata_add_ptr && 573 __kmp_forkjoin_frames_mode == 3 && 574 KMP_MASTER_GTID(gtid) && 575 #if OMP_40_ENABLED 576 th->th.th_teams_microtask == NULL && 577 #endif 578 team->t.t_active_level == 1; 579 #endif 580 if (!active) { 581 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 582 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 583 } else { 584 KMP_DEBUG_ASSERT(th->th.th_dispatch == 585 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 586 587 my_buffer_index = th->th.th_dispatch->th_disp_index++; 588 589 /* What happens when number of threads changes, need to resize buffer? 
*/ 590 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 591 &th->th.th_dispatch 592 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 593 sh = reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 594 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 595 } 596 597 #if (KMP_STATIC_STEAL_ENABLED) 598 if (SCHEDULE_HAS_NONMONOTONIC(schedule)) 599 // AC: we now have only one implementation of stealing, so use it 600 schedule = kmp_sch_static_steal; 601 else 602 #endif 603 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 604 605 /* Pick up the nomerge/ordered bits from the scheduling type */ 606 if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) { 607 pr->nomerge = TRUE; 608 schedule = 609 (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); 610 } else { 611 pr->nomerge = FALSE; 612 } 613 pr->type_size = traits_t<T>::type_size; // remember the size of variables 614 if (kmp_ord_lower & schedule) { 615 pr->ordered = TRUE; 616 schedule = 617 (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); 618 } else { 619 pr->ordered = FALSE; 620 } 621 622 if (schedule == kmp_sch_static) { 623 schedule = __kmp_static; 624 } else { 625 if (schedule == kmp_sch_runtime) { 626 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if 627 // not specified) 628 schedule = team->t.t_sched.r_sched_type; 629 // Detail the schedule if needed (global controls are differentiated 630 // appropriately) 631 if (schedule == kmp_sch_guided_chunked) { 632 schedule = __kmp_guided; 633 } else if (schedule == kmp_sch_static) { 634 schedule = __kmp_static; 635 } 636 // Use the chunk size specified by OMP_SCHEDULE (or default if not 637 // specified) 638 chunk = team->t.t_sched.chunk; 639 #if USE_ITT_BUILD 640 cur_chunk = chunk; 641 #endif 642 #ifdef KMP_DEBUG 643 { 644 const char *buff; 645 // create format specifiers before the debug output 646 buff = __kmp_str_format( 647 
"__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n", 648 traits_t<ST>::spec); 649 KD_TRACE(10, (buff, gtid, schedule, chunk)); 650 __kmp_str_free(&buff); 651 } 652 #endif 653 } else { 654 if (schedule == kmp_sch_guided_chunked) { 655 schedule = __kmp_guided; 656 } 657 if (chunk <= 0) { 658 chunk = KMP_DEFAULT_CHUNK; 659 } 660 } 661 662 if (schedule == kmp_sch_auto) { 663 // mapping and differentiation: in the __kmp_do_serial_initialize() 664 schedule = __kmp_auto; 665 #ifdef KMP_DEBUG 666 { 667 const char *buff; 668 // create format specifiers before the debug output 669 buff = __kmp_str_format("__kmp_dispatch_init: kmp_sch_auto: T#%%d new: " 670 "schedule:%%d chunk:%%%s\n", 671 traits_t<ST>::spec); 672 KD_TRACE(10, (buff, gtid, schedule, chunk)); 673 __kmp_str_free(&buff); 674 } 675 #endif 676 } 677 678 /* guided analytical not safe for too many threads */ 679 if (schedule == kmp_sch_guided_analytical_chunked && 680 th->th.th_team_nproc > 1 << 20) { 681 schedule = kmp_sch_guided_iterative_chunked; 682 KMP_WARNING(DispatchManyThreads); 683 } 684 pr->u.p.parm1 = chunk; 685 } 686 KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper), 687 "unknown scheduling type"); 688 689 pr->u.p.count = 0; 690 691 if (__kmp_env_consistency_check) { 692 if (st == 0) { 693 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, 694 (pr->ordered ? 
ct_pdo_ordered : ct_pdo), loc); 695 } 696 } 697 // compute trip count 698 if (st == 1) { // most common case 699 if (ub >= lb) { 700 tc = ub - lb + 1; 701 } else { // ub < lb 702 tc = 0; // zero-trip 703 } 704 } else if (st < 0) { 705 if (lb >= ub) { 706 // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B), 707 // where the division needs to be unsigned regardless of the result type 708 tc = (UT)(lb - ub) / (-st) + 1; 709 } else { // lb < ub 710 tc = 0; // zero-trip 711 } 712 } else { // st > 0 713 if (ub >= lb) { 714 // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B), 715 // where the division needs to be unsigned regardless of the result type 716 tc = (UT)(ub - lb) / st + 1; 717 } else { // ub < lb 718 tc = 0; // zero-trip 719 } 720 } 721 722 // Any half-decent optimizer will remove this test when the blocks are empty 723 // since the macros expand to nothing when statistics are disabled. 724 if (schedule == __kmp_static) { 725 KMP_COUNT_BLOCK(OMP_FOR_static); 726 KMP_COUNT_VALUE(FOR_static_iterations, tc); 727 } else { 728 KMP_COUNT_BLOCK(OMP_FOR_dynamic); 729 KMP_COUNT_VALUE(FOR_dynamic_iterations, tc); 730 } 731 732 pr->u.p.lb = lb; 733 pr->u.p.ub = ub; 734 pr->u.p.st = st; 735 pr->u.p.tc = tc; 736 737 #if KMP_OS_WINDOWS 738 pr->u.p.last_upper = ub + st; 739 #endif /* KMP_OS_WINDOWS */ 740 741 /* NOTE: only the active parallel region(s) has active ordered sections */ 742 743 if (active) { 744 if (pr->ordered == 0) { 745 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error; 746 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error; 747 } else { 748 pr->ordered_bumped = 0; 749 750 pr->u.p.ordered_lower = 1; 751 pr->u.p.ordered_upper = 0; 752 753 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>; 754 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>; 755 } 756 } 757 758 if (__kmp_env_consistency_check) { 759 enum cons_type ws = pr->ordered ? 
ct_pdo_ordered : ct_pdo; 760 if (push_ws) { 761 __kmp_push_workshare(gtid, ws, loc); 762 pr->pushed_ws = ws; 763 } else { 764 __kmp_check_workshare(gtid, ws, loc); 765 pr->pushed_ws = ct_none; 766 } 767 } 768 769 switch (schedule) { 770 #if (KMP_STATIC_STEAL_ENABLED) 771 case kmp_sch_static_steal: { 772 T nproc = th->th.th_team_nproc; 773 T ntc, init; 774 775 KD_TRACE(100, 776 ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid)); 777 778 ntc = (tc % chunk ? 1 : 0) + tc / chunk; 779 if (nproc > 1 && ntc >= nproc) { 780 KMP_COUNT_BLOCK(OMP_FOR_static_steal); 781 T id = __kmp_tid_from_gtid(gtid); 782 T small_chunk, extras; 783 784 small_chunk = ntc / nproc; 785 extras = ntc % nproc; 786 787 init = id * small_chunk + (id < extras ? id : extras); 788 pr->u.p.count = init; 789 pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0); 790 791 pr->u.p.parm2 = lb; 792 // pr->pfields.parm3 = 0; // it's not used in static_steal 793 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid 794 pr->u.p.st = st; 795 if (traits_t<T>::type_size > 4) { 796 // AC: TODO: check if 16-byte CAS available and use it to 797 // improve performance (probably wait for explicit request 798 // before spending time on this). 799 // For now use dynamically allocated per-thread lock, 800 // free memory in __kmp_dispatch_next when status==0. 
801 KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL); 802 th->th.th_dispatch->th_steal_lock = 803 (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t)); 804 __kmp_init_lock(th->th.th_dispatch->th_steal_lock); 805 } 806 break; 807 } else { 808 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to " 809 "kmp_sch_static_balanced\n", 810 gtid)); 811 schedule = kmp_sch_static_balanced; 812 /* too few iterations: fall-through to kmp_sch_static_balanced */ 813 } // if 814 /* FALL-THROUGH to static balanced */ 815 } // case 816 #endif 817 case kmp_sch_static_balanced: { 818 T nproc = th->th.th_team_nproc; 819 T init, limit; 820 821 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n", 822 gtid)); 823 824 if (nproc > 1) { 825 T id = __kmp_tid_from_gtid(gtid); 826 827 if (tc < nproc) { 828 if (id < tc) { 829 init = id; 830 limit = id; 831 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ 832 } else { 833 pr->u.p.count = 1; /* means no more chunks to execute */ 834 pr->u.p.parm1 = FALSE; 835 break; 836 } 837 } else { 838 T small_chunk = tc / nproc; 839 T extras = tc % nproc; 840 init = id * small_chunk + (id < extras ? id : extras); 841 limit = init + small_chunk - (id < extras ? 
0 : 1); 842 pr->u.p.parm1 = (id == nproc - 1); 843 } 844 } else { 845 if (tc > 0) { 846 init = 0; 847 limit = tc - 1; 848 pr->u.p.parm1 = TRUE; 849 } else { // zero trip count 850 pr->u.p.count = 1; /* means no more chunks to execute */ 851 pr->u.p.parm1 = FALSE; 852 break; 853 } 854 } 855 #if USE_ITT_BUILD 856 // Calculate chunk for metadata report 857 if (itt_need_metadata_reporting) 858 cur_chunk = limit - init + 1; 859 #endif 860 if (st == 1) { 861 pr->u.p.lb = lb + init; 862 pr->u.p.ub = lb + limit; 863 } else { 864 // calculated upper bound, "ub" is user-defined upper bound 865 T ub_tmp = lb + limit * st; 866 pr->u.p.lb = lb + init * st; 867 // adjust upper bound to "ub" if needed, so that MS lastprivate will match 868 // it exactly 869 if (st > 0) { 870 pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp); 871 } else { 872 pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp); 873 } 874 } 875 if (pr->ordered) { 876 pr->u.p.ordered_lower = init; 877 pr->u.p.ordered_upper = limit; 878 } 879 break; 880 } // case 881 case kmp_sch_guided_iterative_chunked: { 882 T nproc = th->th.th_team_nproc; 883 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked" 884 " case\n", 885 gtid)); 886 887 if (nproc > 1) { 888 if ((2L * chunk + 1) * nproc >= tc) { 889 /* chunk size too large, switch to dynamic */ 890 schedule = kmp_sch_dynamic_chunked; 891 } else { 892 // when remaining iters become less than parm2 - switch to dynamic 893 pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1); 894 *(double *)&pr->u.p.parm3 = 895 guided_flt_param / nproc; // may occupy parm3 and parm4 896 } 897 } else { 898 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to " 899 "kmp_sch_static_greedy\n", 900 gtid)); 901 schedule = kmp_sch_static_greedy; 902 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 903 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n", 904 gtid)); 905 pr->u.p.parm1 = tc; 906 } // if 907 } // case 908 break; 909 case 
kmp_sch_guided_analytical_chunked: { 910 T nproc = th->th.th_team_nproc; 911 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked" 912 " case\n", 913 gtid)); 914 if (nproc > 1) { 915 if ((2L * chunk + 1) * nproc >= tc) { 916 /* chunk size too large, switch to dynamic */ 917 schedule = kmp_sch_dynamic_chunked; 918 } else { 919 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 920 DBL x; 921 922 #if KMP_OS_WINDOWS && KMP_ARCH_X86 923 /* Linux* OS already has 64-bit computation by default for long double, 924 and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On 925 Windows* OS on IA-32 architecture, we need to set precision to 64-bit 926 instead of the default 53-bit. Even though long double doesn't work 927 on Windows* OS on Intel(R) 64, the resulting lack of precision is not 928 expected to impact the correctness of the algorithm, but this has not 929 been mathematically proven. */ 930 // save original FPCW and set precision to 64-bit, as 931 // Windows* OS on IA-32 architecture defaults to 53-bit 932 unsigned int oldFpcw = _control87(0, 0); 933 _control87(_PC_64, _MCW_PC); // 0,0x30000 934 #endif 935 /* value used for comparison in solver for cross-over point */ 936 long double target = ((long double)chunk * 2 + 1) * nproc / tc; 937 938 /* crossover point--chunk indexes equal to or greater than 939 this point switch to dynamic-style scheduling */ 940 UT cross; 941 942 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 943 x = (long double)1.0 - (long double)0.5 / nproc; 944 945 #ifdef KMP_DEBUG 946 { // test natural alignment 947 struct _test_a { 948 char a; 949 union { 950 char b; 951 DBL d; 952 }; 953 } t; 954 ptrdiff_t natural_alignment = 955 (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; 956 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long 957 // long)natural_alignment ); 958 KMP_DEBUG_ASSERT( 959 (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0); 960 } 961 #endif // KMP_DEBUG 962 963 /* save the 
term in thread private dispatch structure */ 964 *(DBL *)&pr->u.p.parm3 = x; 965 966 /* solve for the crossover point to the nearest integer i for which C_i 967 <= chunk */ 968 { 969 UT left, right, mid; 970 long double p; 971 972 /* estimate initial upper and lower bound */ 973 974 /* doesn't matter what value right is as long as it is positive, but 975 it affects performance of the solver */ 976 right = 229; 977 p = __kmp_pow<UT>(x, right); 978 if (p > target) { 979 do { 980 p *= p; 981 right <<= 1; 982 } while (p > target && right < (1 << 27)); 983 /* lower bound is previous (failed) estimate of upper bound */ 984 left = right >> 1; 985 } else { 986 left = 0; 987 } 988 989 /* bisection root-finding method */ 990 while (left + 1 < right) { 991 mid = (left + right) / 2; 992 if (__kmp_pow<UT>(x, mid) > target) { 993 left = mid; 994 } else { 995 right = mid; 996 } 997 } // while 998 cross = right; 999 } 1000 /* assert sanity of computed crossover point */ 1001 KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target && 1002 __kmp_pow<UT>(x, cross) <= target); 1003 1004 /* save the crossover point in thread private dispatch structure */ 1005 pr->u.p.parm2 = cross; 1006 1007 // C75803 1008 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8)) 1009 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3) 1010 #else 1011 #define GUIDED_ANALYTICAL_WORKAROUND (x) 1012 #endif 1013 /* dynamic-style scheduling offset */ 1014 pr->u.p.count = tc - __kmp_dispatch_guided_remaining( 1015 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - 1016 cross * chunk; 1017 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1018 // restore FPCW 1019 _control87(oldFpcw, _MCW_PC); 1020 #endif 1021 } // if 1022 } else { 1023 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to " 1024 "kmp_sch_static_greedy\n", 1025 gtid)); 1026 schedule = kmp_sch_static_greedy; 1027 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 1028 pr->u.p.parm1 = tc; 1029 } // if 1030 } // case 
1031 break; 1032 case kmp_sch_static_greedy: 1033 KD_TRACE(100, 1034 ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n", gtid)); 1035 pr->u.p.parm1 = (th->th.th_team_nproc > 1) 1036 ? (tc + th->th.th_team_nproc - 1) / th->th.th_team_nproc 1037 : tc; 1038 break; 1039 case kmp_sch_static_chunked: 1040 case kmp_sch_dynamic_chunked: 1041 if (pr->u.p.parm1 <= 0) { 1042 pr->u.p.parm1 = KMP_DEFAULT_CHUNK; 1043 } 1044 KD_TRACE(100, ("__kmp_dispatch_init: T#%d " 1045 "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", 1046 gtid)); 1047 break; 1048 case kmp_sch_trapezoidal: { 1049 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ 1050 1051 T parm1, parm2, parm3, parm4; 1052 KD_TRACE(100, 1053 ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid)); 1054 1055 parm1 = chunk; 1056 1057 /* F : size of the first cycle */ 1058 parm2 = (tc / (2 * th->th.th_team_nproc)); 1059 1060 if (parm2 < 1) { 1061 parm2 = 1; 1062 } 1063 1064 /* L : size of the last cycle. Make sure the last cycle is not larger 1065 than the first cycle. 
*/ 1066 if (parm1 < 1) { 1067 parm1 = 1; 1068 } else if (parm1 > parm2) { 1069 parm1 = parm2; 1070 } 1071 1072 /* N : number of cycles */ 1073 parm3 = (parm2 + parm1); 1074 parm3 = (2 * tc + parm3 - 1) / parm3; 1075 1076 if (parm3 < 2) { 1077 parm3 = 2; 1078 } 1079 1080 /* sigma : decreasing incr of the trapezoid */ 1081 parm4 = (parm3 - 1); 1082 parm4 = (parm2 - parm1) / parm4; 1083 1084 // pointless check, because parm4 >= 0 always 1085 // if ( parm4 < 0 ) { 1086 // parm4 = 0; 1087 //} 1088 1089 pr->u.p.parm1 = parm1; 1090 pr->u.p.parm2 = parm2; 1091 pr->u.p.parm3 = parm3; 1092 pr->u.p.parm4 = parm4; 1093 } // case 1094 break; 1095 1096 default: { 1097 __kmp_msg(kmp_ms_fatal, // Severity 1098 KMP_MSG(UnknownSchedTypeDetected), // Primary message 1099 KMP_HNT(GetNewerLibrary), // Hint 1100 __kmp_msg_null // Variadic argument list terminator 1101 ); 1102 } break; 1103 } // switch 1104 pr->schedule = schedule; 1105 if (active) { 1106 /* The name of this buffer should be my_buffer_index when it's free to use 1107 * it */ 1108 1109 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d " 1110 "sh->buffer_index:%d\n", 1111 gtid, my_buffer_index, sh->buffer_index)); 1112 __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index, 1113 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL)); 1114 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and 1115 // my_buffer_index are *always* 32-bit integers. 1116 KMP_MB(); /* is this necessary? 
*/ 1117 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d " 1118 "sh->buffer_index:%d\n", 1119 gtid, my_buffer_index, sh->buffer_index)); 1120 1121 th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr; 1122 th->th.th_dispatch->th_dispatch_sh_current = (dispatch_shared_info_t *)sh; 1123 #if USE_ITT_BUILD 1124 if (pr->ordered) { 1125 __kmp_itt_ordered_init(gtid); 1126 }; // if 1127 // Report loop metadata 1128 if (itt_need_metadata_reporting) { 1129 // Only report metadata by master of active team at level 1 1130 kmp_uint64 schedtype = 0; 1131 switch (schedule) { 1132 case kmp_sch_static_chunked: 1133 case kmp_sch_static_balanced: // Chunk is calculated in the switch above 1134 break; 1135 case kmp_sch_static_greedy: 1136 cur_chunk = pr->u.p.parm1; 1137 break; 1138 case kmp_sch_dynamic_chunked: 1139 schedtype = 1; 1140 break; 1141 case kmp_sch_guided_iterative_chunked: 1142 case kmp_sch_guided_analytical_chunked: 1143 schedtype = 2; 1144 break; 1145 default: 1146 // Should we put this case under "static"? 
1147 // case kmp_sch_static_steal: 1148 schedtype = 3; 1149 break; 1150 } 1151 __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk); 1152 } 1153 #endif /* USE_ITT_BUILD */ 1154 }; // if 1155 1156 #ifdef KMP_DEBUG 1157 { 1158 const char *buff; 1159 // create format specifiers before the debug output 1160 buff = __kmp_str_format( 1161 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s " 1162 "lb:%%%s ub:%%%s" 1163 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" 1164 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n", 1165 traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec, 1166 traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec, 1167 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec, 1168 traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec); 1169 KD_TRACE(10, (buff, gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub, 1170 pr->u.p.st, pr->u.p.tc, pr->u.p.count, pr->u.p.ordered_lower, 1171 pr->u.p.ordered_upper, pr->u.p.parm1, pr->u.p.parm2, 1172 pr->u.p.parm3, pr->u.p.parm4)); 1173 __kmp_str_free(&buff); 1174 } 1175 #endif 1176 #if (KMP_STATIC_STEAL_ENABLED) 1177 // It cannot be guaranteed that after execution of a loop with some other 1178 // schedule kind all the parm3 variables will contain the same value. Even if 1179 // all parm3 will be the same, it still exists a bad case like using 0 and 1 1180 // rather than program life-time increment. So the dedicated variable is 1181 // required. The 'static_steal_counter' is used. 1182 if (schedule == kmp_sch_static_steal) { 1183 // Other threads will inspect this variable when searching for a victim. 1184 // This is a flag showing that other threads may steal from this thread 1185 // since then. 
    // Advertise this thread as a potential steal victim: bump the per-thread
    // static_steal_counter that other threads compare against their own in the
    // victim-search loops of __kmp_dispatch_next.
    volatile T *p = &pr->u.p.static_steal_counter;
    *p = *p + 1;
  }
#endif // ( KMP_STATIC_STEAL_ENABLED )

#if OMPT_SUPPORT && OMPT_TRACE
  // Notify a registered OMPT tool that a loop worksharing region begins.
  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
    ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
        team_info->parallel_id, task_info->task_id, team_info->microtask);
  }
#endif
}

/* For ordered loops, either __kmp_dispatch_finish() should be called after
 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
 * every chunk of iterations. If the ordered section(s) were not executed
 * for this iteration (or every iteration in this chunk), we need to set the
 * ordered iteration counters so that the next thread can proceed. */

// __kmp_dispatch_finish: per-iteration completion hook for ordered loops.
//   gtid - global thread id of the calling thread
//   loc  - source location info (not used in this function body)
// If the ordered section already advanced the shared counter for this
// iteration (ordered_bumped set), only the flag is cleared. Otherwise this
// thread waits until the team-shared ordered_iteration counter satisfies
// __kmp_ge against its own ordered_lower, then atomically increments it so
// the next thread in iteration order may proceed.
template <typename UT>
static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {
    // Current dispatch buffers of this thread: private (per-thread) and
    // shared (per-team) views.
    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    if (pr->ordered_bumped) {
      // The ordered section already bumped the shared counter for this
      // iteration; just clear the flag and return.
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      UT lower = pr->u.p.ordered_lower;

#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      // Spin/yield until the shared ordered_iteration counter reaches this
      // thread's lower bound (comparator __kmp_ge).
      __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
                           __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
      KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      // Atomically advance the shared counter to release the next thread.
      test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
    } // if
  } // if
  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
}

#ifdef KMP_GOMP_COMPAT

// __kmp_dispatch_finish_chunk: per-chunk completion hook for ordered loops
// (GOMP compatibility path).
//   gtid - global thread id of the calling thread
//   loc  - source location info (not used in this function body)
// Like __kmp_dispatch_finish, but accounts for a whole chunk of iterations
// [ordered_lower, ordered_upper]: the shared ordered_iteration counter is
// advanced by the number of iterations in the chunk whose ordered sections
// were not executed (inc minus ordered_bumped).
template <typename UT>
static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {
    // int cid;
    // Current dispatch buffers of this thread: private (per-thread) and
    // shared (per-team) views.
    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
    UT lower = pr->u.p.ordered_lower;
    UT upper = pr->u.p.ordered_upper;
    UT inc = upper - lower + 1; // total iterations in this chunk

    if (pr->ordered_bumped == inc) {
      // Every iteration in the chunk already bumped the shared counter via
      // its ordered section; just clear the flag.
      // NOTE(review): this trace message says "__kmp_dispatch_finish" but we
      // are in __kmp_dispatch_finish_chunk — looks like a copy-paste slip in
      // the debug string; confirm before changing.
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      // Only advance the shared counter by the iterations that did NOT
      // execute their ordered section.
      inc -= pr->ordered_bumped;

#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d before wait: "
            "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      // Spin/yield until the shared ordered_iteration counter reaches this
      // chunk's lower bound (comparator __kmp_ge).
      __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
                           __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));

      KMP_MB(); /* is this necessary? */
      KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
                      "ordered_bumped to zero\n",
                      gtid));
      pr->ordered_bumped = 0;
//!!!!! TODO check if the inc should be unsigned, or signed???
#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d after wait: "
            "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
            traits_t<UT>::spec);
        KD_TRACE(1000,
                 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      // Atomically advance the shared counter past this chunk to release
      // the threads waiting on subsequent iterations.
      test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
    }
    // }
  }
  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
}

#endif /* KMP_GOMP_COMPAT */

/* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
   work), then tell OMPT the loop is over.
In some cases kmp_dispatch_fini() 1350 is not called. */ 1351 #if OMPT_SUPPORT && OMPT_TRACE 1352 #define OMPT_LOOP_END \ 1353 if (status == 0) { \ 1354 if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_end)) { \ 1355 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \ 1356 ompt_task_info_t *task_info = __ompt_get_taskinfo(0); \ 1357 ompt_callbacks.ompt_callback(ompt_event_loop_end)( \ 1358 team_info->parallel_id, task_info->task_id); \ 1359 } \ 1360 } 1361 #else 1362 #define OMPT_LOOP_END // no-op 1363 #endif 1364 1365 template <typename T> 1366 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, 1367 T *p_lb, T *p_ub, 1368 typename traits_t<T>::signed_t *p_st) { 1369 1370 typedef typename traits_t<T>::unsigned_t UT; 1371 typedef typename traits_t<T>::signed_t ST; 1372 typedef typename traits_t<T>::floating_t DBL; 1373 1374 // This is potentially slightly misleading, schedule(runtime) will appear here 1375 // even if the actual runtme schedule is static. (Which points out a 1376 // disadavantage of schedule(runtime): even when static scheduling is used it 1377 // costs more than a compile time choice to use static scheduling would.) 1378 KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling); 1379 1380 int status; 1381 dispatch_private_info_template<T> *pr; 1382 kmp_info_t *th = __kmp_threads[gtid]; 1383 kmp_team_t *team = th->th.th_team; 1384 1385 KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL 1386 #ifdef KMP_DEBUG 1387 { 1388 const char *buff; 1389 // create format specifiers before the debug output 1390 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d called p_lb:%%%s " 1391 "p_ub:%%%s p_st:%%%s p_last: %%p\n", 1392 traits_t<T>::spec, traits_t<T>::spec, 1393 traits_t<ST>::spec); 1394 KD_TRACE(1000, (buff, gtid, *p_lb, *p_ub, p_st ? 
*p_st : 0, p_last)); 1395 __kmp_str_free(&buff); 1396 } 1397 #endif 1398 1399 if (team->t.t_serialized) { 1400 /* NOTE: serialize this dispatch becase we are not at the active level */ 1401 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 1402 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 1403 KMP_DEBUG_ASSERT(pr); 1404 1405 if ((status = (pr->u.p.tc != 0)) == 0) { 1406 *p_lb = 0; 1407 *p_ub = 0; 1408 // if ( p_last != NULL ) 1409 // *p_last = 0; 1410 if (p_st != NULL) 1411 *p_st = 0; 1412 if (__kmp_env_consistency_check) { 1413 if (pr->pushed_ws != ct_none) { 1414 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 1415 } 1416 } 1417 } else if (pr->nomerge) { 1418 kmp_int32 last; 1419 T start; 1420 UT limit, trip, init; 1421 ST incr; 1422 T chunk = pr->u.p.parm1; 1423 1424 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", 1425 gtid)); 1426 1427 init = chunk * pr->u.p.count++; 1428 trip = pr->u.p.tc - 1; 1429 1430 if ((status = (init <= trip)) == 0) { 1431 *p_lb = 0; 1432 *p_ub = 0; 1433 // if ( p_last != NULL ) 1434 // *p_last = 0; 1435 if (p_st != NULL) 1436 *p_st = 0; 1437 if (__kmp_env_consistency_check) { 1438 if (pr->pushed_ws != ct_none) { 1439 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 1440 } 1441 } 1442 } else { 1443 start = pr->u.p.lb; 1444 limit = chunk + init - 1; 1445 incr = pr->u.p.st; 1446 1447 if ((last = (limit >= trip)) != 0) { 1448 limit = trip; 1449 #if KMP_OS_WINDOWS 1450 pr->u.p.last_upper = pr->u.p.ub; 1451 #endif /* KMP_OS_WINDOWS */ 1452 } 1453 if (p_last != NULL) 1454 *p_last = last; 1455 if (p_st != NULL) 1456 *p_st = incr; 1457 if (incr == 1) { 1458 *p_lb = start + init; 1459 *p_ub = start + limit; 1460 } else { 1461 *p_lb = start + init * incr; 1462 *p_ub = start + limit * incr; 1463 } 1464 1465 if (pr->ordered) { 1466 pr->u.p.ordered_lower = init; 1467 pr->u.p.ordered_upper = limit; 1468 #ifdef KMP_DEBUG 1469 { 1470 const char *buff; 1471 // create format 
specifiers before the debug output 1472 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 1473 "ordered_lower:%%%s ordered_upper:%%%s\n", 1474 traits_t<UT>::spec, traits_t<UT>::spec); 1475 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 1476 pr->u.p.ordered_upper)); 1477 __kmp_str_free(&buff); 1478 } 1479 #endif 1480 } // if 1481 } // if 1482 } else { 1483 pr->u.p.tc = 0; 1484 *p_lb = pr->u.p.lb; 1485 *p_ub = pr->u.p.ub; 1486 #if KMP_OS_WINDOWS 1487 pr->u.p.last_upper = *p_ub; 1488 #endif /* KMP_OS_WINDOWS */ 1489 if (p_last != NULL) 1490 *p_last = TRUE; 1491 if (p_st != NULL) 1492 *p_st = pr->u.p.st; 1493 } // if 1494 #ifdef KMP_DEBUG 1495 { 1496 const char *buff; 1497 // create format specifiers before the debug output 1498 buff = __kmp_str_format( 1499 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " 1500 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", 1501 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 1502 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status)); 1503 __kmp_str_free(&buff); 1504 } 1505 #endif 1506 #if INCLUDE_SSC_MARKS 1507 SSC_MARK_DISPATCH_NEXT(); 1508 #endif 1509 OMPT_LOOP_END; 1510 return status; 1511 } else { 1512 kmp_int32 last = 0; 1513 dispatch_shared_info_template<UT> *sh; 1514 T start; 1515 ST incr; 1516 UT limit, trip, init; 1517 1518 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1519 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1520 1521 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 1522 th->th.th_dispatch->th_dispatch_pr_current); 1523 KMP_DEBUG_ASSERT(pr); 1524 sh = reinterpret_cast<dispatch_shared_info_template<UT> *>( 1525 th->th.th_dispatch->th_dispatch_sh_current); 1526 KMP_DEBUG_ASSERT(sh); 1527 1528 if (pr->u.p.tc == 0) { 1529 // zero trip count 1530 status = 0; 1531 } else { 1532 switch (pr->schedule) { 1533 #if (KMP_STATIC_STEAL_ENABLED) 1534 case kmp_sch_static_steal: { 1535 T chunk = pr->u.p.parm1; 1536 int nproc = th->th.th_team_nproc; 1537 1538 
KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", 1539 gtid)); 1540 1541 trip = pr->u.p.tc - 1; 1542 1543 if (traits_t<T>::type_size > 4) { 1544 // use lock for 8-byte and CAS for 4-byte induction 1545 // variable. TODO (optional): check and use 16-byte CAS 1546 kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock; 1547 KMP_DEBUG_ASSERT(lck != NULL); 1548 if (pr->u.p.count < (UT)pr->u.p.ub) { 1549 __kmp_acquire_lock(lck, gtid); 1550 // try to get own chunk of iterations 1551 init = (pr->u.p.count)++; 1552 status = (init < (UT)pr->u.p.ub); 1553 __kmp_release_lock(lck, gtid); 1554 } else { 1555 status = 0; // no own chunks 1556 } 1557 if (!status) { // try to steal 1558 kmp_info_t **other_threads = team->t.t_threads; 1559 int while_limit = nproc; // nproc attempts to find a victim 1560 int while_index = 0; 1561 // TODO: algorithm of searching for a victim 1562 // should be cleaned up and measured 1563 while ((!status) && (while_limit != ++while_index)) { 1564 T remaining; 1565 T victimIdx = pr->u.p.parm4; 1566 T oldVictimIdx = victimIdx ? 
victimIdx - 1 : nproc - 1; 1567 dispatch_private_info_template<T> *victim = 1568 reinterpret_cast<dispatch_private_info_template<T> *>( 1569 other_threads[victimIdx] 1570 ->th.th_dispatch->th_dispatch_pr_current); 1571 while ((victim == NULL || victim == pr || 1572 (*(volatile T *)&victim->u.p.static_steal_counter != 1573 *(volatile T *)&pr->u.p.static_steal_counter)) && 1574 oldVictimIdx != victimIdx) { 1575 victimIdx = (victimIdx + 1) % nproc; 1576 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1577 other_threads[victimIdx] 1578 ->th.th_dispatch->th_dispatch_pr_current); 1579 }; 1580 if (!victim || 1581 (*(volatile T *)&victim->u.p.static_steal_counter != 1582 *(volatile T *)&pr->u.p.static_steal_counter)) { 1583 continue; // try once more (nproc attempts in total) 1584 // no victim is ready yet to participate in stealing 1585 // because all victims are still in kmp_init_dispatch 1586 } 1587 if (victim->u.p.count + 2 > (UT)victim->u.p.ub) { 1588 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid 1589 continue; // not enough chunks to steal, goto next victim 1590 } 1591 1592 lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock; 1593 KMP_ASSERT(lck != NULL); 1594 __kmp_acquire_lock(lck, gtid); 1595 limit = victim->u.p.ub; // keep initial ub 1596 if (victim->u.p.count >= limit || 1597 (remaining = limit - victim->u.p.count) < 2) { 1598 __kmp_release_lock(lck, gtid); 1599 pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim 1600 continue; // not enough chunks to steal 1601 } 1602 // stealing succeded, reduce victim's ub by 1/4 of undone chunks 1603 // or by 1 1604 if (remaining > 3) { 1605 KMP_COUNT_VALUE(FOR_static_steal_stolen, remaining >> 2); 1606 init = (victim->u.p.ub -= 1607 (remaining >> 2)); // steal 1/4 of remaining 1608 } else { 1609 KMP_COUNT_VALUE(FOR_static_steal_stolen, 1); 1610 init = 1611 (victim->u.p.ub -= 1); // steal 1 chunk of 2 or 3 remaining 1612 } 1613 __kmp_release_lock(lck, gtid); 1614 1615 
KMP_DEBUG_ASSERT(init + 1 <= limit); 1616 pr->u.p.parm4 = victimIdx; // remember victim to steal from 1617 status = 1; 1618 while_index = 0; 1619 // now update own count and ub with stolen range but init chunk 1620 __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid); 1621 pr->u.p.count = init + 1; 1622 pr->u.p.ub = limit; 1623 __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid); 1624 } // while (search for victim) 1625 } // if (try to find victim and steal) 1626 } else { 1627 // 4-byte induction variable, use 8-byte CAS for pair (count, ub) 1628 typedef union { 1629 struct { 1630 UT count; 1631 T ub; 1632 } p; 1633 kmp_int64 b; 1634 } union_i4; 1635 // All operations on 'count' or 'ub' must be combined atomically 1636 // together. 1637 { 1638 union_i4 vold, vnew; 1639 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 1640 vnew = vold; 1641 vnew.p.count++; 1642 while (!KMP_COMPARE_AND_STORE_ACQ64( 1643 (volatile kmp_int64 *)&pr->u.p.count, 1644 *VOLATILE_CAST(kmp_int64 *) & vold.b, 1645 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 1646 KMP_CPU_PAUSE(); 1647 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 1648 vnew = vold; 1649 vnew.p.count++; 1650 } 1651 vnew = vold; 1652 init = vnew.p.count; 1653 status = (init < (UT)vnew.p.ub); 1654 } 1655 1656 if (!status) { 1657 kmp_info_t **other_threads = team->t.t_threads; 1658 int while_limit = nproc; // nproc attempts to find a victim 1659 int while_index = 0; 1660 1661 // TODO: algorithm of searching for a victim 1662 // should be cleaned up and measured 1663 while ((!status) && (while_limit != ++while_index)) { 1664 union_i4 vold, vnew; 1665 kmp_int32 remaining; 1666 T victimIdx = pr->u.p.parm4; 1667 T oldVictimIdx = victimIdx ? 
victimIdx - 1 : nproc - 1; 1668 dispatch_private_info_template<T> *victim = 1669 reinterpret_cast<dispatch_private_info_template<T> *>( 1670 other_threads[victimIdx] 1671 ->th.th_dispatch->th_dispatch_pr_current); 1672 while ((victim == NULL || victim == pr || 1673 (*(volatile T *)&victim->u.p.static_steal_counter != 1674 *(volatile T *)&pr->u.p.static_steal_counter)) && 1675 oldVictimIdx != victimIdx) { 1676 victimIdx = (victimIdx + 1) % nproc; 1677 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1678 other_threads[victimIdx] 1679 ->th.th_dispatch->th_dispatch_pr_current); 1680 }; 1681 if (!victim || 1682 (*(volatile T *)&victim->u.p.static_steal_counter != 1683 *(volatile T *)&pr->u.p.static_steal_counter)) { 1684 continue; // try once more (nproc attempts in total) 1685 // no victim is ready yet to participate in stealing 1686 // because all victims are still in kmp_init_dispatch 1687 } 1688 pr->u.p.parm4 = victimIdx; // new victim found 1689 while (1) { // CAS loop if victim has enough chunks to steal 1690 vold.b = *(volatile kmp_int64 *)(&victim->u.p.count); 1691 vnew = vold; 1692 1693 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1694 if (vnew.p.count >= (UT)vnew.p.ub || 1695 (remaining = vnew.p.ub - vnew.p.count) < 2) { 1696 pr->u.p.parm4 = 1697 (victimIdx + 1) % nproc; // shift start victim id 1698 break; // not enough chunks to steal, goto next victim 1699 } 1700 if (remaining > 3) { 1701 vnew.p.ub -= (remaining >> 2); // try to steal 1/4 remaining 1702 } else { 1703 vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining 1704 } 1705 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1706 // TODO: Should this be acquire or release? 
1707 if (KMP_COMPARE_AND_STORE_ACQ64( 1708 (volatile kmp_int64 *)&victim->u.p.count, 1709 *VOLATILE_CAST(kmp_int64 *) & vold.b, 1710 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 1711 // stealing succeeded 1712 KMP_COUNT_VALUE(FOR_static_steal_stolen, 1713 vold.p.ub - vnew.p.ub); 1714 status = 1; 1715 while_index = 0; 1716 // now update own count and ub 1717 init = vnew.p.ub; 1718 vold.p.count = init + 1; 1719 #if KMP_ARCH_X86 1720 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), 1721 vold.b); 1722 #else 1723 *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b; 1724 #endif 1725 break; 1726 } // if (check CAS result) 1727 KMP_CPU_PAUSE(); // CAS failed, repeat attempt 1728 } // while (try to steal from particular victim) 1729 } // while (search for victim) 1730 } // if (try to find victim and steal) 1731 } // if (4-byte induction variable) 1732 if (!status) { 1733 *p_lb = 0; 1734 *p_ub = 0; 1735 if (p_st != NULL) 1736 *p_st = 0; 1737 } else { 1738 start = pr->u.p.parm2; 1739 init *= chunk; 1740 limit = chunk + init - 1; 1741 incr = pr->u.p.st; 1742 KMP_COUNT_VALUE(FOR_static_steal_chunks, 1); 1743 1744 KMP_DEBUG_ASSERT(init <= trip); 1745 if ((last = (limit >= trip)) != 0) 1746 limit = trip; 1747 if (p_st != NULL) 1748 *p_st = incr; 1749 1750 if (incr == 1) { 1751 *p_lb = start + init; 1752 *p_ub = start + limit; 1753 } else { 1754 *p_lb = start + init * incr; 1755 *p_ub = start + limit * incr; 1756 } 1757 1758 if (pr->ordered) { 1759 pr->u.p.ordered_lower = init; 1760 pr->u.p.ordered_upper = limit; 1761 #ifdef KMP_DEBUG 1762 { 1763 const char *buff; 1764 // create format specifiers before the debug output 1765 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 1766 "ordered_lower:%%%s ordered_upper:%%%s\n", 1767 traits_t<UT>::spec, traits_t<UT>::spec); 1768 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 1769 pr->u.p.ordered_upper)); 1770 __kmp_str_free(&buff); 1771 } 1772 #endif 1773 } // if 1774 } // if 1775 break; 1776 } // case 1777 #endif // ( 
KMP_STATIC_STEAL_ENABLED ) 1778 case kmp_sch_static_balanced: { 1779 KD_TRACE( 1780 100, 1781 ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid)); 1782 if ((status = !pr->u.p.count) != 1783 0) { /* check if thread has any iteration to do */ 1784 pr->u.p.count = 1; 1785 *p_lb = pr->u.p.lb; 1786 *p_ub = pr->u.p.ub; 1787 last = pr->u.p.parm1; 1788 if (p_st != NULL) 1789 *p_st = pr->u.p.st; 1790 } else { /* no iterations to do */ 1791 pr->u.p.lb = pr->u.p.ub + pr->u.p.st; 1792 } 1793 if (pr->ordered) { 1794 #ifdef KMP_DEBUG 1795 { 1796 const char *buff; 1797 // create format specifiers before the debug output 1798 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 1799 "ordered_lower:%%%s ordered_upper:%%%s\n", 1800 traits_t<UT>::spec, traits_t<UT>::spec); 1801 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 1802 pr->u.p.ordered_upper)); 1803 __kmp_str_free(&buff); 1804 } 1805 #endif 1806 } // if 1807 } // case 1808 break; 1809 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was 1810 merged here */ 1811 case kmp_sch_static_chunked: { 1812 T parm1; 1813 1814 KD_TRACE(100, ("__kmp_dispatch_next: T#%d " 1815 "kmp_sch_static_[affinity|chunked] case\n", 1816 gtid)); 1817 parm1 = pr->u.p.parm1; 1818 1819 trip = pr->u.p.tc - 1; 1820 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid)); 1821 1822 if ((status = (init <= trip)) != 0) { 1823 start = pr->u.p.lb; 1824 incr = pr->u.p.st; 1825 limit = parm1 + init - 1; 1826 1827 if ((last = (limit >= trip)) != 0) 1828 limit = trip; 1829 1830 if (p_st != NULL) 1831 *p_st = incr; 1832 1833 pr->u.p.count += th->th.th_team_nproc; 1834 1835 if (incr == 1) { 1836 *p_lb = start + init; 1837 *p_ub = start + limit; 1838 } else { 1839 *p_lb = start + init * incr; 1840 *p_ub = start + limit * incr; 1841 } 1842 1843 if (pr->ordered) { 1844 pr->u.p.ordered_lower = init; 1845 pr->u.p.ordered_upper = limit; 1846 #ifdef KMP_DEBUG 1847 { 1848 const char *buff; 1849 // create format specifiers before 
the debug output 1850 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 1851 "ordered_lower:%%%s ordered_upper:%%%s\n", 1852 traits_t<UT>::spec, traits_t<UT>::spec); 1853 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 1854 pr->u.p.ordered_upper)); 1855 __kmp_str_free(&buff); 1856 } 1857 #endif 1858 } // if 1859 } // if 1860 } // case 1861 break; 1862 1863 case kmp_sch_dynamic_chunked: { 1864 T chunk = pr->u.p.parm1; 1865 1866 KD_TRACE( 1867 100, 1868 ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid)); 1869 1870 init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 1871 trip = pr->u.p.tc - 1; 1872 1873 if ((status = (init <= trip)) == 0) { 1874 *p_lb = 0; 1875 *p_ub = 0; 1876 if (p_st != NULL) 1877 *p_st = 0; 1878 } else { 1879 start = pr->u.p.lb; 1880 limit = chunk + init - 1; 1881 incr = pr->u.p.st; 1882 1883 if ((last = (limit >= trip)) != 0) 1884 limit = trip; 1885 1886 if (p_st != NULL) 1887 *p_st = incr; 1888 1889 if (incr == 1) { 1890 *p_lb = start + init; 1891 *p_ub = start + limit; 1892 } else { 1893 *p_lb = start + init * incr; 1894 *p_ub = start + limit * incr; 1895 } 1896 1897 if (pr->ordered) { 1898 pr->u.p.ordered_lower = init; 1899 pr->u.p.ordered_upper = limit; 1900 #ifdef KMP_DEBUG 1901 { 1902 const char *buff; 1903 // create format specifiers before the debug output 1904 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 1905 "ordered_lower:%%%s ordered_upper:%%%s\n", 1906 traits_t<UT>::spec, traits_t<UT>::spec); 1907 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 1908 pr->u.p.ordered_upper)); 1909 __kmp_str_free(&buff); 1910 } 1911 #endif 1912 } // if 1913 } // if 1914 } // case 1915 break; 1916 1917 case kmp_sch_guided_iterative_chunked: { 1918 T chunkspec = pr->u.p.parm1; 1919 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked " 1920 "iterative case\n", 1921 gtid)); 1922 trip = pr->u.p.tc; 1923 // Start atomic part of calculations 1924 while (1) { 1925 ST remaining; // signed, 
because can be < 0 1926 init = sh->u.s.iteration; // shared value 1927 remaining = trip - init; 1928 if (remaining <= 0) { // AC: need to compare with 0 first 1929 // nothing to do, don't try atomic op 1930 status = 0; 1931 break; 1932 } 1933 if ((T)remaining < 1934 pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default 1935 // use dynamic-style shcedule 1936 // atomically inrement iterations, get old value 1937 init = test_then_add<ST>((ST *)&sh->u.s.iteration, (ST)chunkspec); 1938 remaining = trip - init; 1939 if (remaining <= 0) { 1940 status = 0; // all iterations got by other threads 1941 } else { // got some iterations to work on 1942 status = 1; 1943 if ((T)remaining > chunkspec) { 1944 limit = init + chunkspec - 1; 1945 } else { 1946 last = 1; // the last chunk 1947 limit = init + remaining - 1; 1948 } // if 1949 } // if 1950 break; 1951 } // if 1952 limit = init + (UT)(remaining * 1953 *(double *)&pr->u.p.parm3); // divide by K*nproc 1954 if (compare_and_swap<ST>((ST *)&sh->u.s.iteration, (ST)init, 1955 (ST)limit)) { 1956 // CAS was successful, chunk obtained 1957 status = 1; 1958 --limit; 1959 break; 1960 } // if 1961 } // while 1962 if (status != 0) { 1963 start = pr->u.p.lb; 1964 incr = pr->u.p.st; 1965 if (p_st != NULL) 1966 *p_st = incr; 1967 *p_lb = start + init * incr; 1968 *p_ub = start + limit * incr; 1969 if (pr->ordered) { 1970 pr->u.p.ordered_lower = init; 1971 pr->u.p.ordered_upper = limit; 1972 #ifdef KMP_DEBUG 1973 { 1974 const char *buff; 1975 // create format specifiers before the debug output 1976 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 1977 "ordered_lower:%%%s ordered_upper:%%%s\n", 1978 traits_t<UT>::spec, traits_t<UT>::spec); 1979 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 1980 pr->u.p.ordered_upper)); 1981 __kmp_str_free(&buff); 1982 } 1983 #endif 1984 } // if 1985 } else { 1986 *p_lb = 0; 1987 *p_ub = 0; 1988 if (p_st != NULL) 1989 *p_st = 0; 1990 } // if 1991 } // case 1992 break; 1993 1994 case 
kmp_sch_guided_analytical_chunked: { 1995 T chunkspec = pr->u.p.parm1; 1996 UT chunkIdx; 1997 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1998 /* for storing original FPCW value for Windows* OS on 1999 IA-32 architecture 8-byte version */ 2000 unsigned int oldFpcw; 2001 unsigned int fpcwSet = 0; 2002 #endif 2003 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked " 2004 "analytical case\n", 2005 gtid)); 2006 2007 trip = pr->u.p.tc; 2008 2009 KMP_DEBUG_ASSERT(th->th.th_team_nproc > 1); 2010 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)th->th.th_team_nproc < 2011 trip); 2012 2013 while (1) { /* this while loop is a safeguard against unexpected zero 2014 chunk sizes */ 2015 chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 2016 if (chunkIdx >= (UT)pr->u.p.parm2) { 2017 --trip; 2018 /* use dynamic-style scheduling */ 2019 init = chunkIdx * chunkspec + pr->u.p.count; 2020 /* need to verify init > 0 in case of overflow in the above 2021 * calculation */ 2022 if ((status = (init > 0 && init <= trip)) != 0) { 2023 limit = init + chunkspec - 1; 2024 2025 if ((last = (limit >= trip)) != 0) 2026 limit = trip; 2027 } 2028 break; 2029 } else { 2030 /* use exponential-style scheduling */ 2031 /* The following check is to workaround the lack of long double precision on 2032 Windows* OS. 2033 This check works around the possible effect that init != 0 for chunkIdx == 0. 
2034 */ 2035 #if KMP_OS_WINDOWS && KMP_ARCH_X86 2036 /* If we haven't already done so, save original FPCW and set 2037 precision to 64-bit, as Windows* OS on IA-32 architecture 2038 defaults to 53-bit */ 2039 if (!fpcwSet) { 2040 oldFpcw = _control87(0, 0); 2041 _control87(_PC_64, _MCW_PC); 2042 fpcwSet = 0x30000; 2043 } 2044 #endif 2045 if (chunkIdx) { 2046 init = __kmp_dispatch_guided_remaining<T>( 2047 trip, *(DBL *)&pr->u.p.parm3, chunkIdx); 2048 KMP_DEBUG_ASSERT(init); 2049 init = trip - init; 2050 } else 2051 init = 0; 2052 limit = trip - __kmp_dispatch_guided_remaining<T>( 2053 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1); 2054 KMP_ASSERT(init <= limit); 2055 if (init < limit) { 2056 KMP_DEBUG_ASSERT(limit <= trip); 2057 --limit; 2058 status = 1; 2059 break; 2060 } // if 2061 } // if 2062 } // while (1) 2063 #if KMP_OS_WINDOWS && KMP_ARCH_X86 2064 /* restore FPCW if necessary 2065 AC: check fpcwSet flag first because oldFpcw can be uninitialized 2066 here */ 2067 if (fpcwSet && (oldFpcw & fpcwSet)) 2068 _control87(oldFpcw, _MCW_PC); 2069 #endif 2070 if (status != 0) { 2071 start = pr->u.p.lb; 2072 incr = pr->u.p.st; 2073 if (p_st != NULL) 2074 *p_st = incr; 2075 *p_lb = start + init * incr; 2076 *p_ub = start + limit * incr; 2077 if (pr->ordered) { 2078 pr->u.p.ordered_lower = init; 2079 pr->u.p.ordered_upper = limit; 2080 #ifdef KMP_DEBUG 2081 { 2082 const char *buff; 2083 // create format specifiers before the debug output 2084 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 2085 "ordered_lower:%%%s ordered_upper:%%%s\n", 2086 traits_t<UT>::spec, traits_t<UT>::spec); 2087 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 2088 pr->u.p.ordered_upper)); 2089 __kmp_str_free(&buff); 2090 } 2091 #endif 2092 } 2093 } else { 2094 *p_lb = 0; 2095 *p_ub = 0; 2096 if (p_st != NULL) 2097 *p_st = 0; 2098 } 2099 } // case 2100 break; 2101 2102 case kmp_sch_trapezoidal: { 2103 UT index; 2104 T parm2 = pr->u.p.parm2; 2105 T parm3 = pr->u.p.parm3; 2106 T parm4 = 
pr->u.p.parm4; 2107 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n", 2108 gtid)); 2109 2110 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration); 2111 2112 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2; 2113 trip = pr->u.p.tc - 1; 2114 2115 if ((status = ((T)index < parm3 && init <= trip)) == 0) { 2116 *p_lb = 0; 2117 *p_ub = 0; 2118 if (p_st != NULL) 2119 *p_st = 0; 2120 } else { 2121 start = pr->u.p.lb; 2122 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1; 2123 incr = pr->u.p.st; 2124 2125 if ((last = (limit >= trip)) != 0) 2126 limit = trip; 2127 2128 if (p_st != NULL) 2129 *p_st = incr; 2130 2131 if (incr == 1) { 2132 *p_lb = start + init; 2133 *p_ub = start + limit; 2134 } else { 2135 *p_lb = start + init * incr; 2136 *p_ub = start + limit * incr; 2137 } 2138 2139 if (pr->ordered) { 2140 pr->u.p.ordered_lower = init; 2141 pr->u.p.ordered_upper = limit; 2142 #ifdef KMP_DEBUG 2143 { 2144 const char *buff; 2145 // create format specifiers before the debug output 2146 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 2147 "ordered_lower:%%%s ordered_upper:%%%s\n", 2148 traits_t<UT>::spec, traits_t<UT>::spec); 2149 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 2150 pr->u.p.ordered_upper)); 2151 __kmp_str_free(&buff); 2152 } 2153 #endif 2154 } // if 2155 } // if 2156 } // case 2157 break; 2158 default: { 2159 status = 0; // to avoid complaints on uninitialized variable use 2160 __kmp_msg(kmp_ms_fatal, // Severity 2161 KMP_MSG(UnknownSchedTypeDetected), // Primary message 2162 KMP_HNT(GetNewerLibrary), // Hint 2163 __kmp_msg_null // Variadic argument list terminator 2164 ); 2165 } break; 2166 } // switch 2167 } // if tc == 0; 2168 2169 if (status == 0) { 2170 UT num_done; 2171 2172 num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done); 2173 #ifdef KMP_DEBUG 2174 { 2175 const char *buff; 2176 // create format specifiers before the debug output 2177 buff = __kmp_str_format( 2178 
"__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", 2179 traits_t<UT>::spec); 2180 KD_TRACE(100, (buff, gtid, sh->u.s.num_done)); 2181 __kmp_str_free(&buff); 2182 } 2183 #endif 2184 2185 if ((ST)num_done == th->th.th_team_nproc - 1) { 2186 #if (KMP_STATIC_STEAL_ENABLED) 2187 if (pr->schedule == kmp_sch_static_steal && 2188 traits_t<T>::type_size > 4) { 2189 int i; 2190 kmp_info_t **other_threads = team->t.t_threads; 2191 // loop complete, safe to destroy locks used for stealing 2192 for (i = 0; i < th->th.th_team_nproc; ++i) { 2193 kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock; 2194 KMP_ASSERT(lck != NULL); 2195 __kmp_destroy_lock(lck); 2196 __kmp_free(lck); 2197 other_threads[i]->th.th_dispatch->th_steal_lock = NULL; 2198 } 2199 } 2200 #endif 2201 /* NOTE: release this buffer to be reused */ 2202 2203 KMP_MB(); /* Flush all pending memory write invalidates. */ 2204 2205 sh->u.s.num_done = 0; 2206 sh->u.s.iteration = 0; 2207 2208 /* TODO replace with general release procedure? */ 2209 if (pr->ordered) { 2210 sh->u.s.ordered_iteration = 0; 2211 } 2212 2213 KMP_MB(); /* Flush all pending memory write invalidates. */ 2214 2215 sh->buffer_index += __kmp_dispatch_num_buffers; 2216 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", 2217 gtid, sh->buffer_index)); 2218 2219 KMP_MB(); /* Flush all pending memory write invalidates. 
                */
    } // if
    if (__kmp_env_consistency_check) {
      if (pr->pushed_ws != ct_none) {
        // loop is complete: pop the workshare record pushed at init time
        pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
      }
    }

    // detach this thread from the (now finished) dispatch buffers
    th->th.th_dispatch->th_deo_fcn = NULL;
    th->th.th_dispatch->th_dxo_fcn = NULL;
    th->th.th_dispatch->th_dispatch_sh_current = NULL;
    th->th.th_dispatch->th_dispatch_pr_current = NULL;
  } // if (status == 0)
#if KMP_OS_WINDOWS
  else if (last) {
    pr->u.p.last_upper = pr->u.p.ub;
  }
#endif /* KMP_OS_WINDOWS */
  if (p_last != NULL && status != 0)
    *p_last = last;
} // if

#ifdef KMP_DEBUG
  {
    const char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_next: T#%%d normal case: "
        "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status));
    __kmp_str_free(&buff);
  }
#endif
#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_NEXT();
#endif
  OMPT_LOOP_END;
  return status;
}

// Compute this team's share of a "distribute" loop's iteration space.
// Splits the global range [*plower, *pupper] (with increment incr) across the
// teams of the enclosing teams construct and narrows *plower/*pupper in place
// to the sub-range owned by the calling team.
//   loc       - source location (used for consistency-check diagnostics)
//   gtid      - global thread id
//   plastiter - out: nonzero iff this team owns the last iteration
//   plower    - in/out: loop lower bound, narrowed to this team's range
//   pupper    - in/out: loop upper bound, narrowed to this team's range
//   incr      - loop increment; must be nonzero
template <typename T>
static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
                                  kmp_int32 *plastiter, T *plower, T *pupper,
                                  typename traits_t<T>::signed_t incr) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  register kmp_uint32 team_id;
  register kmp_uint32 nteams;
  register UT trip_count;
  register kmp_team_t *team;
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
#ifdef KMP_DEBUG
  {
    const char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
                            "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<T>::spec);
    KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
    __kmp_str_free(&buff);
  }
#endif

  if (__kmp_env_consistency_check) {
    if (incr == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
                            loc);
    }
    if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
      // The loop is illegal.
      // Some zero-trip loops maintained by compiler, e.g.:
      //   for(i=10;i<0;++i) // lower >= upper - run-time check
      //   for(i=0;i>10;--i) // lower <= upper - run-time check
      //   for(i=0;i>10;++i) // incr > 0     - compile-time check
      //   for(i=10;i<0;--i) // incr < 0     - compile-time check
      // Compiler does not check the following illegal loops:
      //   for(i=0;i<10;i+=incr) // where incr<0
      //   for(i=10;i>0;i-=incr) // where incr<0
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
    }
  }
  th = __kmp_threads[gtid];
  team = th->th.th_team;
#if OMP_40_ENABLED
  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
  nteams = th->th.th_teams_size.nteams;
#endif
  // NOTE(review): nteams is assigned only under OMP_40_ENABLED yet is read
  // unconditionally below — presumably this routine is only reachable when
  // OMP 4.0 (teams/distribute) support is compiled in; confirm.
  team_id = team->t.t_master_tid;
  KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);

  // compute global trip count
  if (incr == 1) {
    trip_count = *pupper - *plower + 1;
  } else if (incr == -1) {
    trip_count = *plower - *pupper + 1;
  } else if (incr > 0) {
    // upper-lower can exceed the limit of signed type
    trip_count = (UT)(*pupper - *plower) / incr + 1;
  } else {
    trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
  }

  if (trip_count <= nteams) {
    KMP_DEBUG_ASSERT(
        __kmp_static == kmp_sch_static_greedy ||
        __kmp_static ==
            kmp_sch_static_balanced); // Unknown static scheduling type.
    // only some teams get single iteration, others get nothing
    if (team_id < trip_count) {
      *pupper = *plower = *plower + team_id * incr;
    } else {
      *plower = *pupper + incr; // zero-trip loop
    }
    if (plastiter != NULL)
      *plastiter = (team_id == trip_count - 1);
  } else {
    if (__kmp_static == kmp_sch_static_balanced) {
      // balanced: first 'extras' teams get (chunk + 1) iterations each
      register UT chunk = trip_count / nteams;
      register UT extras = trip_count % nteams;
      *plower +=
          incr * (team_id * chunk + (team_id < extras ? team_id : extras));
      *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
      if (plastiter != NULL)
        *plastiter = (team_id == nteams - 1);
    } else {
      // greedy: equal chunks of ceil(trip_count / nteams) iterations
      register T chunk_inc_count =
          (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
      register T upper = *pupper;
      KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
      // Unknown static scheduling type.
      *plower += team_id * chunk_inc_count;
      *pupper = *plower + chunk_inc_count - incr;
      // Check/correct bounds if needed (the per-team upper bound computation
      // can overflow past the global bound for the last teams)
      if (incr > 0) {
        if (*pupper < *plower)
          *pupper = traits_t<T>::max_value;
        if (plastiter != NULL)
          *plastiter = *plower <= upper && *pupper > upper - incr;
        if (*pupper > upper)
          *pupper = upper; // tracker C73258
      } else {
        if (*pupper > *plower)
          *pupper = traits_t<T>::min_value;
        if (plastiter != NULL)
          *plastiter = *plower >= upper && *pupper < upper - incr;
        if (*pupper < upper)
          *pupper = upper; // tracker C73258
      }
    }
  }
}

//-----------------------------------------------------------------------------
// Dispatch routines
// Transfer call to template< type T >
// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
//                      T lb, T ub, ST st, ST chunk )
extern "C" {

/*!
@ingroup WORK_SHARING
@{
@param loc Source location
@param gtid Global thread id
@param schedule Schedule type
@param lb Lower bound
@param ub Upper bound
@param st Step (or increment if you prefer)
@param chunk The chunk size to block with

This function prepares the runtime to start a dynamically scheduled for loop,
saving the loop arguments.
These functions are all identical apart from the types of the arguments.
*/
void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int32 lb,
                            kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  // forward to the type-templated implementation (signed 32-bit bounds)
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint32 lb,
                             kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  // unsigned 32-bit bounds
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int64 lb,
                            kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  // signed 64-bit bounds
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint64 lb,
                             kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  // unsigned 64-bit bounds
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4

Difference from __kmpc_dispatch_init set of functions is these functions
are called for composite distribute parallel for construct. Thus before
regular iterations dispatching we need to calc per-team iteration space.

These functions are all identical apart from the types of the arguments.
*/
void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
                                 kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  // first narrow [lb, ub] to this team's share of the iteration space,
  // then set up regular dispatching over the per-team range
  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
                                  kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
                                 kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
                                  kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
@param loc Source code location
@param gtid Global thread id
@param p_last Pointer to a flag set to one if this is the last chunk or zero
otherwise
@param p_lb Pointer to the lower bound for the next chunk of work
@param p_ub Pointer to the upper bound for the next chunk of work
@param p_st Pointer to the stride for the next chunk of work
@return one if there is work to be done, zero otherwise

Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then the lb,ub and stride need not be modified.
*/
int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
  // forward to the type-templated implementation (signed 32-bit bounds)
  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st);
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint32 *p_lb, kmp_uint32 *p_ub,
                            kmp_int32 *p_st) {
  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st);
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st);
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint64 *p_lb, kmp_uint64 *p_ub,
                            kmp_int64 *p_st) {
  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st);
}

/*!
@param loc Source code location
@param gtid Global thread id

Mark the end of a dynamic loop.
*/
void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}
/*! @} */

//-----------------------------------------------------------------------------
// Non-template routines from kmp_dispatch.cpp used in other sources

// Simple 32-bit comparison predicates, usable as the 'pred' argument of
// __kmp_wait_yield_4 below. Each returns nonzero when the condition holds.
kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value == checker;
}

kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value != checker;
}

kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
  return value < checker;
}

kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
  return value >= checker;
}

kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
  return value <= checker;
}

// Spin until pred(*spinner, checker) holds, yielding when appropriate;
// returns the observed value of *spinner that satisfied the predicate.
kmp_uint32
__kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
                   kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
                   void *obj // Higher-level synchronization object, or NULL.
                   ) {
  // note: we may not belong to a team at this point
  register volatile kmp_uint32 *spin = spinner;
  register kmp_uint32 check = checker;
  register kmp_uint32 spins;
  register kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
  register kmp_uint32 r;

  KMP_FSYNC_SPIN_INIT(obj, (void *)spin);
  KMP_INIT_YIELD(spins);
  // main wait spin loop: re-read *spin each iteration until pred is satisfied
  while (!f(r = TCR_4(*spin), check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* GEH - remove this since it was accidentally introduced when kmp_wait was
       split. It causes problems with infinite recursion because of exit lock */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
        __kmp_abort_thread(); */

    /* if we have waited a bit, or are oversubscribed, yield */
    /* pause is in the following code */
    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
    KMP_YIELD_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  // return the value that satisfied the predicate
  return r;
}

// Variant of __kmp_wait_yield_4 whose predicate takes the spin location as an
// opaque pointer; no value is returned.
void __kmp_wait_yield_4_ptr(
    void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32),
    void *obj // Higher-level synchronization object, or NULL.
    ) {
  // note: we may not belong to a team at this point
  register void *spin = spinner;
  register kmp_uint32 check = checker;
  register kmp_uint32 spins;
  register kmp_uint32 (*f)(void *, kmp_uint32) = pred;

  KMP_FSYNC_SPIN_INIT(obj, spin);
  KMP_INIT_YIELD(spins);
  // main wait spin loop: the predicate itself dereferences 'spin'
  while (!f(spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* if we have waited a bit, or are oversubscribed, yield */
    /* pause is in the following code */
    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
    KMP_YIELD_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
}

} // extern "C"

#ifdef KMP_GOMP_COMPAT

// GOMP-compatibility entry points: same as __kmpc_dispatch_init_* but let the
// caller control whether a workshare record is pushed (push_ws).
void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int32 lb,
                               kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint32 lb,
                                kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int64 lb,
                               kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint64 lb,
                                kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

// GOMP-compatibility chunk-finish entry points (per element type).
void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void
__kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
  // 64-bit variant of the GOMP-compatibility chunk-finish entry point
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

#endif /* KMP_GOMP_COMPAT */

/* ------------------------------------------------------------------------ */