1 /* 2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch. 3 */ 4 5 //===----------------------------------------------------------------------===// 6 // 7 // The LLVM Compiler Infrastructure 8 // 9 // This file is dual licensed under the MIT and the University of Illinois Open 10 // Source Licenses. See LICENSE.txt for details. 11 // 12 //===----------------------------------------------------------------------===// 13 14 /* Dynamic scheduling initialization and dispatch. 15 * 16 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however 17 * it may change values between parallel regions. __kmp_max_nth 18 * is the largest value __kmp_nth may take, 1 is the smallest. 19 */ 20 21 // Need to raise Win version from XP to Vista here for support of 22 // InterlockedExchange64 23 #if defined(_WIN32_WINNT) && defined(_M_IX86) 24 #undef _WIN32_WINNT 25 #define _WIN32_WINNT 0x0502 26 #endif 27 28 #include "kmp.h" 29 #include "kmp_error.h" 30 #include "kmp_i18n.h" 31 #include "kmp_itt.h" 32 #include "kmp_stats.h" 33 #include "kmp_str.h" 34 #if KMP_OS_WINDOWS && KMP_ARCH_X86 35 #include <float.h> 36 #endif 37 38 #if OMPT_SUPPORT 39 #include "ompt-specific.h" 40 #endif 41 42 /* ------------------------------------------------------------------------ */ 43 44 #if KMP_STATIC_STEAL_ENABLED 45 46 // replaces dispatch_private_info{32,64} structures and 47 // dispatch_private_info{32,64}_t types 48 template <typename T> struct dispatch_private_infoXX_template { 49 typedef typename traits_t<T>::unsigned_t UT; 50 typedef typename traits_t<T>::signed_t ST; 51 UT count; // unsigned 52 T ub; 53 /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */ 54 T lb; 55 ST st; // signed 56 UT tc; // unsigned 57 T static_steal_counter; // for static_steal only; maybe better to put after ub 58 59 /* parm[1-4] are used in different ways by different scheduling algorithms */ 60 61 // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on ) 62 // a) parm3 is properly aligned and 63 // b) all parm1-4 are in the same cache line. 64 // Because of parm1-4 are used together, performance seems to be better 65 // if they are in the same line (not measured though). 
66 67 struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4 68 T parm1; 69 T parm2; 70 T parm3; 71 T parm4; 72 }; 73 74 UT ordered_lower; // unsigned 75 UT ordered_upper; // unsigned 76 #if KMP_OS_WINDOWS 77 T last_upper; 78 #endif /* KMP_OS_WINDOWS */ 79 }; 80 81 #else /* KMP_STATIC_STEAL_ENABLED */ 82 83 // replaces dispatch_private_info{32,64} structures and 84 // dispatch_private_info{32,64}_t types 85 template <typename T> struct dispatch_private_infoXX_template { 86 typedef typename traits_t<T>::unsigned_t UT; 87 typedef typename traits_t<T>::signed_t ST; 88 T lb; 89 T ub; 90 ST st; // signed 91 UT tc; // unsigned 92 93 T parm1; 94 T parm2; 95 T parm3; 96 T parm4; 97 98 UT count; // unsigned 99 100 UT ordered_lower; // unsigned 101 UT ordered_upper; // unsigned 102 #if KMP_OS_WINDOWS 103 T last_upper; 104 #endif /* KMP_OS_WINDOWS */ 105 }; 106 107 #endif /* KMP_STATIC_STEAL_ENABLED */ 108 109 // replaces dispatch_private_info structure and dispatch_private_info_t type 110 template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template { 111 // duplicate alignment here, otherwise size of structure is not correct in our 112 // compiler 113 union KMP_ALIGN_CACHE private_info_tmpl { 114 dispatch_private_infoXX_template<T> p; 115 dispatch_private_info64_t p64; 116 } u; 117 enum sched_type schedule; /* scheduling algorithm */ 118 kmp_uint32 ordered; /* ordered clause specified */ 119 kmp_uint32 ordered_bumped; 120 // To retain the structure size after making ordered_iteration scalar 121 kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3]; 122 dispatch_private_info *next; /* stack of buffers for nest of serial regions */ 123 kmp_uint32 nomerge; /* don't merge iters if serialized */ 124 kmp_uint32 type_size; 125 enum cons_type pushed_ws; 126 }; 127 128 // replaces dispatch_shared_info{32,64} structures and 129 // dispatch_shared_info{32,64}_t types 130 template <typename UT> struct dispatch_shared_infoXX_template { 131 /* chunk index under dynamic, number of idle threads under static-steal; 132 iteration index otherwise */ 133 volatile UT iteration; 134 volatile UT num_done; 135 volatile UT ordered_iteration; 136 // to retain the structure size making ordered_iteration scalar 137 UT ordered_dummy[KMP_MAX_ORDERED - 3]; 138 }; 139 140 // replaces dispatch_shared_info structure and dispatch_shared_info_t type 141 template <typename UT> struct dispatch_shared_info_template { 142 // we need union here to keep the structure size 143 union shared_info_tmpl { 144 dispatch_shared_infoXX_template<UT> s; 145 dispatch_shared_info64_t s64; 146 } u; 147 volatile kmp_uint32 buffer_index; 148 #if OMP_45_ENABLED 149 volatile kmp_int32 doacross_buf_idx; // teamwise index 150 kmp_uint32 *doacross_flags; // array of iteration flags (0/1) 151 kmp_int32 doacross_num_done; // count finished threads 152 #endif 153 #if KMP_USE_HWLOC 154 // When linking with libhwloc, the ORDERED EPCC test slowsdown on big 155 // machines (> 48 cores). Performance analysis showed that a cache thrash 156 // was occurring and this padding helps alleviate the problem. 
157 char padding[64]; 158 #endif 159 }; 160 161 /* ------------------------------------------------------------------------ */ 162 163 #undef USE_TEST_LOCKS 164 165 // test_then_add template (general template should NOT be used) 166 template <typename T> static __forceinline T test_then_add(volatile T *p, T d); 167 168 template <> 169 __forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p, 170 kmp_int32 d) { 171 kmp_int32 r; 172 r = KMP_TEST_THEN_ADD32(p, d); 173 return r; 174 } 175 176 template <> 177 __forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p, 178 kmp_int64 d) { 179 kmp_int64 r; 180 r = KMP_TEST_THEN_ADD64(p, d); 181 return r; 182 } 183 184 // test_then_inc_acq template (general template should NOT be used) 185 template <typename T> static __forceinline T test_then_inc_acq(volatile T *p); 186 187 template <> 188 __forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) { 189 kmp_int32 r; 190 r = KMP_TEST_THEN_INC_ACQ32(p); 191 return r; 192 } 193 194 template <> 195 __forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) { 196 kmp_int64 r; 197 r = KMP_TEST_THEN_INC_ACQ64(p); 198 return r; 199 } 200 201 // test_then_inc template (general template should NOT be used) 202 template <typename T> static __forceinline T test_then_inc(volatile T *p); 203 204 template <> 205 __forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) { 206 kmp_int32 r; 207 r = KMP_TEST_THEN_INC32(p); 208 return r; 209 } 210 211 template <> 212 __forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) { 213 kmp_int64 r; 214 r = KMP_TEST_THEN_INC64(p); 215 return r; 216 } 217 218 // compare_and_swap template (general template should NOT be used) 219 template <typename T> 220 static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s); 221 222 template <> 223 __forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p, 224 kmp_int32 c, kmp_int32 s) { 225 return KMP_COMPARE_AND_STORE_REL32(p, c, s); 226 } 227 228 template <> 229 __forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p, 230 kmp_int64 c, kmp_int64 s) { 231 return KMP_COMPARE_AND_STORE_REL64(p, c, s); 232 } 233 234 /* Spin wait loop that first does pause, then yield. 235 Waits until the predicate returns non-zero when called with *spinner and check. 236 Does NOT put threads to sleep. 237 Arguments: 238 UT is unsigned 4- or 8-byte type 239 spinner - memory location to check value 240 checker - value to compare *spinner against (>, <, ==, etc.) 241 pred - predicate function to perform binary comparison of some sort 242 #if USE_ITT_BUILD 243 obj -- higher-level synchronization object to report to ittnotify. 244 It is used to report locks consistently. For example, if the lock is 245 acquired immediately, its address is reported to ittnotify via 246 KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired immediately 247 and the lock routine calls KMP_WAIT_YIELD(), the latter should report the 248 same address, not the address of the low-level spinner.
249 #endif // USE_ITT_BUILD 250 TODO: make inline function (move to header file for icl) 251 */ 252 template <typename UT> 253 static UT __kmp_wait_yield(volatile UT *spinner, UT checker, 254 kmp_uint32 (*pred)(UT, UT) 255 USE_ITT_BUILD_ARG(void *obj)) { 256 // note: we may not belong to a team at this point 257 volatile UT *spin = spinner; 258 UT check = checker; 259 kmp_uint32 spins; 260 kmp_uint32 (*f)(UT, UT) = pred; 261 UT r; 262 263 KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin)); 264 KMP_INIT_YIELD(spins); 265 // main wait spin loop 266 while (!f(r = *spin, check)) { 267 KMP_FSYNC_SPIN_PREPARE(obj); 268 /* GEH - remove this since it was accidentally introduced when kmp_wait was 269 split. It causes problems with infinite recursion because of exit lock */ 270 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 271 __kmp_abort_thread(); */ 272 273 // if we are oversubscribed, or have waited a bit (and 274 // KMP_LIBRARY=throughput, then yield. pause is in the following code 275 KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc); 276 KMP_YIELD_SPIN(spins); 277 } 278 KMP_FSYNC_SPIN_ACQUIRED(obj); 279 return r; 280 } 281 282 template <typename UT> static kmp_uint32 __kmp_eq(UT value, UT checker) { 283 return value == checker; 284 } 285 286 template <typename UT> static kmp_uint32 __kmp_neq(UT value, UT checker) { 287 return value != checker; 288 } 289 290 template <typename UT> static kmp_uint32 __kmp_lt(UT value, UT checker) { 291 return value < checker; 292 } 293 294 template <typename UT> static kmp_uint32 __kmp_ge(UT value, UT checker) { 295 return value >= checker; 296 } 297 298 template <typename UT> static kmp_uint32 __kmp_le(UT value, UT checker) { 299 return value <= checker; 300 } 301 302 /* ------------------------------------------------------------------------ */ 303 304 static void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, 305 ident_t *loc_ref) { 306 kmp_info_t *th; 307 308 KMP_DEBUG_ASSERT(gtid_ref); 309 310 if (__kmp_env_consistency_check) { 311 th = __kmp_threads[*gtid_ref]; 312 if (th->th.th_root->r.r_active && 313 (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) { 314 #if KMP_USE_DYNAMIC_LOCK 315 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0); 316 #else 317 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL); 318 #endif 319 } 320 } 321 } 322 323 template <typename UT> 324 static void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 325 typedef typename traits_t<UT>::signed_t ST; 326 dispatch_private_info_template<UT> *pr; 327 328 int gtid = *gtid_ref; 329 // int cid = *cid_ref; 330 kmp_info_t *th = __kmp_threads[gtid]; 331 KMP_DEBUG_ASSERT(th->th.th_dispatch); 332 333 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid)); 334 if (__kmp_env_consistency_check) { 335 pr = reinterpret_cast<dispatch_private_info_template<UT> *>( 336 th->th.th_dispatch->th_dispatch_pr_current); 337 if (pr->pushed_ws != ct_none) { 338 #if KMP_USE_DYNAMIC_LOCK 339 __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0); 340 #else 341 __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL); 342 #endif 343 } 344 } 345 346 if (!th->th.th_team->t.t_serialized) { 347 dispatch_shared_info_template<UT> *sh = 348 reinterpret_cast<dispatch_shared_info_template<UT> *>( 349 th->th.th_dispatch->th_dispatch_sh_current); 350 UT lower; 351 352 if (!__kmp_env_consistency_check) { 353 pr = reinterpret_cast<dispatch_private_info_template<UT> *>( 354 th->th.th_dispatch->th_dispatch_pr_current); 355 } 356 lower = 
pr->u.p.ordered_lower; 357 358 #if !defined(KMP_GOMP_COMPAT) 359 if (__kmp_env_consistency_check) { 360 if (pr->ordered_bumped) { 361 struct cons_header *p = __kmp_threads[gtid]->th.th_cons; 362 __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting, 363 ct_ordered_in_pdo, loc_ref, 364 &p->stack_data[p->w_top]); 365 } 366 } 367 #endif /* !defined(KMP_GOMP_COMPAT) */ 368 369 KMP_MB(); 370 #ifdef KMP_DEBUG 371 { 372 char *buff; 373 // create format specifiers before the debug output 374 buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: " 375 "ordered_iter:%%%s lower:%%%s\n", 376 traits_t<UT>::spec, traits_t<UT>::spec); 377 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 378 __kmp_str_free(&buff); 379 } 380 #endif 381 382 __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower, 383 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 384 KMP_MB(); /* is this necessary? */ 385 #ifdef KMP_DEBUG 386 { 387 char *buff; 388 // create format specifiers before the debug output 389 buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: " 390 "ordered_iter:%%%s lower:%%%s\n", 391 traits_t<UT>::spec, traits_t<UT>::spec); 392 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 393 __kmp_str_free(&buff); 394 } 395 #endif 396 } 397 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid)); 398 } 399 400 static void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, 401 ident_t *loc_ref) { 402 kmp_info_t *th; 403 404 if (__kmp_env_consistency_check) { 405 th = __kmp_threads[*gtid_ref]; 406 if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) { 407 __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref); 408 } 409 } 410 } 411 412 template <typename UT> 413 static void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 414 typedef typename traits_t<UT>::signed_t ST; 415 dispatch_private_info_template<UT> *pr; 416 417 int gtid = *gtid_ref; 418 // int cid = *cid_ref; 419 kmp_info_t *th = __kmp_threads[gtid]; 420 KMP_DEBUG_ASSERT(th->th.th_dispatch); 421 422 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid)); 423 if (__kmp_env_consistency_check) { 424 pr = reinterpret_cast<dispatch_private_info_template<UT> *>( 425 th->th.th_dispatch->th_dispatch_pr_current); 426 if (pr->pushed_ws != ct_none) { 427 __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref); 428 } 429 } 430 431 if (!th->th.th_team->t.t_serialized) { 432 dispatch_shared_info_template<UT> *sh = 433 reinterpret_cast<dispatch_shared_info_template<UT> *>( 434 th->th.th_dispatch->th_dispatch_sh_current); 435 436 if (!__kmp_env_consistency_check) { 437 pr = reinterpret_cast<dispatch_private_info_template<UT> *>( 438 th->th.th_dispatch->th_dispatch_pr_current); 439 } 440 441 KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration)); 442 #if !defined(KMP_GOMP_COMPAT) 443 if (__kmp_env_consistency_check) { 444 if (pr->ordered_bumped != 0) { 445 struct cons_header *p = __kmp_threads[gtid]->th.th_cons; 446 /* How to test it? - OM */ 447 __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting, 448 ct_ordered_in_pdo, loc_ref, 449 &p->stack_data[p->w_top]); 450 } 451 } 452 #endif /* !defined(KMP_GOMP_COMPAT) */ 453 454 KMP_MB(); /* Flush all pending memory write invalidates. */ 455 456 pr->ordered_bumped += 1; 457 458 KD_TRACE(1000, 459 ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n", 460 gtid, pr->ordered_bumped)); 461 462 KMP_MB(); /* Flush all pending memory write invalidates. */ 463 464 /* TODO use general release procedure? 
*/ 465 test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration); 466 467 KMP_MB(); /* Flush all pending memory write invalidates. */ 468 } 469 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid)); 470 } 471 472 // Computes and returns x to the power of y, where y must be a non-negative integer 473 template <typename UT> 474 static __forceinline long double __kmp_pow(long double x, UT y) { 475 long double s = 1.0L; 476 477 KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0); 478 // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned 479 while (y) { 480 if (y & 1) 481 s *= x; 482 x *= x; 483 y >>= 1; 484 } 485 return s; 486 } 487 488 /* Computes and returns the number of unassigned iterations after idx chunks 489 have been assigned (the total number of unassigned iterations in chunks with 490 index greater than or equal to idx). __forceinline appears to be broken here: 491 if this function is marked __forceinline, the behavior is wrong 492 (one of the unit tests, sch_guided_analytical_basic.cpp, fails) */ 493 template <typename T> 494 static __inline typename traits_t<T>::unsigned_t 495 __kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base, 496 typename traits_t<T>::unsigned_t idx) { 497 /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at least for 498 ICL 8.1, long double arithmetic may not really have long double precision, 499 even with /Qlong_double. Currently, we work around that in the caller code, 500 by manipulating the FPCW for Windows* OS on IA-32 architecture. The lack 501 of precision is not expected to be a correctness issue, though. */ 502 typedef typename traits_t<T>::unsigned_t UT; 503 504 long double x = tc * __kmp_pow<UT>(base, idx); 505 UT r = (UT)x; 506 if (x == r) 507 return r; 508 return r + 1; 509 } 510 511 // Parameters of the guided-iterative algorithm: 512 // p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic 513 // p3 = 1 / ( n * nproc ) // remaining iterations multiplier 514 // by default n = 2. For example, with n = 3 the chunk distribution will be 515 // flatter. 516 // With n = 1 the first chunk is the same as for a static schedule, i.e. trip / nproc.
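// Illustrative sketch, not part of the library and deliberately excluded from
// the build: a worked instance of the two formulas above together with
// __kmp_dispatch_guided_remaining(). The inputs tc, nproc and chunk (and the
// helper name) are assumed values chosen only for the example.
#if 0
static void __kmp_guided_params_example() {
  const kmp_uint32 tc = 1000, nproc = 4, chunk = 7; // assumed example inputs
  // guided_int_param = 2 and guided_flt_param = 0.5 are defined just below.
  kmp_uint32 parm2 = 2 * nproc * (chunk + 1); // = 64, switch-to-dynamic point
  double parm3 = 0.5 / nproc; // = 0.125, remaining-iterations multiplier
  // The guided-analytical variant uses base = (2*nproc - 1)/(2*nproc) = 0.875;
  // __kmp_dispatch_guided_remaining() then reports how many iterations are
  // still unassigned after idx chunks: 1000, 875, 766, 670, ...
  for (kmp_uint32 idx = 0; idx < 4; ++idx) {
    kmp_uint32 remaining = __kmp_dispatch_guided_remaining<kmp_int32>(
        (kmp_int32)tc, 1.0 - 0.5 / nproc, idx);
    (void)remaining;
  }
  (void)parm2;
  (void)parm3;
}
#endif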
517 static int guided_int_param = 2; 518 static double guided_flt_param = 0.5; // = 1.0 / guided_int_param; 519 520 // UT - unsigned flavor of T, ST - signed flavor of T, 521 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8 522 template <typename T> 523 static void 524 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb, 525 T ub, typename traits_t<T>::signed_t st, 526 typename traits_t<T>::signed_t chunk, int push_ws) { 527 typedef typename traits_t<T>::unsigned_t UT; 528 typedef typename traits_t<T>::signed_t ST; 529 typedef typename traits_t<T>::floating_t DBL; 530 531 int active; 532 T tc; 533 kmp_info_t *th; 534 kmp_team_t *team; 535 kmp_uint32 my_buffer_index; 536 dispatch_private_info_template<T> *pr; 537 dispatch_shared_info_template<UT> volatile *sh; 538 539 KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) == 540 sizeof(dispatch_private_info)); 541 KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) == 542 sizeof(dispatch_shared_info)); 543 544 if (!TCR_4(__kmp_init_parallel)) 545 __kmp_parallel_initialize(); 546 547 #if INCLUDE_SSC_MARKS 548 SSC_MARK_DISPATCH_INIT(); 549 #endif 550 #ifdef KMP_DEBUG 551 { 552 char *buff; 553 // create format specifiers before the debug output 554 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d " 555 "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n", 556 traits_t<ST>::spec, traits_t<T>::spec, 557 traits_t<T>::spec, traits_t<ST>::spec); 558 KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st)); 559 __kmp_str_free(&buff); 560 } 561 #endif 562 /* setup data */ 563 th = __kmp_threads[gtid]; 564 team = th->th.th_team; 565 active = !team->t.t_serialized; 566 th->th.th_ident = loc; 567 568 #if USE_ITT_BUILD 569 kmp_uint64 cur_chunk = chunk; 570 int itt_need_metadata_reporting = __itt_metadata_add_ptr && 571 __kmp_forkjoin_frames_mode == 3 && 572 KMP_MASTER_GTID(gtid) && 573 #if OMP_40_ENABLED 574 th->th.th_teams_microtask == NULL && 575 #endif 576 team->t.t_active_level == 1; 577 #endif 578 if (!active) { 579 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 580 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 581 } else { 582 KMP_DEBUG_ASSERT(th->th.th_dispatch == 583 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 584 585 my_buffer_index = th->th.th_dispatch->th_disp_index++; 586 587 /* What happens when number of threads changes, need to resize buffer? 
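     (As written, the buffer set is a fixed-size ring: both th_disp_buffer and
     team->t.t_disp_buffer are indexed modulo __kmp_dispatch_num_buffers, and
     before a slot is reused __kmp_dispatch_init() waits, further below, for
     sh->buffer_index to reach my_buffer_index.)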
*/ 588 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 589 &th->th.th_dispatch 590 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 591 sh = reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 592 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 593 } 594 595 #if (KMP_STATIC_STEAL_ENABLED) 596 if (SCHEDULE_HAS_NONMONOTONIC(schedule)) 597 // AC: we now have only one implementation of stealing, so use it 598 schedule = kmp_sch_static_steal; 599 else 600 #endif 601 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 602 603 /* Pick up the nomerge/ordered bits from the scheduling type */ 604 if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) { 605 pr->nomerge = TRUE; 606 schedule = 607 (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); 608 } else { 609 pr->nomerge = FALSE; 610 } 611 pr->type_size = traits_t<T>::type_size; // remember the size of variables 612 if (kmp_ord_lower & schedule) { 613 pr->ordered = TRUE; 614 schedule = 615 (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); 616 } else { 617 pr->ordered = FALSE; 618 } 619 620 if (schedule == kmp_sch_static) { 621 schedule = __kmp_static; 622 } else { 623 if (schedule == kmp_sch_runtime) { 624 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if 625 // not specified) 626 schedule = team->t.t_sched.r_sched_type; 627 // Detail the schedule if needed (global controls are differentiated 628 // appropriately) 629 if (schedule == kmp_sch_guided_chunked) { 630 schedule = __kmp_guided; 631 } else if (schedule == kmp_sch_static) { 632 schedule = __kmp_static; 633 } 634 // Use the chunk size specified by OMP_SCHEDULE (or default if not 635 // specified) 636 chunk = team->t.t_sched.chunk; 637 #if USE_ITT_BUILD 638 cur_chunk = chunk; 639 #endif 640 #ifdef KMP_DEBUG 641 { 642 char *buff; 643 // create format specifiers before the debug output 644 buff = __kmp_str_format( 645 "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n", 646 traits_t<ST>::spec); 647 KD_TRACE(10, (buff, gtid, schedule, chunk)); 648 __kmp_str_free(&buff); 649 } 650 #endif 651 } else { 652 if (schedule == kmp_sch_guided_chunked) { 653 schedule = __kmp_guided; 654 } 655 if (chunk <= 0) { 656 chunk = KMP_DEFAULT_CHUNK; 657 } 658 } 659 660 if (schedule == kmp_sch_auto) { 661 // mapping and differentiation: in the __kmp_do_serial_initialize() 662 schedule = __kmp_auto; 663 #ifdef KMP_DEBUG 664 { 665 char *buff; 666 // create format specifiers before the debug output 667 buff = __kmp_str_format("__kmp_dispatch_init: kmp_sch_auto: T#%%d new: " 668 "schedule:%%d chunk:%%%s\n", 669 traits_t<ST>::spec); 670 KD_TRACE(10, (buff, gtid, schedule, chunk)); 671 __kmp_str_free(&buff); 672 } 673 #endif 674 } 675 676 /* guided analytical not safe for too many threads */ 677 if (schedule == kmp_sch_guided_analytical_chunked && 678 th->th.th_team_nproc > 1 << 20) { 679 schedule = kmp_sch_guided_iterative_chunked; 680 KMP_WARNING(DispatchManyThreads); 681 } 682 if (schedule == kmp_sch_runtime_simd) { 683 // compiler provides simd_width in the chunk parameter 684 schedule = team->t.t_sched.r_sched_type; 685 // Detail the schedule if needed (global controls are differentiated 686 // appropriately) 687 if (schedule == kmp_sch_static || schedule == kmp_sch_auto || 688 schedule == __kmp_static) { 689 schedule = kmp_sch_static_balanced_chunked; 690 } else { 691 if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) { 692 schedule = kmp_sch_guided_simd; 693 } 694 
chunk = team->t.t_sched.chunk * chunk; 695 } 696 #if USE_ITT_BUILD 697 cur_chunk = chunk; 698 #endif 699 #ifdef KMP_DEBUG 700 { 701 char *buff; 702 // create format specifiers before the debug output 703 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d" 704 " chunk:%%%s\n", 705 traits_t<ST>::spec); 706 KD_TRACE(10, (buff, gtid, schedule, chunk)); 707 __kmp_str_free(&buff); 708 } 709 #endif 710 } 711 pr->u.p.parm1 = chunk; 712 } 713 KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper), 714 "unknown scheduling type"); 715 716 pr->u.p.count = 0; 717 718 if (__kmp_env_consistency_check) { 719 if (st == 0) { 720 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, 721 (pr->ordered ? ct_pdo_ordered : ct_pdo), loc); 722 } 723 } 724 // compute trip count 725 if (st == 1) { // most common case 726 if (ub >= lb) { 727 tc = ub - lb + 1; 728 } else { // ub < lb 729 tc = 0; // zero-trip 730 } 731 } else if (st < 0) { 732 if (lb >= ub) { 733 // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B), 734 // where the division needs to be unsigned regardless of the result type 735 tc = (UT)(lb - ub) / (-st) + 1; 736 } else { // lb < ub 737 tc = 0; // zero-trip 738 } 739 } else { // st > 0 740 if (ub >= lb) { 741 // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B), 742 // where the division needs to be unsigned regardless of the result type 743 tc = (UT)(ub - lb) / st + 1; 744 } else { // ub < lb 745 tc = 0; // zero-trip 746 } 747 } 748 749 // Any half-decent optimizer will remove this test when the blocks are empty 750 // since the macros expand to nothing when statistics are disabled. 751 if (schedule == __kmp_static) { 752 KMP_COUNT_BLOCK(OMP_FOR_static); 753 KMP_COUNT_VALUE(FOR_static_iterations, tc); 754 } else { 755 KMP_COUNT_BLOCK(OMP_FOR_dynamic); 756 KMP_COUNT_VALUE(FOR_dynamic_iterations, tc); 757 } 758 759 pr->u.p.lb = lb; 760 pr->u.p.ub = ub; 761 pr->u.p.st = st; 762 pr->u.p.tc = tc; 763 764 #if KMP_OS_WINDOWS 765 pr->u.p.last_upper = ub + st; 766 #endif /* KMP_OS_WINDOWS */ 767 768 /* NOTE: only the active parallel region(s) has active ordered sections */ 769 770 if (active) { 771 if (pr->ordered == 0) { 772 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error; 773 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error; 774 } else { 775 pr->ordered_bumped = 0; 776 777 pr->u.p.ordered_lower = 1; 778 pr->u.p.ordered_upper = 0; 779 780 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>; 781 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>; 782 } 783 } 784 785 if (__kmp_env_consistency_check) { 786 enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo; 787 if (push_ws) { 788 __kmp_push_workshare(gtid, ws, loc); 789 pr->pushed_ws = ws; 790 } else { 791 __kmp_check_workshare(gtid, ws, loc); 792 pr->pushed_ws = ct_none; 793 } 794 } 795 796 switch (schedule) { 797 #if (KMP_STATIC_STEAL_ENABLED) 798 case kmp_sch_static_steal: { 799 T nproc = th->th.th_team_nproc; 800 T ntc, init; 801 802 KD_TRACE(100, 803 ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid)); 804 805 ntc = (tc % chunk ? 1 : 0) + tc / chunk; 806 if (nproc > 1 && ntc >= nproc) { 807 KMP_COUNT_BLOCK(OMP_FOR_static_steal); 808 T id = __kmp_tid_from_gtid(gtid); 809 T small_chunk, extras; 810 811 small_chunk = ntc / nproc; 812 extras = ntc % nproc; 813 814 init = id * small_chunk + (id < extras ? id : extras); 815 pr->u.p.count = init; 816 pr->u.p.ub = init + small_chunk + (id < extras ? 
1 : 0); 817 818 pr->u.p.parm2 = lb; 819 // pr->pfields.parm3 = 0; // it's not used in static_steal 820 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid 821 pr->u.p.st = st; 822 if (traits_t<T>::type_size > 4) { 823 // AC: TODO: check if 16-byte CAS available and use it to 824 // improve performance (probably wait for explicit request 825 // before spending time on this). 826 // For now use dynamically allocated per-thread lock, 827 // free memory in __kmp_dispatch_next when status==0. 828 KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL); 829 th->th.th_dispatch->th_steal_lock = 830 (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t)); 831 __kmp_init_lock(th->th.th_dispatch->th_steal_lock); 832 } 833 break; 834 } else { 835 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to " 836 "kmp_sch_static_balanced\n", 837 gtid)); 838 schedule = kmp_sch_static_balanced; 839 /* too few iterations: fall-through to kmp_sch_static_balanced */ 840 } // if 841 /* FALL-THROUGH to static balanced */ 842 } // case 843 #endif 844 case kmp_sch_static_balanced: { 845 T nproc = th->th.th_team_nproc; 846 T init, limit; 847 848 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n", 849 gtid)); 850 851 if (nproc > 1) { 852 T id = __kmp_tid_from_gtid(gtid); 853 854 if (tc < nproc) { 855 if (id < tc) { 856 init = id; 857 limit = id; 858 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ 859 } else { 860 pr->u.p.count = 1; /* means no more chunks to execute */ 861 pr->u.p.parm1 = FALSE; 862 break; 863 } 864 } else { 865 T small_chunk = tc / nproc; 866 T extras = tc % nproc; 867 init = id * small_chunk + (id < extras ? id : extras); 868 limit = init + small_chunk - (id < extras ? 0 : 1); 869 pr->u.p.parm1 = (id == nproc - 1); 870 } 871 } else { 872 if (tc > 0) { 873 init = 0; 874 limit = tc - 1; 875 pr->u.p.parm1 = TRUE; 876 } else { // zero trip count 877 pr->u.p.count = 1; /* means no more chunks to execute */ 878 pr->u.p.parm1 = FALSE; 879 break; 880 } 881 } 882 #if USE_ITT_BUILD 883 // Calculate chunk for metadata report 884 if (itt_need_metadata_reporting) 885 cur_chunk = limit - init + 1; 886 #endif 887 if (st == 1) { 888 pr->u.p.lb = lb + init; 889 pr->u.p.ub = lb + limit; 890 } else { 891 // calculated upper bound, "ub" is user-defined upper bound 892 T ub_tmp = lb + limit * st; 893 pr->u.p.lb = lb + init * st; 894 // adjust upper bound to "ub" if needed, so that MS lastprivate will match 895 // it exactly 896 if (st > 0) { 897 pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp); 898 } else { 899 pr->u.p.ub = (ub_tmp + st < ub ? 
ub : ub_tmp); 900 } 901 } 902 if (pr->ordered) { 903 pr->u.p.ordered_lower = init; 904 pr->u.p.ordered_upper = limit; 905 } 906 break; 907 } // case 908 case kmp_sch_static_balanced_chunked: { 909 // similar to balanced, but chunk adjusted to multiple of simd width 910 T nth = th->th.th_team_nproc; 911 KD_TRACE(100, ("__kmp_dispatch_init: T#%d runtime(simd:static)" 912 " -> falling-through to static_greedy\n", 913 gtid)); 914 schedule = kmp_sch_static_greedy; 915 if (nth > 1) 916 pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1); 917 else 918 pr->u.p.parm1 = tc; 919 break; 920 } // case 921 case kmp_sch_guided_iterative_chunked: 922 case kmp_sch_guided_simd: { 923 T nproc = th->th.th_team_nproc; 924 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked" 925 " case\n", 926 gtid)); 927 928 if (nproc > 1) { 929 if ((2L * chunk + 1) * nproc >= tc) { 930 /* chunk size too large, switch to dynamic */ 931 schedule = kmp_sch_dynamic_chunked; 932 } else { 933 // when remaining iters become less than parm2 - switch to dynamic 934 pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1); 935 *(double *)&pr->u.p.parm3 = 936 guided_flt_param / nproc; // may occupy parm3 and parm4 937 } 938 } else { 939 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to " 940 "kmp_sch_static_greedy\n", 941 gtid)); 942 schedule = kmp_sch_static_greedy; 943 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 944 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n", 945 gtid)); 946 pr->u.p.parm1 = tc; 947 } // if 948 } // case 949 break; 950 case kmp_sch_guided_analytical_chunked: { 951 T nproc = th->th.th_team_nproc; 952 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked" 953 " case\n", 954 gtid)); 955 if (nproc > 1) { 956 if ((2L * chunk + 1) * nproc >= tc) { 957 /* chunk size too large, switch to dynamic */ 958 schedule = kmp_sch_dynamic_chunked; 959 } else { 960 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 961 DBL x; 962 963 #if KMP_OS_WINDOWS && KMP_ARCH_X86 964 /* Linux* OS already has 64-bit computation by default for long double, 965 and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On 966 Windows* OS on IA-32 architecture, we need to set precision to 64-bit 967 instead of the default 53-bit. Even though long double doesn't work 968 on Windows* OS on Intel(R) 64, the resulting lack of precision is not 969 expected to impact the correctness of the algorithm, but this has not 970 been mathematically proven. 
*/ 971 // save original FPCW and set precision to 64-bit, as 972 // Windows* OS on IA-32 architecture defaults to 53-bit 973 unsigned int oldFpcw = _control87(0, 0); 974 _control87(_PC_64, _MCW_PC); // 0,0x30000 975 #endif 976 /* value used for comparison in solver for cross-over point */ 977 long double target = ((long double)chunk * 2 + 1) * nproc / tc; 978 979 /* crossover point--chunk indexes equal to or greater than 980 this point switch to dynamic-style scheduling */ 981 UT cross; 982 983 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 984 x = (long double)1.0 - (long double)0.5 / nproc; 985 986 #ifdef KMP_DEBUG 987 { // test natural alignment 988 struct _test_a { 989 char a; 990 union { 991 char b; 992 DBL d; 993 }; 994 } t; 995 ptrdiff_t natural_alignment = 996 (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; 997 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long 998 // long)natural_alignment ); 999 KMP_DEBUG_ASSERT( 1000 (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0); 1001 } 1002 #endif // KMP_DEBUG 1003 1004 /* save the term in thread private dispatch structure */ 1005 *(DBL *)&pr->u.p.parm3 = x; 1006 1007 /* solve for the crossover point to the nearest integer i for which C_i 1008 <= chunk */ 1009 { 1010 UT left, right, mid; 1011 long double p; 1012 1013 /* estimate initial upper and lower bound */ 1014 1015 /* doesn't matter what value right is as long as it is positive, but 1016 it affects performance of the solver */ 1017 right = 229; 1018 p = __kmp_pow<UT>(x, right); 1019 if (p > target) { 1020 do { 1021 p *= p; 1022 right <<= 1; 1023 } while (p > target && right < (1 << 27)); 1024 /* lower bound is previous (failed) estimate of upper bound */ 1025 left = right >> 1; 1026 } else { 1027 left = 0; 1028 } 1029 1030 /* bisection root-finding method */ 1031 while (left + 1 < right) { 1032 mid = (left + right) / 2; 1033 if (__kmp_pow<UT>(x, mid) > target) { 1034 left = mid; 1035 } else { 1036 right = mid; 1037 } 1038 } // while 1039 cross = right; 1040 } 1041 /* assert sanity of computed crossover point */ 1042 KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target && 1043 __kmp_pow<UT>(x, cross) <= target); 1044 1045 /* save the crossover point in thread private dispatch structure */ 1046 pr->u.p.parm2 = cross; 1047 1048 // C75803 1049 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8)) 1050 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3) 1051 #else 1052 #define GUIDED_ANALYTICAL_WORKAROUND (x) 1053 #endif 1054 /* dynamic-style scheduling offset */ 1055 pr->u.p.count = tc - __kmp_dispatch_guided_remaining( 1056 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - 1057 cross * chunk; 1058 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1059 // restore FPCW 1060 _control87(oldFpcw, _MCW_PC); 1061 #endif 1062 } // if 1063 } else { 1064 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to " 1065 "kmp_sch_static_greedy\n", 1066 gtid)); 1067 schedule = kmp_sch_static_greedy; 1068 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 1069 pr->u.p.parm1 = tc; 1070 } // if 1071 } // case 1072 break; 1073 case kmp_sch_static_greedy: 1074 KD_TRACE(100, 1075 ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n", gtid)); 1076 pr->u.p.parm1 = (th->th.th_team_nproc > 1) 1077 ? 
(tc + th->th.th_team_nproc - 1) / th->th.th_team_nproc 1078 : tc; 1079 break; 1080 case kmp_sch_static_chunked: 1081 case kmp_sch_dynamic_chunked: 1082 if (pr->u.p.parm1 <= 0) { 1083 pr->u.p.parm1 = KMP_DEFAULT_CHUNK; 1084 } 1085 KD_TRACE(100, ("__kmp_dispatch_init: T#%d " 1086 "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", 1087 gtid)); 1088 break; 1089 case kmp_sch_trapezoidal: { 1090 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ 1091 1092 T parm1, parm2, parm3, parm4; 1093 KD_TRACE(100, 1094 ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid)); 1095 1096 parm1 = chunk; 1097 1098 /* F : size of the first cycle */ 1099 parm2 = (tc / (2 * th->th.th_team_nproc)); 1100 1101 if (parm2 < 1) { 1102 parm2 = 1; 1103 } 1104 1105 /* L : size of the last cycle. Make sure the last cycle is not larger 1106 than the first cycle. */ 1107 if (parm1 < 1) { 1108 parm1 = 1; 1109 } else if (parm1 > parm2) { 1110 parm1 = parm2; 1111 } 1112 1113 /* N : number of cycles */ 1114 parm3 = (parm2 + parm1); 1115 parm3 = (2 * tc + parm3 - 1) / parm3; 1116 1117 if (parm3 < 2) { 1118 parm3 = 2; 1119 } 1120 1121 /* sigma : decreasing incr of the trapezoid */ 1122 parm4 = (parm3 - 1); 1123 parm4 = (parm2 - parm1) / parm4; 1124 1125 // pointless check, because parm4 >= 0 always 1126 // if ( parm4 < 0 ) { 1127 // parm4 = 0; 1128 //} 1129 1130 pr->u.p.parm1 = parm1; 1131 pr->u.p.parm2 = parm2; 1132 pr->u.p.parm3 = parm3; 1133 pr->u.p.parm4 = parm4; 1134 } // case 1135 break; 1136 1137 default: { 1138 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 1139 KMP_HNT(GetNewerLibrary), // Hint 1140 __kmp_msg_null // Variadic argument list terminator 1141 ); 1142 } break; 1143 } // switch 1144 pr->schedule = schedule; 1145 if (active) { 1146 /* The name of this buffer should be my_buffer_index when it's free to use 1147 * it */ 1148 1149 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d " 1150 "sh->buffer_index:%d\n", 1151 gtid, my_buffer_index, sh->buffer_index)); 1152 __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index, 1153 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL)); 1154 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and 1155 // my_buffer_index are *always* 32-bit integers. 1156 KMP_MB(); /* is this necessary? */ 1157 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d " 1158 "sh->buffer_index:%d\n", 1159 gtid, my_buffer_index, sh->buffer_index)); 1160 1161 th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr; 1162 th->th.th_dispatch->th_dispatch_sh_current = 1163 CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh); 1164 #if USE_ITT_BUILD 1165 if (pr->ordered) { 1166 __kmp_itt_ordered_init(gtid); 1167 } 1168 // Report loop metadata 1169 if (itt_need_metadata_reporting) { 1170 // Only report metadata by master of active team at level 1 1171 kmp_uint64 schedtype = 0; 1172 switch (schedule) { 1173 case kmp_sch_static_chunked: 1174 case kmp_sch_static_balanced: // Chunk is calculated in the switch above 1175 break; 1176 case kmp_sch_static_greedy: 1177 cur_chunk = pr->u.p.parm1; 1178 break; 1179 case kmp_sch_dynamic_chunked: 1180 schedtype = 1; 1181 break; 1182 case kmp_sch_guided_iterative_chunked: 1183 case kmp_sch_guided_analytical_chunked: 1184 case kmp_sch_guided_simd: 1185 schedtype = 2; 1186 break; 1187 default: 1188 // Should we put this case under "static"? 
1189 // case kmp_sch_static_steal: 1190 schedtype = 3; 1191 break; 1192 } 1193 __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk); 1194 } 1195 #endif /* USE_ITT_BUILD */ 1196 } 1197 1198 #ifdef KMP_DEBUG 1199 { 1200 char *buff; 1201 // create format specifiers before the debug output 1202 buff = __kmp_str_format( 1203 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s " 1204 "lb:%%%s ub:%%%s" 1205 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" 1206 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n", 1207 traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec, 1208 traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec, 1209 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec, 1210 traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec); 1211 KD_TRACE(10, (buff, gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub, 1212 pr->u.p.st, pr->u.p.tc, pr->u.p.count, pr->u.p.ordered_lower, 1213 pr->u.p.ordered_upper, pr->u.p.parm1, pr->u.p.parm2, 1214 pr->u.p.parm3, pr->u.p.parm4)); 1215 __kmp_str_free(&buff); 1216 } 1217 #endif 1218 #if (KMP_STATIC_STEAL_ENABLED) 1219 // It cannot be guaranteed that after execution of a loop with some other 1220 // schedule kind all the parm3 variables will contain the same value. Even if 1221 // all parm3 values were the same, there would still be a bad case, such as 1222 // reusing 0 and 1 rather than a program-lifetime increment, so a dedicated 1223 // variable is required. The 'static_steal_counter' is used. 1224 if (schedule == kmp_sch_static_steal) { 1225 // Other threads will inspect this variable when searching for a victim. 1226 // This is a flag showing that other threads may steal from this thread 1227 // from now on. 1228 volatile T *p = &pr->u.p.static_steal_counter; 1229 *p = *p + 1; 1230 } 1231 #endif // ( KMP_STATIC_STEAL_ENABLED ) 1232 1233 #if OMPT_SUPPORT && OMPT_OPTIONAL 1234 if (ompt_enabled.ompt_callback_work) { 1235 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 1236 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 1237 kmp_info_t *thr = __kmp_threads[gtid]; 1238 ompt_callbacks.ompt_callback(ompt_callback_work)( 1239 ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data), 1240 &(task_info->task_data), tc, OMPT_LOAD_RETURN_ADDRESS(gtid)); 1241 } 1242 #endif 1243 } 1244 1245 /* For ordered loops, either __kmp_dispatch_finish() should be called after 1246 * every iteration, or __kmp_dispatch_finish_chunk() should be called after 1247 * every chunk of iterations. If the ordered section(s) were not executed 1248 * for this iteration (or every iteration in this chunk), we need to set the 1249 * ordered iteration counters so that the next thread can proceed.
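 * For example, if a chunk covers ordered iterations [lower, upper] and only
 * ordered_bumped of them actually went through the ordered region,
 * __kmp_dispatch_finish_chunk() below waits until sh->u.s.ordered_iteration
 * reaches lower and then adds the remaining (upper - lower + 1 -
 * ordered_bumped) to it, releasing threads waiting on later iterations.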
*/ 1250 template <typename UT> 1251 static void __kmp_dispatch_finish(int gtid, ident_t *loc) { 1252 typedef typename traits_t<UT>::signed_t ST; 1253 kmp_info_t *th = __kmp_threads[gtid]; 1254 1255 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid)); 1256 if (!th->th.th_team->t.t_serialized) { 1257 1258 dispatch_private_info_template<UT> *pr = 1259 reinterpret_cast<dispatch_private_info_template<UT> *>( 1260 th->th.th_dispatch->th_dispatch_pr_current); 1261 dispatch_shared_info_template<UT> volatile *sh = 1262 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 1263 th->th.th_dispatch->th_dispatch_sh_current); 1264 KMP_DEBUG_ASSERT(pr); 1265 KMP_DEBUG_ASSERT(sh); 1266 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1267 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1268 1269 if (pr->ordered_bumped) { 1270 KD_TRACE( 1271 1000, 1272 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1273 gtid)); 1274 pr->ordered_bumped = 0; 1275 } else { 1276 UT lower = pr->u.p.ordered_lower; 1277 1278 #ifdef KMP_DEBUG 1279 { 1280 char *buff; 1281 // create format specifiers before the debug output 1282 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: " 1283 "ordered_iteration:%%%s lower:%%%s\n", 1284 traits_t<UT>::spec, traits_t<UT>::spec); 1285 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 1286 __kmp_str_free(&buff); 1287 } 1288 #endif 1289 1290 __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower, 1291 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 1292 KMP_MB(); /* is this necessary? */ 1293 #ifdef KMP_DEBUG 1294 { 1295 char *buff; 1296 // create format specifiers before the debug output 1297 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: " 1298 "ordered_iteration:%%%s lower:%%%s\n", 1299 traits_t<UT>::spec, traits_t<UT>::spec); 1300 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 1301 __kmp_str_free(&buff); 1302 } 1303 #endif 1304 1305 test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration); 1306 } // if 1307 } // if 1308 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid)); 1309 } 1310 1311 #ifdef KMP_GOMP_COMPAT 1312 1313 template <typename UT> 1314 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) { 1315 typedef typename traits_t<UT>::signed_t ST; 1316 kmp_info_t *th = __kmp_threads[gtid]; 1317 1318 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid)); 1319 if (!th->th.th_team->t.t_serialized) { 1320 // int cid; 1321 dispatch_private_info_template<UT> *pr = 1322 reinterpret_cast<dispatch_private_info_template<UT> *>( 1323 th->th.th_dispatch->th_dispatch_pr_current); 1324 dispatch_shared_info_template<UT> volatile *sh = 1325 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 1326 th->th.th_dispatch->th_dispatch_sh_current); 1327 KMP_DEBUG_ASSERT(pr); 1328 KMP_DEBUG_ASSERT(sh); 1329 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1330 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1331 1332 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) { 1333 UT lower = pr->u.p.ordered_lower; 1334 UT upper = pr->u.p.ordered_upper; 1335 UT inc = upper - lower + 1; 1336 1337 if (pr->ordered_bumped == inc) { 1338 KD_TRACE( 1339 1000, 1340 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1341 gtid)); 1342 pr->ordered_bumped = 0; 1343 } else { 1344 inc -= pr->ordered_bumped; 1345 1346 #ifdef KMP_DEBUG 1347 { 1348 char *buff; 1349 // create format specifiers before the debug output 1350 buff = __kmp_str_format( 1351 
"__kmp_dispatch_finish_chunk: T#%%d before wait: " 1352 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n", 1353 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec); 1354 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper)); 1355 __kmp_str_free(&buff); 1356 } 1357 #endif 1358 1359 __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower, 1360 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 1361 1362 KMP_MB(); /* is this necessary? */ 1363 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting " 1364 "ordered_bumped to zero\n", 1365 gtid)); 1366 pr->ordered_bumped = 0; 1367 //!!!!! TODO check if the inc should be unsigned, or signed??? 1368 #ifdef KMP_DEBUG 1369 { 1370 char *buff; 1371 // create format specifiers before the debug output 1372 buff = __kmp_str_format( 1373 "__kmp_dispatch_finish_chunk: T#%%d after wait: " 1374 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n", 1375 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec, 1376 traits_t<UT>::spec); 1377 KD_TRACE(1000, 1378 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper)); 1379 __kmp_str_free(&buff); 1380 } 1381 #endif 1382 1383 test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc); 1384 } 1385 // } 1386 } 1387 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid)); 1388 } 1389 1390 #endif /* KMP_GOMP_COMPAT */ 1391 1392 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more 1393 work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini() 1394 is not called. */ 1395 #if OMPT_SUPPORT && OMPT_OPTIONAL 1396 #define OMPT_LOOP_END \ 1397 if (status == 0) { \ 1398 if (ompt_enabled.ompt_callback_work) { \ 1399 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \ 1400 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \ 1401 ompt_callbacks.ompt_callback(ompt_callback_work)( \ 1402 ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \ 1403 &(task_info->task_data), 0, codeptr); \ 1404 } \ 1405 } 1406 // TODO: implement count 1407 #else 1408 #define OMPT_LOOP_END // no-op 1409 #endif 1410 1411 template <typename T> 1412 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, 1413 T *p_lb, T *p_ub, 1414 typename traits_t<T>::signed_t *p_st 1415 #if OMPT_SUPPORT && OMPT_OPTIONAL 1416 , 1417 void *codeptr 1418 #endif 1419 ) { 1420 1421 typedef typename traits_t<T>::unsigned_t UT; 1422 typedef typename traits_t<T>::signed_t ST; 1423 typedef typename traits_t<T>::floating_t DBL; 1424 1425 // This is potentially slightly misleading, schedule(runtime) will appear here 1426 // even if the actual runtme schedule is static. (Which points out a 1427 // disadavantage of schedule(runtime): even when static scheduling is used it 1428 // costs more than a compile time choice to use static scheduling would.) 1429 KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling); 1430 1431 int status; 1432 dispatch_private_info_template<T> *pr; 1433 kmp_info_t *th = __kmp_threads[gtid]; 1434 kmp_team_t *team = th->th.th_team; 1435 1436 KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL 1437 #ifdef KMP_DEBUG 1438 { 1439 char *buff; 1440 // create format specifiers before the debug output 1441 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d called p_lb:%%%s " 1442 "p_ub:%%%s p_st:%%%s p_last: %%p\n", 1443 traits_t<T>::spec, traits_t<T>::spec, 1444 traits_t<ST>::spec); 1445 KD_TRACE(1000, (buff, gtid, *p_lb, *p_ub, p_st ? 
*p_st : 0, p_last)); 1446 __kmp_str_free(&buff); 1447 } 1448 #endif 1449 1450 if (team->t.t_serialized) { 1451 /* NOTE: serialize this dispatch becase we are not at the active level */ 1452 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 1453 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 1454 KMP_DEBUG_ASSERT(pr); 1455 1456 if ((status = (pr->u.p.tc != 0)) == 0) { 1457 *p_lb = 0; 1458 *p_ub = 0; 1459 // if ( p_last != NULL ) 1460 // *p_last = 0; 1461 if (p_st != NULL) 1462 *p_st = 0; 1463 if (__kmp_env_consistency_check) { 1464 if (pr->pushed_ws != ct_none) { 1465 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 1466 } 1467 } 1468 } else if (pr->nomerge) { 1469 kmp_int32 last; 1470 T start; 1471 UT limit, trip, init; 1472 ST incr; 1473 T chunk = pr->u.p.parm1; 1474 1475 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", 1476 gtid)); 1477 1478 init = chunk * pr->u.p.count++; 1479 trip = pr->u.p.tc - 1; 1480 1481 if ((status = (init <= trip)) == 0) { 1482 *p_lb = 0; 1483 *p_ub = 0; 1484 // if ( p_last != NULL ) 1485 // *p_last = 0; 1486 if (p_st != NULL) 1487 *p_st = 0; 1488 if (__kmp_env_consistency_check) { 1489 if (pr->pushed_ws != ct_none) { 1490 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 1491 } 1492 } 1493 } else { 1494 start = pr->u.p.lb; 1495 limit = chunk + init - 1; 1496 incr = pr->u.p.st; 1497 1498 if ((last = (limit >= trip)) != 0) { 1499 limit = trip; 1500 #if KMP_OS_WINDOWS 1501 pr->u.p.last_upper = pr->u.p.ub; 1502 #endif /* KMP_OS_WINDOWS */ 1503 } 1504 if (p_last != NULL) 1505 *p_last = last; 1506 if (p_st != NULL) 1507 *p_st = incr; 1508 if (incr == 1) { 1509 *p_lb = start + init; 1510 *p_ub = start + limit; 1511 } else { 1512 *p_lb = start + init * incr; 1513 *p_ub = start + limit * incr; 1514 } 1515 1516 if (pr->ordered) { 1517 pr->u.p.ordered_lower = init; 1518 pr->u.p.ordered_upper = limit; 1519 #ifdef KMP_DEBUG 1520 { 1521 char *buff; 1522 // create format specifiers before the debug output 1523 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 1524 "ordered_lower:%%%s ordered_upper:%%%s\n", 1525 traits_t<UT>::spec, traits_t<UT>::spec); 1526 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 1527 pr->u.p.ordered_upper)); 1528 __kmp_str_free(&buff); 1529 } 1530 #endif 1531 } // if 1532 } // if 1533 } else { 1534 pr->u.p.tc = 0; 1535 *p_lb = pr->u.p.lb; 1536 *p_ub = pr->u.p.ub; 1537 #if KMP_OS_WINDOWS 1538 pr->u.p.last_upper = *p_ub; 1539 #endif /* KMP_OS_WINDOWS */ 1540 if (p_last != NULL) 1541 *p_last = TRUE; 1542 if (p_st != NULL) 1543 *p_st = pr->u.p.st; 1544 } // if 1545 #ifdef KMP_DEBUG 1546 { 1547 char *buff; 1548 // create format specifiers before the debug output 1549 buff = __kmp_str_format( 1550 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " 1551 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", 1552 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 1553 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status)); 1554 __kmp_str_free(&buff); 1555 } 1556 #endif 1557 #if INCLUDE_SSC_MARKS 1558 SSC_MARK_DISPATCH_NEXT(); 1559 #endif 1560 OMPT_LOOP_END; 1561 return status; 1562 } else { 1563 kmp_int32 last = 0; 1564 dispatch_shared_info_template<UT> *sh; 1565 T start; 1566 ST incr; 1567 UT limit, trip, init; 1568 1569 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1570 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1571 1572 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 1573 th->th.th_dispatch->th_dispatch_pr_current); 1574 
KMP_DEBUG_ASSERT(pr); 1575 sh = reinterpret_cast<dispatch_shared_info_template<UT> *>( 1576 th->th.th_dispatch->th_dispatch_sh_current); 1577 KMP_DEBUG_ASSERT(sh); 1578 1579 if (pr->u.p.tc == 0) { 1580 // zero trip count 1581 status = 0; 1582 } else { 1583 switch (pr->schedule) { 1584 #if (KMP_STATIC_STEAL_ENABLED) 1585 case kmp_sch_static_steal: { 1586 T chunk = pr->u.p.parm1; 1587 int nproc = th->th.th_team_nproc; 1588 1589 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", 1590 gtid)); 1591 1592 trip = pr->u.p.tc - 1; 1593 1594 if (traits_t<T>::type_size > 4) { 1595 // use lock for 8-byte and CAS for 4-byte induction 1596 // variable. TODO (optional): check and use 16-byte CAS 1597 kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock; 1598 KMP_DEBUG_ASSERT(lck != NULL); 1599 if (pr->u.p.count < (UT)pr->u.p.ub) { 1600 __kmp_acquire_lock(lck, gtid); 1601 // try to get own chunk of iterations 1602 init = (pr->u.p.count)++; 1603 status = (init < (UT)pr->u.p.ub); 1604 __kmp_release_lock(lck, gtid); 1605 } else { 1606 status = 0; // no own chunks 1607 } 1608 if (!status) { // try to steal 1609 kmp_info_t **other_threads = team->t.t_threads; 1610 int while_limit = nproc; // nproc attempts to find a victim 1611 int while_index = 0; 1612 // TODO: algorithm of searching for a victim 1613 // should be cleaned up and measured 1614 while ((!status) && (while_limit != ++while_index)) { 1615 T remaining; 1616 T victimIdx = pr->u.p.parm4; 1617 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1; 1618 dispatch_private_info_template<T> *victim = 1619 reinterpret_cast<dispatch_private_info_template<T> *>( 1620 other_threads[victimIdx] 1621 ->th.th_dispatch->th_dispatch_pr_current); 1622 while ((victim == NULL || victim == pr || 1623 (*(volatile T *)&victim->u.p.static_steal_counter != 1624 *(volatile T *)&pr->u.p.static_steal_counter)) && 1625 oldVictimIdx != victimIdx) { 1626 victimIdx = (victimIdx + 1) % nproc; 1627 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1628 other_threads[victimIdx] 1629 ->th.th_dispatch->th_dispatch_pr_current); 1630 } 1631 if (!victim || 1632 (*(volatile T *)&victim->u.p.static_steal_counter != 1633 *(volatile T *)&pr->u.p.static_steal_counter)) { 1634 continue; // try once more (nproc attempts in total) 1635 // no victim is ready yet to participate in stealing 1636 // because all victims are still in kmp_init_dispatch 1637 } 1638 if (victim->u.p.count + 2 > (UT)victim->u.p.ub) { 1639 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid 1640 continue; // not enough chunks to steal, goto next victim 1641 } 1642 1643 lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock; 1644 KMP_ASSERT(lck != NULL); 1645 __kmp_acquire_lock(lck, gtid); 1646 limit = victim->u.p.ub; // keep initial ub 1647 if (victim->u.p.count >= limit || 1648 (remaining = limit - victim->u.p.count) < 2) { 1649 __kmp_release_lock(lck, gtid); 1650 pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim 1651 continue; // not enough chunks to steal 1652 } 1653 // stealing succeded, reduce victim's ub by 1/4 of undone chunks 1654 // or by 1 1655 if (remaining > 3) { 1656 KMP_COUNT_VALUE(FOR_static_steal_stolen, remaining >> 2); 1657 init = (victim->u.p.ub -= 1658 (remaining >> 2)); // steal 1/4 of remaining 1659 } else { 1660 KMP_COUNT_VALUE(FOR_static_steal_stolen, 1); 1661 init = 1662 (victim->u.p.ub -= 1); // steal 1 chunk of 2 or 3 remaining 1663 } 1664 __kmp_release_lock(lck, gtid); 1665 1666 KMP_DEBUG_ASSERT(init + 1 <= limit); 1667 pr->u.p.parm4 = 
victimIdx; // remember victim to steal from 1668 status = 1; 1669 while_index = 0; 1670 // now update own count and ub with stolen range but init chunk 1671 __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid); 1672 pr->u.p.count = init + 1; 1673 pr->u.p.ub = limit; 1674 __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid); 1675 } // while (search for victim) 1676 } // if (try to find victim and steal) 1677 } else { 1678 // 4-byte induction variable, use 8-byte CAS for pair (count, ub) 1679 typedef union { 1680 struct { 1681 UT count; 1682 T ub; 1683 } p; 1684 kmp_int64 b; 1685 } union_i4; 1686 // All operations on 'count' or 'ub' must be combined atomically 1687 // together. 1688 { 1689 union_i4 vold, vnew; 1690 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 1691 vnew = vold; 1692 vnew.p.count++; 1693 while (!KMP_COMPARE_AND_STORE_ACQ64( 1694 (volatile kmp_int64 *)&pr->u.p.count, 1695 *VOLATILE_CAST(kmp_int64 *) & vold.b, 1696 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 1697 KMP_CPU_PAUSE(); 1698 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 1699 vnew = vold; 1700 vnew.p.count++; 1701 } 1702 vnew = vold; 1703 init = vnew.p.count; 1704 status = (init < (UT)vnew.p.ub); 1705 } 1706 1707 if (!status) { 1708 kmp_info_t **other_threads = team->t.t_threads; 1709 int while_limit = nproc; // nproc attempts to find a victim 1710 int while_index = 0; 1711 1712 // TODO: algorithm of searching for a victim 1713 // should be cleaned up and measured 1714 while ((!status) && (while_limit != ++while_index)) { 1715 union_i4 vold, vnew; 1716 kmp_int32 remaining; 1717 T victimIdx = pr->u.p.parm4; 1718 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1; 1719 dispatch_private_info_template<T> *victim = 1720 reinterpret_cast<dispatch_private_info_template<T> *>( 1721 other_threads[victimIdx] 1722 ->th.th_dispatch->th_dispatch_pr_current); 1723 while ((victim == NULL || victim == pr || 1724 (*(volatile T *)&victim->u.p.static_steal_counter != 1725 *(volatile T *)&pr->u.p.static_steal_counter)) && 1726 oldVictimIdx != victimIdx) { 1727 victimIdx = (victimIdx + 1) % nproc; 1728 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1729 other_threads[victimIdx] 1730 ->th.th_dispatch->th_dispatch_pr_current); 1731 } 1732 if (!victim || 1733 (*(volatile T *)&victim->u.p.static_steal_counter != 1734 *(volatile T *)&pr->u.p.static_steal_counter)) { 1735 continue; // try once more (nproc attempts in total) 1736 // no victim is ready yet to participate in stealing 1737 // because all victims are still in kmp_init_dispatch 1738 } 1739 pr->u.p.parm4 = victimIdx; // new victim found 1740 while (1) { // CAS loop if victim has enough chunks to steal 1741 vold.b = *(volatile kmp_int64 *)(&victim->u.p.count); 1742 vnew = vold; 1743 1744 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1745 if (vnew.p.count >= (UT)vnew.p.ub || 1746 (remaining = vnew.p.ub - vnew.p.count) < 2) { 1747 pr->u.p.parm4 = 1748 (victimIdx + 1) % nproc; // shift start victim id 1749 break; // not enough chunks to steal, goto next victim 1750 } 1751 if (remaining > 3) { 1752 vnew.p.ub -= (remaining >> 2); // try to steal 1/4 remaining 1753 } else { 1754 vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining 1755 } 1756 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1757 // TODO: Should this be acquire or release? 
1758 if (KMP_COMPARE_AND_STORE_ACQ64( 1759 (volatile kmp_int64 *)&victim->u.p.count, 1760 *VOLATILE_CAST(kmp_int64 *) & vold.b, 1761 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 1762 // stealing succeeded 1763 KMP_COUNT_VALUE(FOR_static_steal_stolen, 1764 vold.p.ub - vnew.p.ub); 1765 status = 1; 1766 while_index = 0; 1767 // now update own count and ub 1768 init = vnew.p.ub; 1769 vold.p.count = init + 1; 1770 #if KMP_ARCH_X86 1771 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), 1772 vold.b); 1773 #else 1774 *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b; 1775 #endif 1776 break; 1777 } // if (check CAS result) 1778 KMP_CPU_PAUSE(); // CAS failed, repeat attempt 1779 } // while (try to steal from particular victim) 1780 } // while (search for victim) 1781 } // if (try to find victim and steal) 1782 } // if (4-byte induction variable) 1783 if (!status) { 1784 *p_lb = 0; 1785 *p_ub = 0; 1786 if (p_st != NULL) 1787 *p_st = 0; 1788 } else { 1789 start = pr->u.p.parm2; 1790 init *= chunk; 1791 limit = chunk + init - 1; 1792 incr = pr->u.p.st; 1793 KMP_COUNT_VALUE(FOR_static_steal_chunks, 1); 1794 1795 KMP_DEBUG_ASSERT(init <= trip); 1796 if ((last = (limit >= trip)) != 0) 1797 limit = trip; 1798 if (p_st != NULL) 1799 *p_st = incr; 1800 1801 if (incr == 1) { 1802 *p_lb = start + init; 1803 *p_ub = start + limit; 1804 } else { 1805 *p_lb = start + init * incr; 1806 *p_ub = start + limit * incr; 1807 } 1808 1809 if (pr->ordered) { 1810 pr->u.p.ordered_lower = init; 1811 pr->u.p.ordered_upper = limit; 1812 #ifdef KMP_DEBUG 1813 { 1814 char *buff; 1815 // create format specifiers before the debug output 1816 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 1817 "ordered_lower:%%%s ordered_upper:%%%s\n", 1818 traits_t<UT>::spec, traits_t<UT>::spec); 1819 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 1820 pr->u.p.ordered_upper)); 1821 __kmp_str_free(&buff); 1822 } 1823 #endif 1824 } // if 1825 } // if 1826 break; 1827 } // case 1828 #endif // ( KMP_STATIC_STEAL_ENABLED ) 1829 case kmp_sch_static_balanced: { 1830 KD_TRACE( 1831 100, 1832 ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid)); 1833 if ((status = !pr->u.p.count) != 1834 0) { /* check if thread has any iteration to do */ 1835 pr->u.p.count = 1; 1836 *p_lb = pr->u.p.lb; 1837 *p_ub = pr->u.p.ub; 1838 last = pr->u.p.parm1; 1839 if (p_st != NULL) 1840 *p_st = pr->u.p.st; 1841 } else { /* no iterations to do */ 1842 pr->u.p.lb = pr->u.p.ub + pr->u.p.st; 1843 } 1844 if (pr->ordered) { 1845 #ifdef KMP_DEBUG 1846 { 1847 char *buff; 1848 // create format specifiers before the debug output 1849 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 1850 "ordered_lower:%%%s ordered_upper:%%%s\n", 1851 traits_t<UT>::spec, traits_t<UT>::spec); 1852 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 1853 pr->u.p.ordered_upper)); 1854 __kmp_str_free(&buff); 1855 } 1856 #endif 1857 } // if 1858 } // case 1859 break; 1860 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was 1861 merged here */ 1862 case kmp_sch_static_chunked: { 1863 T parm1; 1864 1865 KD_TRACE(100, ("__kmp_dispatch_next: T#%d " 1866 "kmp_sch_static_[affinity|chunked] case\n", 1867 gtid)); 1868 parm1 = pr->u.p.parm1; 1869 1870 trip = pr->u.p.tc - 1; 1871 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid)); 1872 1873 if ((status = (init <= trip)) != 0) { 1874 start = pr->u.p.lb; 1875 incr = pr->u.p.st; 1876 limit = parm1 + init - 1; 1877 1878 if ((last = (limit >= trip)) != 0) 1879 limit = trip; 1880 1881 if (p_st != NULL) 1882 *p_st 
= incr; 1883 1884 pr->u.p.count += th->th.th_team_nproc; 1885 1886 if (incr == 1) { 1887 *p_lb = start + init; 1888 *p_ub = start + limit; 1889 } else { 1890 *p_lb = start + init * incr; 1891 *p_ub = start + limit * incr; 1892 } 1893 1894 if (pr->ordered) { 1895 pr->u.p.ordered_lower = init; 1896 pr->u.p.ordered_upper = limit; 1897 #ifdef KMP_DEBUG 1898 { 1899 char *buff; 1900 // create format specifiers before the debug output 1901 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 1902 "ordered_lower:%%%s ordered_upper:%%%s\n", 1903 traits_t<UT>::spec, traits_t<UT>::spec); 1904 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 1905 pr->u.p.ordered_upper)); 1906 __kmp_str_free(&buff); 1907 } 1908 #endif 1909 } // if 1910 } // if 1911 } // case 1912 break; 1913 1914 case kmp_sch_dynamic_chunked: { 1915 T chunk = pr->u.p.parm1; 1916 1917 KD_TRACE( 1918 100, 1919 ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid)); 1920 1921 init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 1922 trip = pr->u.p.tc - 1; 1923 1924 if ((status = (init <= trip)) == 0) { 1925 *p_lb = 0; 1926 *p_ub = 0; 1927 if (p_st != NULL) 1928 *p_st = 0; 1929 } else { 1930 start = pr->u.p.lb; 1931 limit = chunk + init - 1; 1932 incr = pr->u.p.st; 1933 1934 if ((last = (limit >= trip)) != 0) 1935 limit = trip; 1936 1937 if (p_st != NULL) 1938 *p_st = incr; 1939 1940 if (incr == 1) { 1941 *p_lb = start + init; 1942 *p_ub = start + limit; 1943 } else { 1944 *p_lb = start + init * incr; 1945 *p_ub = start + limit * incr; 1946 } 1947 1948 if (pr->ordered) { 1949 pr->u.p.ordered_lower = init; 1950 pr->u.p.ordered_upper = limit; 1951 #ifdef KMP_DEBUG 1952 { 1953 char *buff; 1954 // create format specifiers before the debug output 1955 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 1956 "ordered_lower:%%%s ordered_upper:%%%s\n", 1957 traits_t<UT>::spec, traits_t<UT>::spec); 1958 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 1959 pr->u.p.ordered_upper)); 1960 __kmp_str_free(&buff); 1961 } 1962 #endif 1963 } // if 1964 } // if 1965 } // case 1966 break; 1967 1968 case kmp_sch_guided_iterative_chunked: { 1969 T chunkspec = pr->u.p.parm1; 1970 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked " 1971 "iterative case\n", 1972 gtid)); 1973 trip = pr->u.p.tc; 1974 // Start atomic part of calculations 1975 while (1) { 1976 ST remaining; // signed, because can be < 0 1977 init = sh->u.s.iteration; // shared value 1978 remaining = trip - init; 1979 if (remaining <= 0) { // AC: need to compare with 0 first 1980 // nothing to do, don't try atomic op 1981 status = 0; 1982 break; 1983 } 1984 if ((T)remaining < 1985 pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default 1986 // use dynamic-style schedule 1987 // atomically increment iterations, get old value 1988 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1989 (ST)chunkspec); 1990 remaining = trip - init; 1991 if (remaining <= 0) { 1992 status = 0; // all iterations were taken by other threads 1993 } else { // got some iterations to work on 1994 status = 1; 1995 if ((T)remaining > chunkspec) { 1996 limit = init + chunkspec - 1; 1997 } else { 1998 last = 1; // the last chunk 1999 limit = init + remaining - 1; 2000 } // if 2001 } // if 2002 break; 2003 } // if 2004 limit = init + (UT)(remaining * 2005 *(double *)&pr->u.p.parm3); // divide by K*nproc 2006 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 2007 (ST)init, (ST)limit)) { 2008 // CAS was successful, chunk obtained 2009 status = 1; 
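// A successful CAS above gives this thread iterations [init, limit); parm3
// caches a floating-point scale factor (per the "divide by K*nproc" comment,
// roughly 1/(K*nproc) with K=2 by default), so each grab takes about
// remaining/(K*nproc) iterations and chunk sizes shrink geometrically, which
// is the guided-schedule behavior. For illustration only (assuming the factor
// is exactly 1/(K*nproc)): with trip=1000 and nproc=4 the first grab is ~125
// iterations, the next ~109, and so on. The decrement that follows converts
// the exclusive bound into the inclusive upper bound reported to the caller.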
2010 --limit; 2011 break; 2012 } // if 2013 } // while 2014 if (status != 0) { 2015 start = pr->u.p.lb; 2016 incr = pr->u.p.st; 2017 if (p_st != NULL) 2018 *p_st = incr; 2019 *p_lb = start + init * incr; 2020 *p_ub = start + limit * incr; 2021 if (pr->ordered) { 2022 pr->u.p.ordered_lower = init; 2023 pr->u.p.ordered_upper = limit; 2024 #ifdef KMP_DEBUG 2025 { 2026 char *buff; 2027 // create format specifiers before the debug output 2028 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 2029 "ordered_lower:%%%s ordered_upper:%%%s\n", 2030 traits_t<UT>::spec, traits_t<UT>::spec); 2031 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 2032 pr->u.p.ordered_upper)); 2033 __kmp_str_free(&buff); 2034 } 2035 #endif 2036 } // if 2037 } else { 2038 *p_lb = 0; 2039 *p_ub = 0; 2040 if (p_st != NULL) 2041 *p_st = 0; 2042 } // if 2043 } // case 2044 break; 2045 2046 case kmp_sch_guided_simd: { 2047 // same as iterative but the current chunk is adjusted to be a multiple of 2048 // the given chunk 2049 T chunk = pr->u.p.parm1; 2050 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_simd case\n", 2051 gtid)); 2052 trip = pr->u.p.tc; 2053 // Start atomic part of calculations 2054 while (1) { 2055 ST remaining; // signed, because can be < 0 2056 init = sh->u.s.iteration; // shared value 2057 remaining = trip - init; 2058 if (remaining <= 0) { // AC: need to compare with 0 first 2059 status = 0; // nothing to do, don't try atomic op 2060 break; 2061 } 2062 KMP_DEBUG_ASSERT(init % chunk == 0); 2063 // compare with K*nproc*(chunk+1), K=2 by default 2064 if ((T)remaining < pr->u.p.parm2) { 2065 // use dynamic-style schedule 2066 // atomically increment iterations, get old value 2067 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 2068 (ST)chunk); 2069 remaining = trip - init; 2070 if (remaining <= 0) { 2071 status = 0; // all iterations were taken by other threads 2072 } else { 2073 // got some iterations to work on 2074 status = 1; 2075 if ((T)remaining > chunk) { 2076 limit = init + chunk - 1; 2077 } else { 2078 last = 1; // the last chunk 2079 limit = init + remaining - 1; 2080 } // if 2081 } // if 2082 break; 2083 } // if 2084 // divide by K*nproc 2085 UT span = remaining * (*(double *)&pr->u.p.parm3); 2086 UT rem = span % chunk; 2087 if (rem) // adjust so that span%chunk == 0 2088 span += chunk - rem; 2089 limit = init + span; 2090 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 2091 (ST)init, (ST)limit)) { 2092 // CAS was successful, chunk obtained 2093 status = 1; 2094 --limit; 2095 break; 2096 } // if 2097 } // while 2098 if (status != 0) { 2099 start = pr->u.p.lb; 2100 incr = pr->u.p.st; 2101 if (p_st != NULL) 2102 *p_st = incr; 2103 *p_lb = start + init * incr; 2104 *p_ub = start + limit * incr; 2105 if (pr->ordered) { 2106 pr->u.p.ordered_lower = init; 2107 pr->u.p.ordered_upper = limit; 2108 #ifdef KMP_DEBUG 2109 { 2110 char *buff; 2111 // create format specifiers before the debug output 2112 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 2113 "ordered_lower:%%%s ordered_upper:%%%s\n", 2114 traits_t<UT>::spec, traits_t<UT>::spec); 2115 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 2116 pr->u.p.ordered_upper)); 2117 __kmp_str_free(&buff); 2118 } 2119 #endif 2120 } // if 2121 } else { 2122 *p_lb = 0; 2123 *p_ub = 0; 2124 if (p_st != NULL) 2125 *p_st = 0; 2126 } // if 2127 } // case 2128 break; 2129 2130 case kmp_sch_guided_analytical_chunked: { 2131 T chunkspec = pr->u.p.parm1; 2132 UT chunkIdx; 2133 #if KMP_OS_WINDOWS && KMP_ARCH_X86 2134 /* for storing original 
FPCW value for Windows* OS on 2135 IA-32 architecture 8-byte version */ 2136 unsigned int oldFpcw; 2137 unsigned int fpcwSet = 0; 2138 #endif 2139 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked " 2140 "analytical case\n", 2141 gtid)); 2142 2143 trip = pr->u.p.tc; 2144 2145 KMP_DEBUG_ASSERT(th->th.th_team_nproc > 1); 2146 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)th->th.th_team_nproc < 2147 trip); 2148 2149 while (1) { /* this while loop is a safeguard against unexpected zero 2150 chunk sizes */ 2151 chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 2152 if (chunkIdx >= (UT)pr->u.p.parm2) { 2153 --trip; 2154 /* use dynamic-style scheduling */ 2155 init = chunkIdx * chunkspec + pr->u.p.count; 2156 /* need to verify init > 0 in case of overflow in the above 2157 * calculation */ 2158 if ((status = (init > 0 && init <= trip)) != 0) { 2159 limit = init + chunkspec - 1; 2160 2161 if ((last = (limit >= trip)) != 0) 2162 limit = trip; 2163 } 2164 break; 2165 } else { 2166 /* use exponential-style scheduling */ 2167 /* The following check is to workaround the lack of long double precision on 2168 Windows* OS. 2169 This check works around the possible effect that init != 0 for chunkIdx == 0. 2170 */ 2171 #if KMP_OS_WINDOWS && KMP_ARCH_X86 2172 /* If we haven't already done so, save original FPCW and set 2173 precision to 64-bit, as Windows* OS on IA-32 architecture 2174 defaults to 53-bit */ 2175 if (!fpcwSet) { 2176 oldFpcw = _control87(0, 0); 2177 _control87(_PC_64, _MCW_PC); 2178 fpcwSet = 0x30000; 2179 } 2180 #endif 2181 if (chunkIdx) { 2182 init = __kmp_dispatch_guided_remaining<T>( 2183 trip, *(DBL *)&pr->u.p.parm3, chunkIdx); 2184 KMP_DEBUG_ASSERT(init); 2185 init = trip - init; 2186 } else 2187 init = 0; 2188 limit = trip - __kmp_dispatch_guided_remaining<T>( 2189 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1); 2190 KMP_ASSERT(init <= limit); 2191 if (init < limit) { 2192 KMP_DEBUG_ASSERT(limit <= trip); 2193 --limit; 2194 status = 1; 2195 break; 2196 } // if 2197 } // if 2198 } // while (1) 2199 #if KMP_OS_WINDOWS && KMP_ARCH_X86 2200 /* restore FPCW if necessary 2201 AC: check fpcwSet flag first because oldFpcw can be uninitialized 2202 here */ 2203 if (fpcwSet && (oldFpcw & fpcwSet)) 2204 _control87(oldFpcw, _MCW_PC); 2205 #endif 2206 if (status != 0) { 2207 start = pr->u.p.lb; 2208 incr = pr->u.p.st; 2209 if (p_st != NULL) 2210 *p_st = incr; 2211 *p_lb = start + init * incr; 2212 *p_ub = start + limit * incr; 2213 if (pr->ordered) { 2214 pr->u.p.ordered_lower = init; 2215 pr->u.p.ordered_upper = limit; 2216 #ifdef KMP_DEBUG 2217 { 2218 char *buff; 2219 // create format specifiers before the debug output 2220 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 2221 "ordered_lower:%%%s ordered_upper:%%%s\n", 2222 traits_t<UT>::spec, traits_t<UT>::spec); 2223 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 2224 pr->u.p.ordered_upper)); 2225 __kmp_str_free(&buff); 2226 } 2227 #endif 2228 } 2229 } else { 2230 *p_lb = 0; 2231 *p_ub = 0; 2232 if (p_st != NULL) 2233 *p_st = 0; 2234 } 2235 } // case 2236 break; 2237 2238 case kmp_sch_trapezoidal: { 2239 UT index; 2240 T parm2 = pr->u.p.parm2; 2241 T parm3 = pr->u.p.parm3; 2242 T parm4 = pr->u.p.parm4; 2243 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n", 2244 gtid)); 2245 2246 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration); 2247 2248 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2; 2249 trip = pr->u.p.tc - 1; 2250 2251 if ((status = ((T)index < parm3 && init <= 
trip)) == 0) { 2252 *p_lb = 0; 2253 *p_ub = 0; 2254 if (p_st != NULL) 2255 *p_st = 0; 2256 } else { 2257 start = pr->u.p.lb; 2258 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1; 2259 incr = pr->u.p.st; 2260 2261 if ((last = (limit >= trip)) != 0) 2262 limit = trip; 2263 2264 if (p_st != NULL) 2265 *p_st = incr; 2266 2267 if (incr == 1) { 2268 *p_lb = start + init; 2269 *p_ub = start + limit; 2270 } else { 2271 *p_lb = start + init * incr; 2272 *p_ub = start + limit * incr; 2273 } 2274 2275 if (pr->ordered) { 2276 pr->u.p.ordered_lower = init; 2277 pr->u.p.ordered_upper = limit; 2278 #ifdef KMP_DEBUG 2279 { 2280 char *buff; 2281 // create format specifiers before the debug output 2282 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 2283 "ordered_lower:%%%s ordered_upper:%%%s\n", 2284 traits_t<UT>::spec, traits_t<UT>::spec); 2285 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 2286 pr->u.p.ordered_upper)); 2287 __kmp_str_free(&buff); 2288 } 2289 #endif 2290 } // if 2291 } // if 2292 } // case 2293 break; 2294 default: { 2295 status = 0; // to avoid complaints on uninitialized variable use 2296 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 2297 KMP_HNT(GetNewerLibrary), // Hint 2298 __kmp_msg_null // Variadic argument list terminator 2299 ); 2300 } break; 2301 } // switch 2302 } // if tc == 0; 2303 2304 if (status == 0) { 2305 UT num_done; 2306 2307 num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done); 2308 #ifdef KMP_DEBUG 2309 { 2310 char *buff; 2311 // create format specifiers before the debug output 2312 buff = __kmp_str_format( 2313 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", 2314 traits_t<UT>::spec); 2315 KD_TRACE(100, (buff, gtid, sh->u.s.num_done)); 2316 __kmp_str_free(&buff); 2317 } 2318 #endif 2319 2320 if ((ST)num_done == th->th.th_team_nproc - 1) { 2321 #if (KMP_STATIC_STEAL_ENABLED) 2322 if (pr->schedule == kmp_sch_static_steal && 2323 traits_t<T>::type_size > 4) { 2324 int i; 2325 kmp_info_t **other_threads = team->t.t_threads; 2326 // loop complete, safe to destroy locks used for stealing 2327 for (i = 0; i < th->th.th_team_nproc; ++i) { 2328 kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock; 2329 KMP_ASSERT(lck != NULL); 2330 __kmp_destroy_lock(lck); 2331 __kmp_free(lck); 2332 other_threads[i]->th.th_dispatch->th_steal_lock = NULL; 2333 } 2334 } 2335 #endif 2336 /* NOTE: release this buffer to be reused */ 2337 2338 KMP_MB(); /* Flush all pending memory write invalidates. */ 2339 2340 sh->u.s.num_done = 0; 2341 sh->u.s.iteration = 0; 2342 2343 /* TODO replace with general release procedure? */ 2344 if (pr->ordered) { 2345 sh->u.s.ordered_iteration = 0; 2346 } 2347 2348 KMP_MB(); /* Flush all pending memory write invalidates. */ 2349 2350 sh->buffer_index += __kmp_dispatch_num_buffers; 2351 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", 2352 gtid, sh->buffer_index)); 2353 2354 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 2355 2356 } // if 2357 if (__kmp_env_consistency_check) { 2358 if (pr->pushed_ws != ct_none) { 2359 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 2360 } 2361 } 2362 2363 th->th.th_dispatch->th_deo_fcn = NULL; 2364 th->th.th_dispatch->th_dxo_fcn = NULL; 2365 th->th.th_dispatch->th_dispatch_sh_current = NULL; 2366 th->th.th_dispatch->th_dispatch_pr_current = NULL; 2367 } // if (status == 0) 2368 #if KMP_OS_WINDOWS 2369 else if (last) { 2370 pr->u.p.last_upper = pr->u.p.ub; 2371 } 2372 #endif /* KMP_OS_WINDOWS */ 2373 if (p_last != NULL && status != 0) 2374 *p_last = last; 2375 } // if 2376 2377 #ifdef KMP_DEBUG 2378 { 2379 char *buff; 2380 // create format specifiers before the debug output 2381 buff = __kmp_str_format( 2382 "__kmp_dispatch_next: T#%%d normal case: " 2383 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n", 2384 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 2385 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status)); 2386 __kmp_str_free(&buff); 2387 } 2388 #endif 2389 #if INCLUDE_SSC_MARKS 2390 SSC_MARK_DISPATCH_NEXT(); 2391 #endif 2392 OMPT_LOOP_END; 2393 return status; 2394 } 2395 2396 template <typename T> 2397 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid, 2398 kmp_int32 *plastiter, T *plower, T *pupper, 2399 typename traits_t<T>::signed_t incr) { 2400 typedef typename traits_t<T>::unsigned_t UT; 2401 typedef typename traits_t<T>::signed_t ST; 2402 kmp_uint32 team_id; 2403 kmp_uint32 nteams; 2404 UT trip_count; 2405 kmp_team_t *team; 2406 kmp_info_t *th; 2407 2408 KMP_DEBUG_ASSERT(plastiter && plower && pupper); 2409 KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid)); 2410 #ifdef KMP_DEBUG 2411 { 2412 char *buff; 2413 // create format specifiers before the debug output 2414 buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d " 2415 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n", 2416 traits_t<T>::spec, traits_t<T>::spec, 2417 traits_t<ST>::spec, traits_t<T>::spec); 2418 KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr)); 2419 __kmp_str_free(&buff); 2420 } 2421 #endif 2422 2423 if (__kmp_env_consistency_check) { 2424 if (incr == 0) { 2425 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, 2426 loc); 2427 } 2428 if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) { 2429 // The loop is illegal. 
2430 // Some zero-trip loops maintained by compiler, e.g.: 2431 // for(i=10;i<0;++i) // lower >= upper - run-time check 2432 // for(i=0;i>10;--i) // lower <= upper - run-time check 2433 // for(i=0;i>10;++i) // incr > 0 - compile-time check 2434 // for(i=10;i<0;--i) // incr < 0 - compile-time check 2435 // Compiler does not check the following illegal loops: 2436 // for(i=0;i<10;i+=incr) // where incr<0 2437 // for(i=10;i>0;i-=incr) // where incr<0 2438 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc); 2439 } 2440 } 2441 th = __kmp_threads[gtid]; 2442 team = th->th.th_team; 2443 #if OMP_40_ENABLED 2444 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct 2445 nteams = th->th.th_teams_size.nteams; 2446 #endif 2447 team_id = team->t.t_master_tid; 2448 KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc); 2449 2450 // compute global trip count 2451 if (incr == 1) { 2452 trip_count = *pupper - *plower + 1; 2453 } else if (incr == -1) { 2454 trip_count = *plower - *pupper + 1; 2455 } else if (incr > 0) { 2456 // upper-lower can exceed the limit of signed type 2457 trip_count = (UT)(*pupper - *plower) / incr + 1; 2458 } else { 2459 trip_count = (UT)(*plower - *pupper) / (-incr) + 1; 2460 } 2461 2462 if (trip_count <= nteams) { 2463 KMP_DEBUG_ASSERT( 2464 __kmp_static == kmp_sch_static_greedy || 2465 __kmp_static == 2466 kmp_sch_static_balanced); // Unknown static scheduling type. 2467 // only some teams get single iteration, others get nothing 2468 if (team_id < trip_count) { 2469 *pupper = *plower = *plower + team_id * incr; 2470 } else { 2471 *plower = *pupper + incr; // zero-trip loop 2472 } 2473 if (plastiter != NULL) 2474 *plastiter = (team_id == trip_count - 1); 2475 } else { 2476 if (__kmp_static == kmp_sch_static_balanced) { 2477 UT chunk = trip_count / nteams; 2478 UT extras = trip_count % nteams; 2479 *plower += 2480 incr * (team_id * chunk + (team_id < extras ? team_id : extras)); 2481 *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr); 2482 if (plastiter != NULL) 2483 *plastiter = (team_id == nteams - 1); 2484 } else { 2485 T chunk_inc_count = 2486 (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr; 2487 T upper = *pupper; 2488 KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy); 2489 // Unknown static scheduling type. 2490 *plower += team_id * chunk_inc_count; 2491 *pupper = *plower + chunk_inc_count - incr; 2492 // Check/correct bounds if needed 2493 if (incr > 0) { 2494 if (*pupper < *plower) 2495 *pupper = traits_t<T>::max_value; 2496 if (plastiter != NULL) 2497 *plastiter = *plower <= upper && *pupper > upper - incr; 2498 if (*pupper > upper) 2499 *pupper = upper; // tracker C73258 2500 } else { 2501 if (*pupper > *plower) 2502 *pupper = traits_t<T>::min_value; 2503 if (plastiter != NULL) 2504 *plastiter = *plower >= upper && *pupper < upper - incr; 2505 if (*pupper < upper) 2506 *pupper = upper; // tracker C73258 2507 } 2508 } 2509 } 2510 } 2511 2512 //----------------------------------------------------------------------------- 2513 // Dispatch routines 2514 // Transfer call to template< type T > 2515 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule, 2516 // T lb, T ub, ST st, ST chunk ) 2517 extern "C" { 2518 2519 /*! 
2520 @ingroup WORK_SHARING 2521 @{ 2522 @param loc Source location 2523 @param gtid Global thread id 2524 @param schedule Schedule type 2525 @param lb Lower bound 2526 @param ub Upper bound 2527 @param st Step (or increment if you prefer) 2528 @param chunk The chunk size to block with 2529 2530 This function prepares the runtime to start a dynamically scheduled for loop, 2531 saving the loop arguments. 2532 These functions are all identical apart from the types of the arguments. 2533 */ 2534 2535 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2536 enum sched_type schedule, kmp_int32 lb, 2537 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) { 2538 KMP_DEBUG_ASSERT(__kmp_init_serial); 2539 #if OMPT_SUPPORT && OMPT_OPTIONAL 2540 OMPT_STORE_RETURN_ADDRESS(gtid); 2541 #endif 2542 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2543 } 2544 /*! 2545 See @ref __kmpc_dispatch_init_4 2546 */ 2547 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2548 enum sched_type schedule, kmp_uint32 lb, 2549 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) { 2550 KMP_DEBUG_ASSERT(__kmp_init_serial); 2551 #if OMPT_SUPPORT && OMPT_OPTIONAL 2552 OMPT_STORE_RETURN_ADDRESS(gtid); 2553 #endif 2554 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2555 } 2556 2557 /*! 2558 See @ref __kmpc_dispatch_init_4 2559 */ 2560 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2561 enum sched_type schedule, kmp_int64 lb, 2562 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) { 2563 KMP_DEBUG_ASSERT(__kmp_init_serial); 2564 #if OMPT_SUPPORT && OMPT_OPTIONAL 2565 OMPT_STORE_RETURN_ADDRESS(gtid); 2566 #endif 2567 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2568 } 2569 2570 /*! 2571 See @ref __kmpc_dispatch_init_4 2572 */ 2573 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2574 enum sched_type schedule, kmp_uint64 lb, 2575 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) { 2576 KMP_DEBUG_ASSERT(__kmp_init_serial); 2577 #if OMPT_SUPPORT && OMPT_OPTIONAL 2578 OMPT_STORE_RETURN_ADDRESS(gtid); 2579 #endif 2580 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2581 } 2582 2583 /*! 2584 See @ref __kmpc_dispatch_init_4 2585 2586 These functions differ from the __kmpc_dispatch_init set in that they are 2587 called for the composite distribute parallel for construct. Thus, before 2588 dispatching the regular iterations, the per-team iteration space must be calculated. 2589 2590 These functions are all identical apart from the types of the arguments. 
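Each of them first calls __kmp_dist_get_bounds() to clip the incoming lower and
upper bounds to the portion of the iteration space owned by the current team
(setting the last-iteration flag for the team that owns the final iteration),
and then forwards the clipped bounds to the regular __kmp_dispatch_init(), so
per-thread chunking afterwards proceeds exactly as in the non-distribute case.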
2591 */ 2592 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2593 enum sched_type schedule, kmp_int32 *p_last, 2594 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, 2595 kmp_int32 chunk) { 2596 KMP_DEBUG_ASSERT(__kmp_init_serial); 2597 #if OMPT_SUPPORT && OMPT_OPTIONAL 2598 OMPT_STORE_RETURN_ADDRESS(gtid); 2599 #endif 2600 __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st); 2601 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2602 } 2603 2604 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2605 enum sched_type schedule, kmp_int32 *p_last, 2606 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, 2607 kmp_int32 chunk) { 2608 KMP_DEBUG_ASSERT(__kmp_init_serial); 2609 #if OMPT_SUPPORT && OMPT_OPTIONAL 2610 OMPT_STORE_RETURN_ADDRESS(gtid); 2611 #endif 2612 __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st); 2613 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2614 } 2615 2616 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2617 enum sched_type schedule, kmp_int32 *p_last, 2618 kmp_int64 lb, kmp_int64 ub, kmp_int64 st, 2619 kmp_int64 chunk) { 2620 KMP_DEBUG_ASSERT(__kmp_init_serial); 2621 #if OMPT_SUPPORT && OMPT_OPTIONAL 2622 OMPT_STORE_RETURN_ADDRESS(gtid); 2623 #endif 2624 __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st); 2625 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2626 } 2627 2628 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2629 enum sched_type schedule, kmp_int32 *p_last, 2630 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, 2631 kmp_int64 chunk) { 2632 KMP_DEBUG_ASSERT(__kmp_init_serial); 2633 #if OMPT_SUPPORT && OMPT_OPTIONAL 2634 OMPT_STORE_RETURN_ADDRESS(gtid); 2635 #endif 2636 __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st); 2637 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2638 } 2639 2640 /*! 2641 @param loc Source code location 2642 @param gtid Global thread id 2643 @param p_last Pointer to a flag set to one if this is the last chunk or zero 2644 otherwise 2645 @param p_lb Pointer to the lower bound for the next chunk of work 2646 @param p_ub Pointer to the upper bound for the next chunk of work 2647 @param p_st Pointer to the stride for the next chunk of work 2648 @return one if there is work to be done, zero otherwise 2649 2650 Get the next dynamically allocated chunk of work for this thread. 2651 If there is no more work, then the lb,ub and stride need not be modified. 2652 */ 2653 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2654 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) { 2655 #if OMPT_SUPPORT && OMPT_OPTIONAL 2656 OMPT_STORE_RETURN_ADDRESS(gtid); 2657 #endif 2658 return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st 2659 #if OMPT_SUPPORT && OMPT_OPTIONAL 2660 , 2661 OMPT_LOAD_RETURN_ADDRESS(gtid) 2662 #endif 2663 ); 2664 } 2665 2666 /*! 2667 See @ref __kmpc_dispatch_next_4 2668 */ 2669 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2670 kmp_uint32 *p_lb, kmp_uint32 *p_ub, 2671 kmp_int32 *p_st) { 2672 #if OMPT_SUPPORT && OMPT_OPTIONAL 2673 OMPT_STORE_RETURN_ADDRESS(gtid); 2674 #endif 2675 return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st 2676 #if OMPT_SUPPORT && OMPT_OPTIONAL 2677 , 2678 OMPT_LOAD_RETURN_ADDRESS(gtid) 2679 #endif 2680 ); 2681 } 2682 2683 /*! 
2684 See @ref __kmpc_dispatch_next_4 2685 */ 2686 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2687 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) { 2688 #if OMPT_SUPPORT && OMPT_OPTIONAL 2689 OMPT_STORE_RETURN_ADDRESS(gtid); 2690 #endif 2691 return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st 2692 #if OMPT_SUPPORT && OMPT_OPTIONAL 2693 , 2694 OMPT_LOAD_RETURN_ADDRESS(gtid) 2695 #endif 2696 ); 2697 } 2698 2699 /*! 2700 See @ref __kmpc_dispatch_next_4 2701 */ 2702 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2703 kmp_uint64 *p_lb, kmp_uint64 *p_ub, 2704 kmp_int64 *p_st) { 2705 #if OMPT_SUPPORT && OMPT_OPTIONAL 2706 OMPT_STORE_RETURN_ADDRESS(gtid); 2707 #endif 2708 return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st 2709 #if OMPT_SUPPORT && OMPT_OPTIONAL 2710 , 2711 OMPT_LOAD_RETURN_ADDRESS(gtid) 2712 #endif 2713 ); 2714 } 2715 2716 /*! 2717 @param loc Source code location 2718 @param gtid Global thread id 2719 2720 Mark the end of a dynamic loop. 2721 */ 2722 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) { 2723 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2724 } 2725 2726 /*! 2727 See @ref __kmpc_dispatch_fini_4 2728 */ 2729 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) { 2730 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2731 } 2732 2733 /*! 2734 See @ref __kmpc_dispatch_fini_4 2735 */ 2736 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) { 2737 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2738 } 2739 2740 /*! 2741 See @ref __kmpc_dispatch_fini_4 2742 */ 2743 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) { 2744 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2745 } 2746 /*! @} */ 2747 2748 //----------------------------------------------------------------------------- 2749 // Non-template routines from kmp_dispatch.cpp used in other sources 2750 2751 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) { 2752 return value == checker; 2753 } 2754 2755 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) { 2756 return value != checker; 2757 } 2758 2759 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) { 2760 return value < checker; 2761 } 2762 2763 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) { 2764 return value >= checker; 2765 } 2766 2767 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) { 2768 return value <= checker; 2769 } 2770 2771 kmp_uint32 2772 __kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker, 2773 kmp_uint32 (*pred)(kmp_uint32, kmp_uint32), 2774 void *obj // Higher-level synchronization object, or NULL. 2775 ) { 2776 // note: we may not belong to a team at this point 2777 volatile kmp_uint32 *spin = spinner; 2778 kmp_uint32 check = checker; 2779 kmp_uint32 spins; 2780 kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred; 2781 kmp_uint32 r; 2782 2783 KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin)); 2784 KMP_INIT_YIELD(spins); 2785 // main wait spin loop 2786 while (!f(r = TCR_4(*spin), check)) { 2787 KMP_FSYNC_SPIN_PREPARE(obj); 2788 /* GEH - remove this since it was accidentally introduced when kmp_wait was 2789 split. 
It causes problems with infinite recursion because of exit lock */ 2790 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 2791 __kmp_abort_thread(); */ 2792 2793 /* if we have waited a bit, or are oversubscribed, yield */ 2794 /* pause is in the following code */ 2795 KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc); 2796 KMP_YIELD_SPIN(spins); 2797 } 2798 KMP_FSYNC_SPIN_ACQUIRED(obj); 2799 return r; 2800 } 2801 2802 void __kmp_wait_yield_4_ptr( 2803 void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32), 2804 void *obj // Higher-level synchronization object, or NULL. 2805 ) { 2806 // note: we may not belong to a team at this point 2807 void *spin = spinner; 2808 kmp_uint32 check = checker; 2809 kmp_uint32 spins; 2810 kmp_uint32 (*f)(void *, kmp_uint32) = pred; 2811 2812 KMP_FSYNC_SPIN_INIT(obj, spin); 2813 KMP_INIT_YIELD(spins); 2814 // main wait spin loop 2815 while (!f(spin, check)) { 2816 KMP_FSYNC_SPIN_PREPARE(obj); 2817 /* if we have waited a bit, or are oversubscribed, yield */ 2818 /* pause is in the following code */ 2819 KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc); 2820 KMP_YIELD_SPIN(spins); 2821 } 2822 KMP_FSYNC_SPIN_ACQUIRED(obj); 2823 } 2824 2825 } // extern "C" 2826 2827 #ifdef KMP_GOMP_COMPAT 2828 2829 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2830 enum sched_type schedule, kmp_int32 lb, 2831 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk, 2832 int push_ws) { 2833 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, 2834 push_ws); 2835 } 2836 2837 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2838 enum sched_type schedule, kmp_uint32 lb, 2839 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk, 2840 int push_ws) { 2841 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, 2842 push_ws); 2843 } 2844 2845 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2846 enum sched_type schedule, kmp_int64 lb, 2847 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk, 2848 int push_ws) { 2849 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, 2850 push_ws); 2851 } 2852 2853 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2854 enum sched_type schedule, kmp_uint64 lb, 2855 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk, 2856 int push_ws) { 2857 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, 2858 push_ws); 2859 } 2860 2861 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) { 2862 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc); 2863 } 2864 2865 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) { 2866 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc); 2867 } 2868 2869 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) { 2870 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc); 2871 } 2872 2873 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) { 2874 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc); 2875 } 2876 2877 #endif /* KMP_GOMP_COMPAT */ 2878 2879 /* ------------------------------------------------------------------------ */ 2880
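// The __kmpc_dispatch_* entry points above are normally called from
// compiler-generated code. As a rough usage sketch only (the helper name
// example_dynamic_loop, the body callback, and the way loc/gtid are obtained
// are illustrative assumptions, not part of this runtime), the 32-bit flavor
// of a schedule(dynamic, 4) loop over iterations 0..999 could be driven like
// this:
#if 0
static void example_dynamic_loop(ident_t *loc, kmp_int32 gtid,
                                 void (*body)(kmp_int32 i)) {
  kmp_int32 lb, ub, st, last;
  // Register the loop bounds, stride, chunk size and schedule with the runtime.
  __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked,
                         /*lb=*/0, /*ub=*/999, /*st=*/1, /*chunk=*/4);
  // Claim chunks until the runtime reports that no work remains; the bounds
  // returned in lb/ub are inclusive.
  while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
    for (kmp_int32 i = lb; i <= ub; i += st)
      body(i);
  }
}
#endif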