/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 */

//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//

/* Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
 *       it may change values between parallel regions. __kmp_max_nth
 *       is the largest value __kmp_nth may take, 1 is the smallest.
 */

// Need to raise Win version from XP to Vista here for support of
// InterlockedExchange64
#if defined(_WIN32_WINNT) && defined(_M_IX86)
#undef _WIN32_WINNT
#define _WIN32_WINNT 0x0502
#endif

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
#include <float.h>
#endif

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */

#if KMP_STATIC_STEAL_ENABLED

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  UT count; // unsigned
  T ub;
  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
  T lb;
  ST st; // signed
  UT tc; // unsigned
  T static_steal_counter; // for static_steal only; maybe better to put after ub

  /* parm[1-4] are used in different ways by different scheduling algorithms */

  // KMP_ALIGN(32) ensures (if the KMP_ALIGN macro is turned on)
  //    a) parm3 is properly aligned and
  //    b) all parm1-4 are in the same cache line.
  // Because parm1-4 are used together, performance seems to be better
  // if they are in the same line (not measured though).
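  // (With 32-byte alignment the four parameters occupy at most 4 * 8 = 32
  // bytes for a 64-bit T, so the aligned block below cannot straddle a
  // 64-byte cache line.)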
  struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
    T parm1;
    T parm2;
    T parm3;
    T parm4;
  };

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};

#else /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  T lb;
  T ub;
  ST st; // signed
  UT tc; // unsigned

  T parm1;
  T parm2;
  T parm3;
  T parm4;

  UT count; // unsigned

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};

#endif /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info structure and dispatch_private_info_t type
template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
  // duplicate alignment here, otherwise size of structure is not correct in
  // our compiler
  union KMP_ALIGN_CACHE private_info_tmpl {
    dispatch_private_infoXX_template<T> p;
    dispatch_private_info64_t p64;
  } u;
  enum sched_type schedule; /* scheduling algorithm */
  kmp_uint32 ordered; /* ordered clause specified */
  kmp_uint32 ordered_bumped;
  // To retain the structure size after making ordered_iteration scalar
  kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
  dispatch_private_info *next; /* stack of buffers for nest of serial regions */
  kmp_uint32 nomerge; /* don't merge iters if serialized */
  kmp_uint32 type_size;
  enum cons_type pushed_ws;
};

// replaces dispatch_shared_info{32,64} structures and
// dispatch_shared_info{32,64}_t types
template <typename UT> struct dispatch_shared_infoXX_template {
  /* chunk index under dynamic, number of idle threads under static-steal;
     iteration index otherwise */
  volatile UT iteration;
  volatile UT num_done;
  volatile UT ordered_iteration;
  // to retain the structure size making ordered_iteration scalar
  UT ordered_dummy[KMP_MAX_ORDERED - 3];
};

// replaces dispatch_shared_info structure and dispatch_shared_info_t type
template <typename UT> struct dispatch_shared_info_template {
  // we need union here to keep the structure size
  union shared_info_tmpl {
    dispatch_shared_infoXX_template<UT> s;
    dispatch_shared_info64_t s64;
  } u;
  volatile kmp_uint32 buffer_index;
#if OMP_45_ENABLED
  volatile kmp_int32 doacross_buf_idx; // teamwise index
  kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
  kmp_int32 doacross_num_done; // count finished threads
#endif
#if KMP_USE_HWLOC
  // When linking with libhwloc, the ORDERED EPCC test slows down on big
  // machines (> 48 cores). Performance analysis showed that a cache thrash
  // was occurring and this padding helps alleviate the problem.
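  // (The 64 bytes below amount to one full cache line of padding per shared
  // buffer; this is an empirical workaround, not a requirement of the
  // algorithm.)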
157 char padding[64]; 158 #endif 159 }; 160 161 /* ------------------------------------------------------------------------ */ 162 163 #undef USE_TEST_LOCKS 164 165 // test_then_add template (general template should NOT be used) 166 template <typename T> static __forceinline T test_then_add(volatile T *p, T d); 167 168 template <> 169 __forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p, 170 kmp_int32 d) { 171 kmp_int32 r; 172 r = KMP_TEST_THEN_ADD32(p, d); 173 return r; 174 } 175 176 template <> 177 __forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p, 178 kmp_int64 d) { 179 kmp_int64 r; 180 r = KMP_TEST_THEN_ADD64(p, d); 181 return r; 182 } 183 184 // test_then_inc_acq template (general template should NOT be used) 185 template <typename T> static __forceinline T test_then_inc_acq(volatile T *p); 186 187 template <> 188 __forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) { 189 kmp_int32 r; 190 r = KMP_TEST_THEN_INC_ACQ32(p); 191 return r; 192 } 193 194 template <> 195 __forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) { 196 kmp_int64 r; 197 r = KMP_TEST_THEN_INC_ACQ64(p); 198 return r; 199 } 200 201 // test_then_inc template (general template should NOT be used) 202 template <typename T> static __forceinline T test_then_inc(volatile T *p); 203 204 template <> 205 __forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) { 206 kmp_int32 r; 207 r = KMP_TEST_THEN_INC32(p); 208 return r; 209 } 210 211 template <> 212 __forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) { 213 kmp_int64 r; 214 r = KMP_TEST_THEN_INC64(p); 215 return r; 216 } 217 218 // compare_and_swap template (general template should NOT be used) 219 template <typename T> 220 static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s); 221 222 template <> 223 __forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p, 224 kmp_int32 c, kmp_int32 s) { 225 return KMP_COMPARE_AND_STORE_REL32(p, c, s); 226 } 227 228 template <> 229 __forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p, 230 kmp_int64 c, kmp_int64 s) { 231 return KMP_COMPARE_AND_STORE_REL64(p, c, s); 232 } 233 234 /* Spin wait loop that first does pause, then yield. 235 Waits until function returns non-zero when called with *spinner and check. 236 Does NOT put threads to sleep. 237 #if USE_ITT_BUILD 238 Arguments: 239 obj -- is higher-level synchronization object to report to ittnotify. 240 It is used to report locks consistently. For example, if lock is 241 acquired immediately, its address is reported to ittnotify via 242 KMP_FSYNC_ACQUIRED(). However, it lock cannot be acquired immediately 243 and lock routine calls to KMP_WAIT_YIELD(), the later should report the 244 same address, not an address of low-level spinner. 245 #endif // USE_ITT_BUILD 246 */ 247 template <typename UT> 248 // ToDo: make inline function (move to header file for icl) 249 static UT // unsigned 4- or 8-byte type 250 __kmp_wait_yield( 251 volatile UT *spinner, UT checker, 252 kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG( 253 void *obj) // Higher-level synchronization object, or NULL. 
254 ) { 255 // note: we may not belong to a team at this point 256 volatile UT *spin = spinner; 257 UT check = checker; 258 kmp_uint32 spins; 259 kmp_uint32 (*f)(UT, UT) = pred; 260 UT r; 261 262 KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin)); 263 KMP_INIT_YIELD(spins); 264 // main wait spin loop 265 while (!f(r = *spin, check)) { 266 KMP_FSYNC_SPIN_PREPARE(obj); 267 /* GEH - remove this since it was accidentally introduced when kmp_wait was 268 split. It causes problems with infinite recursion because of exit lock */ 269 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 270 __kmp_abort_thread(); */ 271 272 // if we are oversubscribed, or have waited a bit (and 273 // KMP_LIBRARY=throughput, then yield. pause is in the following code 274 KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc); 275 KMP_YIELD_SPIN(spins); 276 } 277 KMP_FSYNC_SPIN_ACQUIRED(obj); 278 return r; 279 } 280 281 template <typename UT> static kmp_uint32 __kmp_eq(UT value, UT checker) { 282 return value == checker; 283 } 284 285 template <typename UT> static kmp_uint32 __kmp_neq(UT value, UT checker) { 286 return value != checker; 287 } 288 289 template <typename UT> static kmp_uint32 __kmp_lt(UT value, UT checker) { 290 return value < checker; 291 } 292 293 template <typename UT> static kmp_uint32 __kmp_ge(UT value, UT checker) { 294 return value >= checker; 295 } 296 297 template <typename UT> static kmp_uint32 __kmp_le(UT value, UT checker) { 298 return value <= checker; 299 } 300 301 /* ------------------------------------------------------------------------ */ 302 303 static void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, 304 ident_t *loc_ref) { 305 kmp_info_t *th; 306 307 KMP_DEBUG_ASSERT(gtid_ref); 308 309 if (__kmp_env_consistency_check) { 310 th = __kmp_threads[*gtid_ref]; 311 if (th->th.th_root->r.r_active && 312 (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) { 313 #if KMP_USE_DYNAMIC_LOCK 314 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0); 315 #else 316 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL); 317 #endif 318 } 319 } 320 } 321 322 template <typename UT> 323 static void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 324 typedef typename traits_t<UT>::signed_t ST; 325 dispatch_private_info_template<UT> *pr; 326 327 int gtid = *gtid_ref; 328 // int cid = *cid_ref; 329 kmp_info_t *th = __kmp_threads[gtid]; 330 KMP_DEBUG_ASSERT(th->th.th_dispatch); 331 332 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid)); 333 if (__kmp_env_consistency_check) { 334 pr = reinterpret_cast<dispatch_private_info_template<UT> *>( 335 th->th.th_dispatch->th_dispatch_pr_current); 336 if (pr->pushed_ws != ct_none) { 337 #if KMP_USE_DYNAMIC_LOCK 338 __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0); 339 #else 340 __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL); 341 #endif 342 } 343 } 344 345 if (!th->th.th_team->t.t_serialized) { 346 dispatch_shared_info_template<UT> *sh = 347 reinterpret_cast<dispatch_shared_info_template<UT> *>( 348 th->th.th_dispatch->th_dispatch_sh_current); 349 UT lower; 350 351 if (!__kmp_env_consistency_check) { 352 pr = reinterpret_cast<dispatch_private_info_template<UT> *>( 353 th->th.th_dispatch->th_dispatch_pr_current); 354 } 355 lower = pr->u.p.ordered_lower; 356 357 #if !defined(KMP_GOMP_COMPAT) 358 if (__kmp_env_consistency_check) { 359 if (pr->ordered_bumped) { 360 struct cons_header *p = __kmp_threads[gtid]->th.th_cons; 361 __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting, 362 
ct_ordered_in_pdo, loc_ref, 363 &p->stack_data[p->w_top]); 364 } 365 } 366 #endif /* !defined(KMP_GOMP_COMPAT) */ 367 368 KMP_MB(); 369 #ifdef KMP_DEBUG 370 { 371 char *buff; 372 // create format specifiers before the debug output 373 buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: " 374 "ordered_iter:%%%s lower:%%%s\n", 375 traits_t<UT>::spec, traits_t<UT>::spec); 376 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 377 __kmp_str_free(&buff); 378 } 379 #endif 380 381 __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower, 382 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 383 KMP_MB(); /* is this necessary? */ 384 #ifdef KMP_DEBUG 385 { 386 char *buff; 387 // create format specifiers before the debug output 388 buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: " 389 "ordered_iter:%%%s lower:%%%s\n", 390 traits_t<UT>::spec, traits_t<UT>::spec); 391 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 392 __kmp_str_free(&buff); 393 } 394 #endif 395 } 396 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid)); 397 } 398 399 static void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, 400 ident_t *loc_ref) { 401 kmp_info_t *th; 402 403 if (__kmp_env_consistency_check) { 404 th = __kmp_threads[*gtid_ref]; 405 if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) { 406 __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref); 407 } 408 } 409 } 410 411 template <typename UT> 412 static void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 413 typedef typename traits_t<UT>::signed_t ST; 414 dispatch_private_info_template<UT> *pr; 415 416 int gtid = *gtid_ref; 417 // int cid = *cid_ref; 418 kmp_info_t *th = __kmp_threads[gtid]; 419 KMP_DEBUG_ASSERT(th->th.th_dispatch); 420 421 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid)); 422 if (__kmp_env_consistency_check) { 423 pr = reinterpret_cast<dispatch_private_info_template<UT> *>( 424 th->th.th_dispatch->th_dispatch_pr_current); 425 if (pr->pushed_ws != ct_none) { 426 __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref); 427 } 428 } 429 430 if (!th->th.th_team->t.t_serialized) { 431 dispatch_shared_info_template<UT> *sh = 432 reinterpret_cast<dispatch_shared_info_template<UT> *>( 433 th->th.th_dispatch->th_dispatch_sh_current); 434 435 if (!__kmp_env_consistency_check) { 436 pr = reinterpret_cast<dispatch_private_info_template<UT> *>( 437 th->th.th_dispatch->th_dispatch_pr_current); 438 } 439 440 KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration)); 441 #if !defined(KMP_GOMP_COMPAT) 442 if (__kmp_env_consistency_check) { 443 if (pr->ordered_bumped != 0) { 444 struct cons_header *p = __kmp_threads[gtid]->th.th_cons; 445 /* How to test it? - OM */ 446 __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting, 447 ct_ordered_in_pdo, loc_ref, 448 &p->stack_data[p->w_top]); 449 } 450 } 451 #endif /* !defined(KMP_GOMP_COMPAT) */ 452 453 KMP_MB(); /* Flush all pending memory write invalidates. */ 454 455 pr->ordered_bumped += 1; 456 457 KD_TRACE(1000, 458 ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n", 459 gtid, pr->ordered_bumped)); 460 461 KMP_MB(); /* Flush all pending memory write invalidates. */ 462 463 /* TODO use general release procedure? */ 464 test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration); 465 466 KMP_MB(); /* Flush all pending memory write invalidates. 
  */
  }
  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
}

// Computes and returns x to the power of y, where y must be a non-negative
// integer.
template <typename UT>
static __forceinline long double __kmp_pow(long double x, UT y) {
  long double s = 1.0L;

  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
  // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
  while (y) {
    if (y & 1)
      s *= x;
    x *= x;
    y >>= 1;
  }
  return s;
}

/* Computes and returns the number of unassigned iterations after idx chunks
   have been assigned (the total number of unassigned iterations in chunks with
   index greater than or equal to idx). __forceinline seems to be broken here:
   if this function is __forceinline'd, the behavior is wrong (one of the unit
   tests, sch_guided_analytical_basic.cpp, fails). */
template <typename T>
static __inline typename traits_t<T>::unsigned_t
__kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
                                typename traits_t<T>::unsigned_t idx) {
  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at least for
     ICL 8.1, long double arithmetic may not really have long double precision,
     even with /Qlong_double. Currently, we work around that in the caller
     code, by manipulating the FPCW for Windows* OS on IA-32 architecture. The
     lack of precision is not expected to be a correctness issue, though. */
  typedef typename traits_t<T>::unsigned_t UT;

  long double x = tc * __kmp_pow<UT>(base, idx);
  UT r = (UT)x;
  if (x == r)
    return r;
  return r + 1;
}

// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
//   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// By default n = 2. For example, with n = 3 the chunk distribution is flatter.
// With n = 1 the first chunk is the same as for a static schedule,
// i.e. trip / nproc.
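// Worked example (illustrative numbers, not taken from the source): with
// nproc = 4, chunk = 7 and the default n = 2,
//   p2 = 2 * 4 * (7 + 1) = 64    -- switch to plain dynamic once fewer than
//                                   64 iterations remain unassigned
//   p3 = 1 / (2 * 4)     = 0.125 -- each new chunk takes ~1/8 of whatever
//                                   remains unassigned
// For the analytical variant, the iterations remaining after idx chunks are
// estimated as tc * x^idx with x = 1 - 0.5 / nproc, which is the quantity
// __kmp_dispatch_guided_remaining() rounds up to an integer.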
516 static int guided_int_param = 2; 517 static double guided_flt_param = 0.5; // = 1.0 / guided_int_param; 518 519 // UT - unsigned flavor of T, ST - signed flavor of T, 520 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8 521 template <typename T> 522 static void 523 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb, 524 T ub, typename traits_t<T>::signed_t st, 525 typename traits_t<T>::signed_t chunk, int push_ws) { 526 typedef typename traits_t<T>::unsigned_t UT; 527 typedef typename traits_t<T>::signed_t ST; 528 typedef typename traits_t<T>::floating_t DBL; 529 530 int active; 531 T tc; 532 kmp_info_t *th; 533 kmp_team_t *team; 534 kmp_uint32 my_buffer_index; 535 dispatch_private_info_template<T> *pr; 536 dispatch_shared_info_template<UT> volatile *sh; 537 538 KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) == 539 sizeof(dispatch_private_info)); 540 KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) == 541 sizeof(dispatch_shared_info)); 542 543 if (!TCR_4(__kmp_init_parallel)) 544 __kmp_parallel_initialize(); 545 546 #if INCLUDE_SSC_MARKS 547 SSC_MARK_DISPATCH_INIT(); 548 #endif 549 #ifdef KMP_DEBUG 550 { 551 char *buff; 552 // create format specifiers before the debug output 553 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d " 554 "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n", 555 traits_t<ST>::spec, traits_t<T>::spec, 556 traits_t<T>::spec, traits_t<ST>::spec); 557 KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st)); 558 __kmp_str_free(&buff); 559 } 560 #endif 561 /* setup data */ 562 th = __kmp_threads[gtid]; 563 team = th->th.th_team; 564 active = !team->t.t_serialized; 565 th->th.th_ident = loc; 566 567 #if USE_ITT_BUILD 568 kmp_uint64 cur_chunk = chunk; 569 int itt_need_metadata_reporting = __itt_metadata_add_ptr && 570 __kmp_forkjoin_frames_mode == 3 && 571 KMP_MASTER_GTID(gtid) && 572 #if OMP_40_ENABLED 573 th->th.th_teams_microtask == NULL && 574 #endif 575 team->t.t_active_level == 1; 576 #endif 577 if (!active) { 578 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 579 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 580 } else { 581 KMP_DEBUG_ASSERT(th->th.th_dispatch == 582 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 583 584 my_buffer_index = th->th.th_dispatch->th_disp_index++; 585 586 /* What happens when number of threads changes, need to resize buffer? 
*/ 587 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 588 &th->th.th_dispatch 589 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 590 sh = reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 591 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); 592 } 593 594 #if (KMP_STATIC_STEAL_ENABLED) 595 if (SCHEDULE_HAS_NONMONOTONIC(schedule)) 596 // AC: we now have only one implementation of stealing, so use it 597 schedule = kmp_sch_static_steal; 598 else 599 #endif 600 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 601 602 /* Pick up the nomerge/ordered bits from the scheduling type */ 603 if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) { 604 pr->nomerge = TRUE; 605 schedule = 606 (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); 607 } else { 608 pr->nomerge = FALSE; 609 } 610 pr->type_size = traits_t<T>::type_size; // remember the size of variables 611 if (kmp_ord_lower & schedule) { 612 pr->ordered = TRUE; 613 schedule = 614 (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); 615 } else { 616 pr->ordered = FALSE; 617 } 618 619 if (schedule == kmp_sch_static) { 620 schedule = __kmp_static; 621 } else { 622 if (schedule == kmp_sch_runtime) { 623 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if 624 // not specified) 625 schedule = team->t.t_sched.r_sched_type; 626 // Detail the schedule if needed (global controls are differentiated 627 // appropriately) 628 if (schedule == kmp_sch_guided_chunked) { 629 schedule = __kmp_guided; 630 } else if (schedule == kmp_sch_static) { 631 schedule = __kmp_static; 632 } 633 // Use the chunk size specified by OMP_SCHEDULE (or default if not 634 // specified) 635 chunk = team->t.t_sched.chunk; 636 #if USE_ITT_BUILD 637 cur_chunk = chunk; 638 #endif 639 #ifdef KMP_DEBUG 640 { 641 char *buff; 642 // create format specifiers before the debug output 643 buff = __kmp_str_format( 644 "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n", 645 traits_t<ST>::spec); 646 KD_TRACE(10, (buff, gtid, schedule, chunk)); 647 __kmp_str_free(&buff); 648 } 649 #endif 650 } else { 651 if (schedule == kmp_sch_guided_chunked) { 652 schedule = __kmp_guided; 653 } 654 if (chunk <= 0) { 655 chunk = KMP_DEFAULT_CHUNK; 656 } 657 } 658 659 if (schedule == kmp_sch_auto) { 660 // mapping and differentiation: in the __kmp_do_serial_initialize() 661 schedule = __kmp_auto; 662 #ifdef KMP_DEBUG 663 { 664 char *buff; 665 // create format specifiers before the debug output 666 buff = __kmp_str_format("__kmp_dispatch_init: kmp_sch_auto: T#%%d new: " 667 "schedule:%%d chunk:%%%s\n", 668 traits_t<ST>::spec); 669 KD_TRACE(10, (buff, gtid, schedule, chunk)); 670 __kmp_str_free(&buff); 671 } 672 #endif 673 } 674 675 /* guided analytical not safe for too many threads */ 676 if (schedule == kmp_sch_guided_analytical_chunked && 677 th->th.th_team_nproc > 1 << 20) { 678 schedule = kmp_sch_guided_iterative_chunked; 679 KMP_WARNING(DispatchManyThreads); 680 } 681 if (schedule == kmp_sch_runtime_simd) { 682 // compiler provides simd_width in the chunk parameter 683 schedule = team->t.t_sched.r_sched_type; 684 // Detail the schedule if needed (global controls are differentiated 685 // appropriately) 686 if (schedule == kmp_sch_static || schedule == kmp_sch_auto || 687 schedule == __kmp_static) { 688 schedule = kmp_sch_static_balanced_chunked; 689 } else { 690 if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) { 691 schedule = kmp_sch_guided_simd; 692 } 693 
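      // Note: 'chunk' still holds the simd width supplied by the compiler
      // (see the comment above), so e.g. a runtime chunk of 64 combined with
      // a simd width of 8 gives an effective chunk of 512 iterations below
      // (illustrative numbers, not from the source).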
chunk = team->t.t_sched.chunk * chunk; 694 } 695 #if USE_ITT_BUILD 696 cur_chunk = chunk; 697 #endif 698 #ifdef KMP_DEBUG 699 { 700 char *buff; 701 // create format specifiers before the debug output 702 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d" 703 " chunk:%%%s\n", 704 traits_t<ST>::spec); 705 KD_TRACE(10, (buff, gtid, schedule, chunk)); 706 __kmp_str_free(&buff); 707 } 708 #endif 709 } 710 pr->u.p.parm1 = chunk; 711 } 712 KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper), 713 "unknown scheduling type"); 714 715 pr->u.p.count = 0; 716 717 if (__kmp_env_consistency_check) { 718 if (st == 0) { 719 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, 720 (pr->ordered ? ct_pdo_ordered : ct_pdo), loc); 721 } 722 } 723 // compute trip count 724 if (st == 1) { // most common case 725 if (ub >= lb) { 726 tc = ub - lb + 1; 727 } else { // ub < lb 728 tc = 0; // zero-trip 729 } 730 } else if (st < 0) { 731 if (lb >= ub) { 732 // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B), 733 // where the division needs to be unsigned regardless of the result type 734 tc = (UT)(lb - ub) / (-st) + 1; 735 } else { // lb < ub 736 tc = 0; // zero-trip 737 } 738 } else { // st > 0 739 if (ub >= lb) { 740 // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B), 741 // where the division needs to be unsigned regardless of the result type 742 tc = (UT)(ub - lb) / st + 1; 743 } else { // ub < lb 744 tc = 0; // zero-trip 745 } 746 } 747 748 // Any half-decent optimizer will remove this test when the blocks are empty 749 // since the macros expand to nothing when statistics are disabled. 750 if (schedule == __kmp_static) { 751 KMP_COUNT_BLOCK(OMP_FOR_static); 752 KMP_COUNT_VALUE(FOR_static_iterations, tc); 753 } else { 754 KMP_COUNT_BLOCK(OMP_FOR_dynamic); 755 KMP_COUNT_VALUE(FOR_dynamic_iterations, tc); 756 } 757 758 pr->u.p.lb = lb; 759 pr->u.p.ub = ub; 760 pr->u.p.st = st; 761 pr->u.p.tc = tc; 762 763 #if KMP_OS_WINDOWS 764 pr->u.p.last_upper = ub + st; 765 #endif /* KMP_OS_WINDOWS */ 766 767 /* NOTE: only the active parallel region(s) has active ordered sections */ 768 769 if (active) { 770 if (pr->ordered == 0) { 771 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error; 772 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error; 773 } else { 774 pr->ordered_bumped = 0; 775 776 pr->u.p.ordered_lower = 1; 777 pr->u.p.ordered_upper = 0; 778 779 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>; 780 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>; 781 } 782 } 783 784 if (__kmp_env_consistency_check) { 785 enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo; 786 if (push_ws) { 787 __kmp_push_workshare(gtid, ws, loc); 788 pr->pushed_ws = ws; 789 } else { 790 __kmp_check_workshare(gtid, ws, loc); 791 pr->pushed_ws = ct_none; 792 } 793 } 794 795 switch (schedule) { 796 #if (KMP_STATIC_STEAL_ENABLED) 797 case kmp_sch_static_steal: { 798 T nproc = th->th.th_team_nproc; 799 T ntc, init; 800 801 KD_TRACE(100, 802 ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid)); 803 804 ntc = (tc % chunk ? 1 : 0) + tc / chunk; 805 if (nproc > 1 && ntc >= nproc) { 806 KMP_COUNT_BLOCK(OMP_FOR_static_steal); 807 T id = __kmp_tid_from_gtid(gtid); 808 T small_chunk, extras; 809 810 small_chunk = ntc / nproc; 811 extras = ntc % nproc; 812 813 init = id * small_chunk + (id < extras ? id : extras); 814 pr->u.p.count = init; 815 pr->u.p.ub = init + small_chunk + (id < extras ? 
1 : 0); 816 817 pr->u.p.parm2 = lb; 818 // pr->pfields.parm3 = 0; // it's not used in static_steal 819 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid 820 pr->u.p.st = st; 821 if (traits_t<T>::type_size > 4) { 822 // AC: TODO: check if 16-byte CAS available and use it to 823 // improve performance (probably wait for explicit request 824 // before spending time on this). 825 // For now use dynamically allocated per-thread lock, 826 // free memory in __kmp_dispatch_next when status==0. 827 KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL); 828 th->th.th_dispatch->th_steal_lock = 829 (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t)); 830 __kmp_init_lock(th->th.th_dispatch->th_steal_lock); 831 } 832 break; 833 } else { 834 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to " 835 "kmp_sch_static_balanced\n", 836 gtid)); 837 schedule = kmp_sch_static_balanced; 838 /* too few iterations: fall-through to kmp_sch_static_balanced */ 839 } // if 840 /* FALL-THROUGH to static balanced */ 841 } // case 842 #endif 843 case kmp_sch_static_balanced: { 844 T nproc = th->th.th_team_nproc; 845 T init, limit; 846 847 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n", 848 gtid)); 849 850 if (nproc > 1) { 851 T id = __kmp_tid_from_gtid(gtid); 852 853 if (tc < nproc) { 854 if (id < tc) { 855 init = id; 856 limit = id; 857 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ 858 } else { 859 pr->u.p.count = 1; /* means no more chunks to execute */ 860 pr->u.p.parm1 = FALSE; 861 break; 862 } 863 } else { 864 T small_chunk = tc / nproc; 865 T extras = tc % nproc; 866 init = id * small_chunk + (id < extras ? id : extras); 867 limit = init + small_chunk - (id < extras ? 0 : 1); 868 pr->u.p.parm1 = (id == nproc - 1); 869 } 870 } else { 871 if (tc > 0) { 872 init = 0; 873 limit = tc - 1; 874 pr->u.p.parm1 = TRUE; 875 } else { // zero trip count 876 pr->u.p.count = 1; /* means no more chunks to execute */ 877 pr->u.p.parm1 = FALSE; 878 break; 879 } 880 } 881 #if USE_ITT_BUILD 882 // Calculate chunk for metadata report 883 if (itt_need_metadata_reporting) 884 cur_chunk = limit - init + 1; 885 #endif 886 if (st == 1) { 887 pr->u.p.lb = lb + init; 888 pr->u.p.ub = lb + limit; 889 } else { 890 // calculated upper bound, "ub" is user-defined upper bound 891 T ub_tmp = lb + limit * st; 892 pr->u.p.lb = lb + init * st; 893 // adjust upper bound to "ub" if needed, so that MS lastprivate will match 894 // it exactly 895 if (st > 0) { 896 pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp); 897 } else { 898 pr->u.p.ub = (ub_tmp + st < ub ? 
ub : ub_tmp); 899 } 900 } 901 if (pr->ordered) { 902 pr->u.p.ordered_lower = init; 903 pr->u.p.ordered_upper = limit; 904 } 905 break; 906 } // case 907 case kmp_sch_static_balanced_chunked: { 908 // similar to balanced, but chunk adjusted to multiple of simd width 909 T nth = th->th.th_team_nproc; 910 KD_TRACE(100, ("__kmp_dispatch_init: T#%d runtime(simd:static)" 911 " -> falling-through to static_greedy\n", 912 gtid)); 913 schedule = kmp_sch_static_greedy; 914 if (nth > 1) 915 pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1); 916 else 917 pr->u.p.parm1 = tc; 918 break; 919 } // case 920 case kmp_sch_guided_iterative_chunked: 921 case kmp_sch_guided_simd: { 922 T nproc = th->th.th_team_nproc; 923 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked" 924 " case\n", 925 gtid)); 926 927 if (nproc > 1) { 928 if ((2L * chunk + 1) * nproc >= tc) { 929 /* chunk size too large, switch to dynamic */ 930 schedule = kmp_sch_dynamic_chunked; 931 } else { 932 // when remaining iters become less than parm2 - switch to dynamic 933 pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1); 934 *(double *)&pr->u.p.parm3 = 935 guided_flt_param / nproc; // may occupy parm3 and parm4 936 } 937 } else { 938 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to " 939 "kmp_sch_static_greedy\n", 940 gtid)); 941 schedule = kmp_sch_static_greedy; 942 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 943 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n", 944 gtid)); 945 pr->u.p.parm1 = tc; 946 } // if 947 } // case 948 break; 949 case kmp_sch_guided_analytical_chunked: { 950 T nproc = th->th.th_team_nproc; 951 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked" 952 " case\n", 953 gtid)); 954 if (nproc > 1) { 955 if ((2L * chunk + 1) * nproc >= tc) { 956 /* chunk size too large, switch to dynamic */ 957 schedule = kmp_sch_dynamic_chunked; 958 } else { 959 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 960 DBL x; 961 962 #if KMP_OS_WINDOWS && KMP_ARCH_X86 963 /* Linux* OS already has 64-bit computation by default for long double, 964 and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On 965 Windows* OS on IA-32 architecture, we need to set precision to 64-bit 966 instead of the default 53-bit. Even though long double doesn't work 967 on Windows* OS on Intel(R) 64, the resulting lack of precision is not 968 expected to impact the correctness of the algorithm, but this has not 969 been mathematically proven. 
*/ 970 // save original FPCW and set precision to 64-bit, as 971 // Windows* OS on IA-32 architecture defaults to 53-bit 972 unsigned int oldFpcw = _control87(0, 0); 973 _control87(_PC_64, _MCW_PC); // 0,0x30000 974 #endif 975 /* value used for comparison in solver for cross-over point */ 976 long double target = ((long double)chunk * 2 + 1) * nproc / tc; 977 978 /* crossover point--chunk indexes equal to or greater than 979 this point switch to dynamic-style scheduling */ 980 UT cross; 981 982 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 983 x = (long double)1.0 - (long double)0.5 / nproc; 984 985 #ifdef KMP_DEBUG 986 { // test natural alignment 987 struct _test_a { 988 char a; 989 union { 990 char b; 991 DBL d; 992 }; 993 } t; 994 ptrdiff_t natural_alignment = 995 (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; 996 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long 997 // long)natural_alignment ); 998 KMP_DEBUG_ASSERT( 999 (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0); 1000 } 1001 #endif // KMP_DEBUG 1002 1003 /* save the term in thread private dispatch structure */ 1004 *(DBL *)&pr->u.p.parm3 = x; 1005 1006 /* solve for the crossover point to the nearest integer i for which C_i 1007 <= chunk */ 1008 { 1009 UT left, right, mid; 1010 long double p; 1011 1012 /* estimate initial upper and lower bound */ 1013 1014 /* doesn't matter what value right is as long as it is positive, but 1015 it affects performance of the solver */ 1016 right = 229; 1017 p = __kmp_pow<UT>(x, right); 1018 if (p > target) { 1019 do { 1020 p *= p; 1021 right <<= 1; 1022 } while (p > target && right < (1 << 27)); 1023 /* lower bound is previous (failed) estimate of upper bound */ 1024 left = right >> 1; 1025 } else { 1026 left = 0; 1027 } 1028 1029 /* bisection root-finding method */ 1030 while (left + 1 < right) { 1031 mid = (left + right) / 2; 1032 if (__kmp_pow<UT>(x, mid) > target) { 1033 left = mid; 1034 } else { 1035 right = mid; 1036 } 1037 } // while 1038 cross = right; 1039 } 1040 /* assert sanity of computed crossover point */ 1041 KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target && 1042 __kmp_pow<UT>(x, cross) <= target); 1043 1044 /* save the crossover point in thread private dispatch structure */ 1045 pr->u.p.parm2 = cross; 1046 1047 // C75803 1048 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8)) 1049 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3) 1050 #else 1051 #define GUIDED_ANALYTICAL_WORKAROUND (x) 1052 #endif 1053 /* dynamic-style scheduling offset */ 1054 pr->u.p.count = tc - __kmp_dispatch_guided_remaining( 1055 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - 1056 cross * chunk; 1057 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1058 // restore FPCW 1059 _control87(oldFpcw, _MCW_PC); 1060 #endif 1061 } // if 1062 } else { 1063 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to " 1064 "kmp_sch_static_greedy\n", 1065 gtid)); 1066 schedule = kmp_sch_static_greedy; 1067 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 1068 pr->u.p.parm1 = tc; 1069 } // if 1070 } // case 1071 break; 1072 case kmp_sch_static_greedy: 1073 KD_TRACE(100, 1074 ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n", gtid)); 1075 pr->u.p.parm1 = (th->th.th_team_nproc > 1) 1076 ? 
(tc + th->th.th_team_nproc - 1) / th->th.th_team_nproc 1077 : tc; 1078 break; 1079 case kmp_sch_static_chunked: 1080 case kmp_sch_dynamic_chunked: 1081 if (pr->u.p.parm1 <= 0) { 1082 pr->u.p.parm1 = KMP_DEFAULT_CHUNK; 1083 } 1084 KD_TRACE(100, ("__kmp_dispatch_init: T#%d " 1085 "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", 1086 gtid)); 1087 break; 1088 case kmp_sch_trapezoidal: { 1089 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ 1090 1091 T parm1, parm2, parm3, parm4; 1092 KD_TRACE(100, 1093 ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid)); 1094 1095 parm1 = chunk; 1096 1097 /* F : size of the first cycle */ 1098 parm2 = (tc / (2 * th->th.th_team_nproc)); 1099 1100 if (parm2 < 1) { 1101 parm2 = 1; 1102 } 1103 1104 /* L : size of the last cycle. Make sure the last cycle is not larger 1105 than the first cycle. */ 1106 if (parm1 < 1) { 1107 parm1 = 1; 1108 } else if (parm1 > parm2) { 1109 parm1 = parm2; 1110 } 1111 1112 /* N : number of cycles */ 1113 parm3 = (parm2 + parm1); 1114 parm3 = (2 * tc + parm3 - 1) / parm3; 1115 1116 if (parm3 < 2) { 1117 parm3 = 2; 1118 } 1119 1120 /* sigma : decreasing incr of the trapezoid */ 1121 parm4 = (parm3 - 1); 1122 parm4 = (parm2 - parm1) / parm4; 1123 1124 // pointless check, because parm4 >= 0 always 1125 // if ( parm4 < 0 ) { 1126 // parm4 = 0; 1127 //} 1128 1129 pr->u.p.parm1 = parm1; 1130 pr->u.p.parm2 = parm2; 1131 pr->u.p.parm3 = parm3; 1132 pr->u.p.parm4 = parm4; 1133 } // case 1134 break; 1135 1136 default: { 1137 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 1138 KMP_HNT(GetNewerLibrary), // Hint 1139 __kmp_msg_null // Variadic argument list terminator 1140 ); 1141 } break; 1142 } // switch 1143 pr->schedule = schedule; 1144 if (active) { 1145 /* The name of this buffer should be my_buffer_index when it's free to use 1146 * it */ 1147 1148 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d " 1149 "sh->buffer_index:%d\n", 1150 gtid, my_buffer_index, sh->buffer_index)); 1151 __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index, 1152 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL)); 1153 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and 1154 // my_buffer_index are *always* 32-bit integers. 1155 KMP_MB(); /* is this necessary? */ 1156 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d " 1157 "sh->buffer_index:%d\n", 1158 gtid, my_buffer_index, sh->buffer_index)); 1159 1160 th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr; 1161 th->th.th_dispatch->th_dispatch_sh_current = 1162 CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh); 1163 #if USE_ITT_BUILD 1164 if (pr->ordered) { 1165 __kmp_itt_ordered_init(gtid); 1166 } 1167 // Report loop metadata 1168 if (itt_need_metadata_reporting) { 1169 // Only report metadata by master of active team at level 1 1170 kmp_uint64 schedtype = 0; 1171 switch (schedule) { 1172 case kmp_sch_static_chunked: 1173 case kmp_sch_static_balanced: // Chunk is calculated in the switch above 1174 break; 1175 case kmp_sch_static_greedy: 1176 cur_chunk = pr->u.p.parm1; 1177 break; 1178 case kmp_sch_dynamic_chunked: 1179 schedtype = 1; 1180 break; 1181 case kmp_sch_guided_iterative_chunked: 1182 case kmp_sch_guided_analytical_chunked: 1183 case kmp_sch_guided_simd: 1184 schedtype = 2; 1185 break; 1186 default: 1187 // Should we put this case under "static"? 
      // case kmp_sch_static_steal:
      schedtype = 3;
      break;
      }
      __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
    }
#endif /* USE_ITT_BUILD */
  }

#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
        "lb:%%%s ub:%%%s"
        " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
        " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
        traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
        traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
        traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
        traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
                  pr->u.p.st, pr->u.p.tc, pr->u.p.count, pr->u.p.ordered_lower,
                  pr->u.p.ordered_upper, pr->u.p.parm1, pr->u.p.parm2,
                  pr->u.p.parm3, pr->u.p.parm4));
    __kmp_str_free(&buff);
  }
#endif
#if (KMP_STATIC_STEAL_ENABLED)
  // It cannot be guaranteed that after execution of a loop with some other
  // schedule kind all the parm3 variables will contain the same value. Even if
  // all parm3 were the same, a bad case could still arise, e.g. the value
  // alternating between 0 and 1 rather than increasing over the program's
  // lifetime. So a dedicated variable is required; 'static_steal_counter' is
  // used.
  if (schedule == kmp_sch_static_steal) {
    // Other threads will inspect this variable when searching for a victim.
    // This is a flag showing that other threads may steal from this thread
    // from now on.
    volatile T *p = &pr->u.p.static_steal_counter;
    *p = *p + 1;
  }
#endif // ( KMP_STATIC_STEAL_ENABLED )

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    kmp_info_t *thr = __kmp_threads[gtid];
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
        &(task_info->task_data), tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
  }
#endif
}

/* For ordered loops, either __kmp_dispatch_finish() should be called after
 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
 * every chunk of iterations. If the ordered section(s) were not executed
 * for this iteration (or every iteration in this chunk), we need to set the
 * ordered iteration counters so that the next thread can proceed.
*/ 1249 template <typename UT> 1250 static void __kmp_dispatch_finish(int gtid, ident_t *loc) { 1251 typedef typename traits_t<UT>::signed_t ST; 1252 kmp_info_t *th = __kmp_threads[gtid]; 1253 1254 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid)); 1255 if (!th->th.th_team->t.t_serialized) { 1256 1257 dispatch_private_info_template<UT> *pr = 1258 reinterpret_cast<dispatch_private_info_template<UT> *>( 1259 th->th.th_dispatch->th_dispatch_pr_current); 1260 dispatch_shared_info_template<UT> volatile *sh = 1261 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 1262 th->th.th_dispatch->th_dispatch_sh_current); 1263 KMP_DEBUG_ASSERT(pr); 1264 KMP_DEBUG_ASSERT(sh); 1265 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1266 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1267 1268 if (pr->ordered_bumped) { 1269 KD_TRACE( 1270 1000, 1271 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1272 gtid)); 1273 pr->ordered_bumped = 0; 1274 } else { 1275 UT lower = pr->u.p.ordered_lower; 1276 1277 #ifdef KMP_DEBUG 1278 { 1279 char *buff; 1280 // create format specifiers before the debug output 1281 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: " 1282 "ordered_iteration:%%%s lower:%%%s\n", 1283 traits_t<UT>::spec, traits_t<UT>::spec); 1284 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 1285 __kmp_str_free(&buff); 1286 } 1287 #endif 1288 1289 __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower, 1290 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 1291 KMP_MB(); /* is this necessary? */ 1292 #ifdef KMP_DEBUG 1293 { 1294 char *buff; 1295 // create format specifiers before the debug output 1296 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: " 1297 "ordered_iteration:%%%s lower:%%%s\n", 1298 traits_t<UT>::spec, traits_t<UT>::spec); 1299 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); 1300 __kmp_str_free(&buff); 1301 } 1302 #endif 1303 1304 test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration); 1305 } // if 1306 } // if 1307 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid)); 1308 } 1309 1310 #ifdef KMP_GOMP_COMPAT 1311 1312 template <typename UT> 1313 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) { 1314 typedef typename traits_t<UT>::signed_t ST; 1315 kmp_info_t *th = __kmp_threads[gtid]; 1316 1317 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid)); 1318 if (!th->th.th_team->t.t_serialized) { 1319 // int cid; 1320 dispatch_private_info_template<UT> *pr = 1321 reinterpret_cast<dispatch_private_info_template<UT> *>( 1322 th->th.th_dispatch->th_dispatch_pr_current); 1323 dispatch_shared_info_template<UT> volatile *sh = 1324 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>( 1325 th->th.th_dispatch->th_dispatch_sh_current); 1326 KMP_DEBUG_ASSERT(pr); 1327 KMP_DEBUG_ASSERT(sh); 1328 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1329 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1330 1331 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) { 1332 UT lower = pr->u.p.ordered_lower; 1333 UT upper = pr->u.p.ordered_upper; 1334 UT inc = upper - lower + 1; 1335 1336 if (pr->ordered_bumped == inc) { 1337 KD_TRACE( 1338 1000, 1339 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1340 gtid)); 1341 pr->ordered_bumped = 0; 1342 } else { 1343 inc -= pr->ordered_bumped; 1344 1345 #ifdef KMP_DEBUG 1346 { 1347 char *buff; 1348 // create format specifiers before the debug output 1349 buff = __kmp_str_format( 1350 
"__kmp_dispatch_finish_chunk: T#%%d before wait: " 1351 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n", 1352 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec); 1353 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper)); 1354 __kmp_str_free(&buff); 1355 } 1356 #endif 1357 1358 __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower, 1359 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); 1360 1361 KMP_MB(); /* is this necessary? */ 1362 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting " 1363 "ordered_bumped to zero\n", 1364 gtid)); 1365 pr->ordered_bumped = 0; 1366 //!!!!! TODO check if the inc should be unsigned, or signed??? 1367 #ifdef KMP_DEBUG 1368 { 1369 char *buff; 1370 // create format specifiers before the debug output 1371 buff = __kmp_str_format( 1372 "__kmp_dispatch_finish_chunk: T#%%d after wait: " 1373 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n", 1374 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec, 1375 traits_t<UT>::spec); 1376 KD_TRACE(1000, 1377 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper)); 1378 __kmp_str_free(&buff); 1379 } 1380 #endif 1381 1382 test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc); 1383 } 1384 // } 1385 } 1386 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid)); 1387 } 1388 1389 #endif /* KMP_GOMP_COMPAT */ 1390 1391 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more 1392 work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini() 1393 is not called. */ 1394 #if OMPT_SUPPORT && OMPT_OPTIONAL 1395 #define OMPT_LOOP_END \ 1396 if (status == 0) { \ 1397 if (ompt_enabled.ompt_callback_work) { \ 1398 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \ 1399 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \ 1400 ompt_callbacks.ompt_callback(ompt_callback_work)( \ 1401 ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \ 1402 &(task_info->task_data), 0, codeptr); \ 1403 } \ 1404 } 1405 // TODO: implement count 1406 #else 1407 #define OMPT_LOOP_END // no-op 1408 #endif 1409 1410 template <typename T> 1411 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, 1412 T *p_lb, T *p_ub, 1413 typename traits_t<T>::signed_t *p_st 1414 #if OMPT_SUPPORT && OMPT_OPTIONAL 1415 , 1416 void *codeptr 1417 #endif 1418 ) { 1419 1420 typedef typename traits_t<T>::unsigned_t UT; 1421 typedef typename traits_t<T>::signed_t ST; 1422 typedef typename traits_t<T>::floating_t DBL; 1423 1424 // This is potentially slightly misleading, schedule(runtime) will appear here 1425 // even if the actual runtme schedule is static. (Which points out a 1426 // disadavantage of schedule(runtime): even when static scheduling is used it 1427 // costs more than a compile time choice to use static scheduling would.) 1428 KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling); 1429 1430 int status; 1431 dispatch_private_info_template<T> *pr; 1432 kmp_info_t *th = __kmp_threads[gtid]; 1433 kmp_team_t *team = th->th.th_team; 1434 1435 KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL 1436 #ifdef KMP_DEBUG 1437 { 1438 char *buff; 1439 // create format specifiers before the debug output 1440 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d called p_lb:%%%s " 1441 "p_ub:%%%s p_st:%%%s p_last: %%p\n", 1442 traits_t<T>::spec, traits_t<T>::spec, 1443 traits_t<ST>::spec); 1444 KD_TRACE(1000, (buff, gtid, *p_lb, *p_ub, p_st ? 
*p_st : 0, p_last)); 1445 __kmp_str_free(&buff); 1446 } 1447 #endif 1448 1449 if (team->t.t_serialized) { 1450 /* NOTE: serialize this dispatch becase we are not at the active level */ 1451 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 1452 th->th.th_dispatch->th_disp_buffer); /* top of the stack */ 1453 KMP_DEBUG_ASSERT(pr); 1454 1455 if ((status = (pr->u.p.tc != 0)) == 0) { 1456 *p_lb = 0; 1457 *p_ub = 0; 1458 // if ( p_last != NULL ) 1459 // *p_last = 0; 1460 if (p_st != NULL) 1461 *p_st = 0; 1462 if (__kmp_env_consistency_check) { 1463 if (pr->pushed_ws != ct_none) { 1464 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 1465 } 1466 } 1467 } else if (pr->nomerge) { 1468 kmp_int32 last; 1469 T start; 1470 UT limit, trip, init; 1471 ST incr; 1472 T chunk = pr->u.p.parm1; 1473 1474 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", 1475 gtid)); 1476 1477 init = chunk * pr->u.p.count++; 1478 trip = pr->u.p.tc - 1; 1479 1480 if ((status = (init <= trip)) == 0) { 1481 *p_lb = 0; 1482 *p_ub = 0; 1483 // if ( p_last != NULL ) 1484 // *p_last = 0; 1485 if (p_st != NULL) 1486 *p_st = 0; 1487 if (__kmp_env_consistency_check) { 1488 if (pr->pushed_ws != ct_none) { 1489 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 1490 } 1491 } 1492 } else { 1493 start = pr->u.p.lb; 1494 limit = chunk + init - 1; 1495 incr = pr->u.p.st; 1496 1497 if ((last = (limit >= trip)) != 0) { 1498 limit = trip; 1499 #if KMP_OS_WINDOWS 1500 pr->u.p.last_upper = pr->u.p.ub; 1501 #endif /* KMP_OS_WINDOWS */ 1502 } 1503 if (p_last != NULL) 1504 *p_last = last; 1505 if (p_st != NULL) 1506 *p_st = incr; 1507 if (incr == 1) { 1508 *p_lb = start + init; 1509 *p_ub = start + limit; 1510 } else { 1511 *p_lb = start + init * incr; 1512 *p_ub = start + limit * incr; 1513 } 1514 1515 if (pr->ordered) { 1516 pr->u.p.ordered_lower = init; 1517 pr->u.p.ordered_upper = limit; 1518 #ifdef KMP_DEBUG 1519 { 1520 char *buff; 1521 // create format specifiers before the debug output 1522 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 1523 "ordered_lower:%%%s ordered_upper:%%%s\n", 1524 traits_t<UT>::spec, traits_t<UT>::spec); 1525 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 1526 pr->u.p.ordered_upper)); 1527 __kmp_str_free(&buff); 1528 } 1529 #endif 1530 } // if 1531 } // if 1532 } else { 1533 pr->u.p.tc = 0; 1534 *p_lb = pr->u.p.lb; 1535 *p_ub = pr->u.p.ub; 1536 #if KMP_OS_WINDOWS 1537 pr->u.p.last_upper = *p_ub; 1538 #endif /* KMP_OS_WINDOWS */ 1539 if (p_last != NULL) 1540 *p_last = TRUE; 1541 if (p_st != NULL) 1542 *p_st = pr->u.p.st; 1543 } // if 1544 #ifdef KMP_DEBUG 1545 { 1546 char *buff; 1547 // create format specifiers before the debug output 1548 buff = __kmp_str_format( 1549 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " 1550 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", 1551 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 1552 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status)); 1553 __kmp_str_free(&buff); 1554 } 1555 #endif 1556 #if INCLUDE_SSC_MARKS 1557 SSC_MARK_DISPATCH_NEXT(); 1558 #endif 1559 OMPT_LOOP_END; 1560 return status; 1561 } else { 1562 kmp_int32 last = 0; 1563 dispatch_shared_info_template<UT> *sh; 1564 T start; 1565 ST incr; 1566 UT limit, trip, init; 1567 1568 KMP_DEBUG_ASSERT(th->th.th_dispatch == 1569 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); 1570 1571 pr = reinterpret_cast<dispatch_private_info_template<T> *>( 1572 th->th.th_dispatch->th_dispatch_pr_current); 1573 
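    // pr and sh (fetched just below) are the per-thread private and
    // team-shared dispatch buffers that __kmp_dispatch_init() selected for
    // this loop.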
KMP_DEBUG_ASSERT(pr); 1574 sh = reinterpret_cast<dispatch_shared_info_template<UT> *>( 1575 th->th.th_dispatch->th_dispatch_sh_current); 1576 KMP_DEBUG_ASSERT(sh); 1577 1578 if (pr->u.p.tc == 0) { 1579 // zero trip count 1580 status = 0; 1581 } else { 1582 switch (pr->schedule) { 1583 #if (KMP_STATIC_STEAL_ENABLED) 1584 case kmp_sch_static_steal: { 1585 T chunk = pr->u.p.parm1; 1586 int nproc = th->th.th_team_nproc; 1587 1588 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", 1589 gtid)); 1590 1591 trip = pr->u.p.tc - 1; 1592 1593 if (traits_t<T>::type_size > 4) { 1594 // use lock for 8-byte and CAS for 4-byte induction 1595 // variable. TODO (optional): check and use 16-byte CAS 1596 kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock; 1597 KMP_DEBUG_ASSERT(lck != NULL); 1598 if (pr->u.p.count < (UT)pr->u.p.ub) { 1599 __kmp_acquire_lock(lck, gtid); 1600 // try to get own chunk of iterations 1601 init = (pr->u.p.count)++; 1602 status = (init < (UT)pr->u.p.ub); 1603 __kmp_release_lock(lck, gtid); 1604 } else { 1605 status = 0; // no own chunks 1606 } 1607 if (!status) { // try to steal 1608 kmp_info_t **other_threads = team->t.t_threads; 1609 int while_limit = nproc; // nproc attempts to find a victim 1610 int while_index = 0; 1611 // TODO: algorithm of searching for a victim 1612 // should be cleaned up and measured 1613 while ((!status) && (while_limit != ++while_index)) { 1614 T remaining; 1615 T victimIdx = pr->u.p.parm4; 1616 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1; 1617 dispatch_private_info_template<T> *victim = 1618 reinterpret_cast<dispatch_private_info_template<T> *>( 1619 other_threads[victimIdx] 1620 ->th.th_dispatch->th_dispatch_pr_current); 1621 while ((victim == NULL || victim == pr || 1622 (*(volatile T *)&victim->u.p.static_steal_counter != 1623 *(volatile T *)&pr->u.p.static_steal_counter)) && 1624 oldVictimIdx != victimIdx) { 1625 victimIdx = (victimIdx + 1) % nproc; 1626 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1627 other_threads[victimIdx] 1628 ->th.th_dispatch->th_dispatch_pr_current); 1629 } 1630 if (!victim || 1631 (*(volatile T *)&victim->u.p.static_steal_counter != 1632 *(volatile T *)&pr->u.p.static_steal_counter)) { 1633 continue; // try once more (nproc attempts in total) 1634 // no victim is ready yet to participate in stealing 1635 // because all victims are still in kmp_init_dispatch 1636 } 1637 if (victim->u.p.count + 2 > (UT)victim->u.p.ub) { 1638 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid 1639 continue; // not enough chunks to steal, goto next victim 1640 } 1641 1642 lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock; 1643 KMP_ASSERT(lck != NULL); 1644 __kmp_acquire_lock(lck, gtid); 1645 limit = victim->u.p.ub; // keep initial ub 1646 if (victim->u.p.count >= limit || 1647 (remaining = limit - victim->u.p.count) < 2) { 1648 __kmp_release_lock(lck, gtid); 1649 pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim 1650 continue; // not enough chunks to steal 1651 } 1652 // stealing succeded, reduce victim's ub by 1/4 of undone chunks 1653 // or by 1 1654 if (remaining > 3) { 1655 KMP_COUNT_VALUE(FOR_static_steal_stolen, remaining >> 2); 1656 init = (victim->u.p.ub -= 1657 (remaining >> 2)); // steal 1/4 of remaining 1658 } else { 1659 KMP_COUNT_VALUE(FOR_static_steal_stolen, 1); 1660 init = 1661 (victim->u.p.ub -= 1); // steal 1 chunk of 2 or 3 remaining 1662 } 1663 __kmp_release_lock(lck, gtid); 1664 1665 KMP_DEBUG_ASSERT(init + 1 <= limit); 1666 pr->u.p.parm4 = 
victimIdx; // remember victim to steal from 1667 status = 1; 1668 while_index = 0; 1669 // now update own count and ub with stolen range but init chunk 1670 __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid); 1671 pr->u.p.count = init + 1; 1672 pr->u.p.ub = limit; 1673 __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid); 1674 } // while (search for victim) 1675 } // if (try to find victim and steal) 1676 } else { 1677 // 4-byte induction variable, use 8-byte CAS for pair (count, ub) 1678 typedef union { 1679 struct { 1680 UT count; 1681 T ub; 1682 } p; 1683 kmp_int64 b; 1684 } union_i4; 1685 // All operations on 'count' or 'ub' must be combined atomically 1686 // together. 1687 { 1688 union_i4 vold, vnew; 1689 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 1690 vnew = vold; 1691 vnew.p.count++; 1692 while (!KMP_COMPARE_AND_STORE_ACQ64( 1693 (volatile kmp_int64 *)&pr->u.p.count, 1694 *VOLATILE_CAST(kmp_int64 *) & vold.b, 1695 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 1696 KMP_CPU_PAUSE(); 1697 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); 1698 vnew = vold; 1699 vnew.p.count++; 1700 } 1701 vnew = vold; 1702 init = vnew.p.count; 1703 status = (init < (UT)vnew.p.ub); 1704 } 1705 1706 if (!status) { 1707 kmp_info_t **other_threads = team->t.t_threads; 1708 int while_limit = nproc; // nproc attempts to find a victim 1709 int while_index = 0; 1710 1711 // TODO: algorithm of searching for a victim 1712 // should be cleaned up and measured 1713 while ((!status) && (while_limit != ++while_index)) { 1714 union_i4 vold, vnew; 1715 kmp_int32 remaining; 1716 T victimIdx = pr->u.p.parm4; 1717 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1; 1718 dispatch_private_info_template<T> *victim = 1719 reinterpret_cast<dispatch_private_info_template<T> *>( 1720 other_threads[victimIdx] 1721 ->th.th_dispatch->th_dispatch_pr_current); 1722 while ((victim == NULL || victim == pr || 1723 (*(volatile T *)&victim->u.p.static_steal_counter != 1724 *(volatile T *)&pr->u.p.static_steal_counter)) && 1725 oldVictimIdx != victimIdx) { 1726 victimIdx = (victimIdx + 1) % nproc; 1727 victim = reinterpret_cast<dispatch_private_info_template<T> *>( 1728 other_threads[victimIdx] 1729 ->th.th_dispatch->th_dispatch_pr_current); 1730 } 1731 if (!victim || 1732 (*(volatile T *)&victim->u.p.static_steal_counter != 1733 *(volatile T *)&pr->u.p.static_steal_counter)) { 1734 continue; // try once more (nproc attempts in total) 1735 // no victim is ready yet to participate in stealing 1736 // because all victims are still in kmp_init_dispatch 1737 } 1738 pr->u.p.parm4 = victimIdx; // new victim found 1739 while (1) { // CAS loop if victim has enough chunks to steal 1740 vold.b = *(volatile kmp_int64 *)(&victim->u.p.count); 1741 vnew = vold; 1742 1743 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1744 if (vnew.p.count >= (UT)vnew.p.ub || 1745 (remaining = vnew.p.ub - vnew.p.count) < 2) { 1746 pr->u.p.parm4 = 1747 (victimIdx + 1) % nproc; // shift start victim id 1748 break; // not enough chunks to steal, goto next victim 1749 } 1750 if (remaining > 3) { 1751 vnew.p.ub -= (remaining >> 2); // try to steal 1/4 remaining 1752 } else { 1753 vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining 1754 } 1755 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1756 // TODO: Should this be acquire or release? 
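                // A single 8-byte CAS replaces the victim's (count, ub) pair,
                // so the reduced ub is published atomically against the
                // snapshot read above; if another thread raced us, the CAS
                // fails and we retry with a fresh snapshot.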
1757 if (KMP_COMPARE_AND_STORE_ACQ64( 1758 (volatile kmp_int64 *)&victim->u.p.count, 1759 *VOLATILE_CAST(kmp_int64 *) & vold.b, 1760 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { 1761 // stealing succeeded 1762 KMP_COUNT_VALUE(FOR_static_steal_stolen, 1763 vold.p.ub - vnew.p.ub); 1764 status = 1; 1765 while_index = 0; 1766 // now update own count and ub 1767 init = vnew.p.ub; 1768 vold.p.count = init + 1; 1769 #if KMP_ARCH_X86 1770 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), 1771 vold.b); 1772 #else 1773 *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b; 1774 #endif 1775 break; 1776 } // if (check CAS result) 1777 KMP_CPU_PAUSE(); // CAS failed, repeat attempt 1778 } // while (try to steal from particular victim) 1779 } // while (search for victim) 1780 } // if (try to find victim and steal) 1781 } // if (4-byte induction variable) 1782 if (!status) { 1783 *p_lb = 0; 1784 *p_ub = 0; 1785 if (p_st != NULL) 1786 *p_st = 0; 1787 } else { 1788 start = pr->u.p.parm2; 1789 init *= chunk; 1790 limit = chunk + init - 1; 1791 incr = pr->u.p.st; 1792 KMP_COUNT_VALUE(FOR_static_steal_chunks, 1); 1793 1794 KMP_DEBUG_ASSERT(init <= trip); 1795 if ((last = (limit >= trip)) != 0) 1796 limit = trip; 1797 if (p_st != NULL) 1798 *p_st = incr; 1799 1800 if (incr == 1) { 1801 *p_lb = start + init; 1802 *p_ub = start + limit; 1803 } else { 1804 *p_lb = start + init * incr; 1805 *p_ub = start + limit * incr; 1806 } 1807 1808 if (pr->ordered) { 1809 pr->u.p.ordered_lower = init; 1810 pr->u.p.ordered_upper = limit; 1811 #ifdef KMP_DEBUG 1812 { 1813 char *buff; 1814 // create format specifiers before the debug output 1815 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 1816 "ordered_lower:%%%s ordered_upper:%%%s\n", 1817 traits_t<UT>::spec, traits_t<UT>::spec); 1818 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 1819 pr->u.p.ordered_upper)); 1820 __kmp_str_free(&buff); 1821 } 1822 #endif 1823 } // if 1824 } // if 1825 break; 1826 } // case 1827 #endif // ( KMP_STATIC_STEAL_ENABLED ) 1828 case kmp_sch_static_balanced: { 1829 KD_TRACE( 1830 100, 1831 ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid)); 1832 if ((status = !pr->u.p.count) != 1833 0) { /* check if thread has any iteration to do */ 1834 pr->u.p.count = 1; 1835 *p_lb = pr->u.p.lb; 1836 *p_ub = pr->u.p.ub; 1837 last = pr->u.p.parm1; 1838 if (p_st != NULL) 1839 *p_st = pr->u.p.st; 1840 } else { /* no iterations to do */ 1841 pr->u.p.lb = pr->u.p.ub + pr->u.p.st; 1842 } 1843 if (pr->ordered) { 1844 #ifdef KMP_DEBUG 1845 { 1846 char *buff; 1847 // create format specifiers before the debug output 1848 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 1849 "ordered_lower:%%%s ordered_upper:%%%s\n", 1850 traits_t<UT>::spec, traits_t<UT>::spec); 1851 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 1852 pr->u.p.ordered_upper)); 1853 __kmp_str_free(&buff); 1854 } 1855 #endif 1856 } // if 1857 } // case 1858 break; 1859 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was 1860 merged here */ 1861 case kmp_sch_static_chunked: { 1862 T parm1; 1863 1864 KD_TRACE(100, ("__kmp_dispatch_next: T#%d " 1865 "kmp_sch_static_[affinity|chunked] case\n", 1866 gtid)); 1867 parm1 = pr->u.p.parm1; 1868 1869 trip = pr->u.p.tc - 1; 1870 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid)); 1871 1872 if ((status = (init <= trip)) != 0) { 1873 start = pr->u.p.lb; 1874 incr = pr->u.p.st; 1875 limit = parm1 + init - 1; 1876 1877 if ((last = (limit >= trip)) != 0) 1878 limit = trip; 1879 1880 if (p_st != NULL) 1881 *p_st 
= incr; 1882 1883 pr->u.p.count += th->th.th_team_nproc; 1884 1885 if (incr == 1) { 1886 *p_lb = start + init; 1887 *p_ub = start + limit; 1888 } else { 1889 *p_lb = start + init * incr; 1890 *p_ub = start + limit * incr; 1891 } 1892 1893 if (pr->ordered) { 1894 pr->u.p.ordered_lower = init; 1895 pr->u.p.ordered_upper = limit; 1896 #ifdef KMP_DEBUG 1897 { 1898 char *buff; 1899 // create format specifiers before the debug output 1900 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 1901 "ordered_lower:%%%s ordered_upper:%%%s\n", 1902 traits_t<UT>::spec, traits_t<UT>::spec); 1903 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 1904 pr->u.p.ordered_upper)); 1905 __kmp_str_free(&buff); 1906 } 1907 #endif 1908 } // if 1909 } // if 1910 } // case 1911 break; 1912 1913 case kmp_sch_dynamic_chunked: { 1914 T chunk = pr->u.p.parm1; 1915 1916 KD_TRACE( 1917 100, 1918 ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid)); 1919 1920 init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 1921 trip = pr->u.p.tc - 1; 1922 1923 if ((status = (init <= trip)) == 0) { 1924 *p_lb = 0; 1925 *p_ub = 0; 1926 if (p_st != NULL) 1927 *p_st = 0; 1928 } else { 1929 start = pr->u.p.lb; 1930 limit = chunk + init - 1; 1931 incr = pr->u.p.st; 1932 1933 if ((last = (limit >= trip)) != 0) 1934 limit = trip; 1935 1936 if (p_st != NULL) 1937 *p_st = incr; 1938 1939 if (incr == 1) { 1940 *p_lb = start + init; 1941 *p_ub = start + limit; 1942 } else { 1943 *p_lb = start + init * incr; 1944 *p_ub = start + limit * incr; 1945 } 1946 1947 if (pr->ordered) { 1948 pr->u.p.ordered_lower = init; 1949 pr->u.p.ordered_upper = limit; 1950 #ifdef KMP_DEBUG 1951 { 1952 char *buff; 1953 // create format specifiers before the debug output 1954 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 1955 "ordered_lower:%%%s ordered_upper:%%%s\n", 1956 traits_t<UT>::spec, traits_t<UT>::spec); 1957 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 1958 pr->u.p.ordered_upper)); 1959 __kmp_str_free(&buff); 1960 } 1961 #endif 1962 } // if 1963 } // if 1964 } // case 1965 break; 1966 1967 case kmp_sch_guided_iterative_chunked: { 1968 T chunkspec = pr->u.p.parm1; 1969 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked " 1970 "iterative case\n", 1971 gtid)); 1972 trip = pr->u.p.tc; 1973 // Start atomic part of calculations 1974 while (1) { 1975 ST remaining; // signed, because can be < 0 1976 init = sh->u.s.iteration; // shared value 1977 remaining = trip - init; 1978 if (remaining <= 0) { // AC: need to compare with 0 first 1979 // nothing to do, don't try atomic op 1980 status = 0; 1981 break; 1982 } 1983 if ((T)remaining < 1984 pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default 1985 // use dynamic-style schedule 1986 // atomically increment iterations, get old value 1987 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 1988 (ST)chunkspec); 1989 remaining = trip - init; 1990 if (remaining <= 0) { 1991 status = 0; // all iterations were taken by other threads 1992 } else { // got some iterations to work on 1993 status = 1; 1994 if ((T)remaining > chunkspec) { 1995 limit = init + chunkspec - 1; 1996 } else { 1997 last = 1; // the last chunk 1998 limit = init + remaining - 1; 1999 } // if 2000 } // if 2001 break; 2002 } // if 2003 limit = init + (UT)(remaining * 2004 *(double *)&pr->u.p.parm3); // divide by K*nproc 2005 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 2006 (ST)init, (ST)limit)) { 2007 // CAS was successful, chunk obtained 2008 status = 1;
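// the CAS published 'limit' as the new value of the shared iteration
// counter, so this thread owns iterations [init, limit); the decrement
// below turns 'limit' into the inclusive upper index of the chunk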
2009 --limit; 2010 break; 2011 } // if 2012 } // while 2013 if (status != 0) { 2014 start = pr->u.p.lb; 2015 incr = pr->u.p.st; 2016 if (p_st != NULL) 2017 *p_st = incr; 2018 *p_lb = start + init * incr; 2019 *p_ub = start + limit * incr; 2020 if (pr->ordered) { 2021 pr->u.p.ordered_lower = init; 2022 pr->u.p.ordered_upper = limit; 2023 #ifdef KMP_DEBUG 2024 { 2025 char *buff; 2026 // create format specifiers before the debug output 2027 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 2028 "ordered_lower:%%%s ordered_upper:%%%s\n", 2029 traits_t<UT>::spec, traits_t<UT>::spec); 2030 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 2031 pr->u.p.ordered_upper)); 2032 __kmp_str_free(&buff); 2033 } 2034 #endif 2035 } // if 2036 } else { 2037 *p_lb = 0; 2038 *p_ub = 0; 2039 if (p_st != NULL) 2040 *p_st = 0; 2041 } // if 2042 } // case 2043 break; 2044 2045 case kmp_sch_guided_simd: { 2046 // same as iterative but curr-chunk adjusted to be multiple of given 2047 // chunk 2048 T chunk = pr->u.p.parm1; 2049 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_simd case\n", 2050 gtid)); 2051 trip = pr->u.p.tc; 2052 // Start atomic part of calculations 2053 while (1) { 2054 ST remaining; // signed, because can be < 0 2055 init = sh->u.s.iteration; // shared value 2056 remaining = trip - init; 2057 if (remaining <= 0) { // AC: need to compare with 0 first 2058 status = 0; // nothing to do, don't try atomic op 2059 break; 2060 } 2061 KMP_DEBUG_ASSERT(init % chunk == 0); 2062 // compare with K*nproc*(chunk+1), K=2 by default 2063 if ((T)remaining < pr->u.p.parm2) { 2064 // use dynamic-style schedule 2065 // atomically increment iterations, get old value 2066 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 2067 (ST)chunk); 2068 remaining = trip - init; 2069 if (remaining <= 0) { 2070 status = 0; // all iterations were taken by other threads 2071 } else { 2072 // got some iterations to work on 2073 status = 1; 2074 if ((T)remaining > chunk) { 2075 limit = init + chunk - 1; 2076 } else { 2077 last = 1; // the last chunk 2078 limit = init + remaining - 1; 2079 } // if 2080 } // if 2081 break; 2082 } // if 2083 // divide by K*nproc 2084 UT span = remaining * (*(double *)&pr->u.p.parm3); 2085 UT rem = span % chunk; 2086 if (rem) // adjust so that span%chunk == 0 2087 span += chunk - rem; 2088 limit = init + span; 2089 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration), 2090 (ST)init, (ST)limit)) { 2091 // CAS was successful, chunk obtained 2092 status = 1; 2093 --limit; 2094 break; 2095 } // if 2096 } // while 2097 if (status != 0) { 2098 start = pr->u.p.lb; 2099 incr = pr->u.p.st; 2100 if (p_st != NULL) 2101 *p_st = incr; 2102 *p_lb = start + init * incr; 2103 *p_ub = start + limit * incr; 2104 if (pr->ordered) { 2105 pr->u.p.ordered_lower = init; 2106 pr->u.p.ordered_upper = limit; 2107 #ifdef KMP_DEBUG 2108 { 2109 char *buff; 2110 // create format specifiers before the debug output 2111 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 2112 "ordered_lower:%%%s ordered_upper:%%%s\n", 2113 traits_t<UT>::spec, traits_t<UT>::spec); 2114 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 2115 pr->u.p.ordered_upper)); 2116 __kmp_str_free(&buff); 2117 } 2118 #endif 2119 } // if 2120 } else { 2121 *p_lb = 0; 2122 *p_ub = 0; 2123 if (p_st != NULL) 2124 *p_st = 0; 2125 } // if 2126 } // case 2127 break; 2128 2129 case kmp_sch_guided_analytical_chunked: { 2130 T chunkspec = pr->u.p.parm1; 2131 UT chunkIdx; 2132 #if KMP_OS_WINDOWS && KMP_ARCH_X86 2133 /* for storing original
FPCW value for Windows* OS on 2134 IA-32 architecture 8-byte version */ 2135 unsigned int oldFpcw; 2136 unsigned int fpcwSet = 0; 2137 #endif 2138 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked " 2139 "analytical case\n", 2140 gtid)); 2141 2142 trip = pr->u.p.tc; 2143 2144 KMP_DEBUG_ASSERT(th->th.th_team_nproc > 1); 2145 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)th->th.th_team_nproc < 2146 trip); 2147 2148 while (1) { /* this while loop is a safeguard against unexpected zero 2149 chunk sizes */ 2150 chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); 2151 if (chunkIdx >= (UT)pr->u.p.parm2) { 2152 --trip; 2153 /* use dynamic-style scheduling */ 2154 init = chunkIdx * chunkspec + pr->u.p.count; 2155 /* need to verify init > 0 in case of overflow in the above 2156 * calculation */ 2157 if ((status = (init > 0 && init <= trip)) != 0) { 2158 limit = init + chunkspec - 1; 2159 2160 if ((last = (limit >= trip)) != 0) 2161 limit = trip; 2162 } 2163 break; 2164 } else { 2165 /* use exponential-style scheduling */ 2166 /* The following check is to workaround the lack of long double precision on 2167 Windows* OS. 2168 This check works around the possible effect that init != 0 for chunkIdx == 0. 2169 */ 2170 #if KMP_OS_WINDOWS && KMP_ARCH_X86 2171 /* If we haven't already done so, save original FPCW and set 2172 precision to 64-bit, as Windows* OS on IA-32 architecture 2173 defaults to 53-bit */ 2174 if (!fpcwSet) { 2175 oldFpcw = _control87(0, 0); 2176 _control87(_PC_64, _MCW_PC); 2177 fpcwSet = 0x30000; 2178 } 2179 #endif 2180 if (chunkIdx) { 2181 init = __kmp_dispatch_guided_remaining<T>( 2182 trip, *(DBL *)&pr->u.p.parm3, chunkIdx); 2183 KMP_DEBUG_ASSERT(init); 2184 init = trip - init; 2185 } else 2186 init = 0; 2187 limit = trip - __kmp_dispatch_guided_remaining<T>( 2188 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1); 2189 KMP_ASSERT(init <= limit); 2190 if (init < limit) { 2191 KMP_DEBUG_ASSERT(limit <= trip); 2192 --limit; 2193 status = 1; 2194 break; 2195 } // if 2196 } // if 2197 } // while (1) 2198 #if KMP_OS_WINDOWS && KMP_ARCH_X86 2199 /* restore FPCW if necessary 2200 AC: check fpcwSet flag first because oldFpcw can be uninitialized 2201 here */ 2202 if (fpcwSet && (oldFpcw & fpcwSet)) 2203 _control87(oldFpcw, _MCW_PC); 2204 #endif 2205 if (status != 0) { 2206 start = pr->u.p.lb; 2207 incr = pr->u.p.st; 2208 if (p_st != NULL) 2209 *p_st = incr; 2210 *p_lb = start + init * incr; 2211 *p_ub = start + limit * incr; 2212 if (pr->ordered) { 2213 pr->u.p.ordered_lower = init; 2214 pr->u.p.ordered_upper = limit; 2215 #ifdef KMP_DEBUG 2216 { 2217 char *buff; 2218 // create format specifiers before the debug output 2219 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 2220 "ordered_lower:%%%s ordered_upper:%%%s\n", 2221 traits_t<UT>::spec, traits_t<UT>::spec); 2222 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 2223 pr->u.p.ordered_upper)); 2224 __kmp_str_free(&buff); 2225 } 2226 #endif 2227 } 2228 } else { 2229 *p_lb = 0; 2230 *p_ub = 0; 2231 if (p_st != NULL) 2232 *p_st = 0; 2233 } 2234 } // case 2235 break; 2236 2237 case kmp_sch_trapezoidal: { 2238 UT index; 2239 T parm2 = pr->u.p.parm2; 2240 T parm3 = pr->u.p.parm3; 2241 T parm4 = pr->u.p.parm4; 2242 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n", 2243 gtid)); 2244 2245 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration); 2246 2247 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2; 2248 trip = pr->u.p.tc - 1; 2249 2250 if ((status = ((T)index < parm3 && init <= 
trip)) == 0) { 2251 *p_lb = 0; 2252 *p_ub = 0; 2253 if (p_st != NULL) 2254 *p_st = 0; 2255 } else { 2256 start = pr->u.p.lb; 2257 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1; 2258 incr = pr->u.p.st; 2259 2260 if ((last = (limit >= trip)) != 0) 2261 limit = trip; 2262 2263 if (p_st != NULL) 2264 *p_st = incr; 2265 2266 if (incr == 1) { 2267 *p_lb = start + init; 2268 *p_ub = start + limit; 2269 } else { 2270 *p_lb = start + init * incr; 2271 *p_ub = start + limit * incr; 2272 } 2273 2274 if (pr->ordered) { 2275 pr->u.p.ordered_lower = init; 2276 pr->u.p.ordered_upper = limit; 2277 #ifdef KMP_DEBUG 2278 { 2279 char *buff; 2280 // create format specifiers before the debug output 2281 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " 2282 "ordered_lower:%%%s ordered_upper:%%%s\n", 2283 traits_t<UT>::spec, traits_t<UT>::spec); 2284 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, 2285 pr->u.p.ordered_upper)); 2286 __kmp_str_free(&buff); 2287 } 2288 #endif 2289 } // if 2290 } // if 2291 } // case 2292 break; 2293 default: { 2294 status = 0; // to avoid complaints on uninitialized variable use 2295 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message 2296 KMP_HNT(GetNewerLibrary), // Hint 2297 __kmp_msg_null // Variadic argument list terminator 2298 ); 2299 } break; 2300 } // switch 2301 } // if tc == 0; 2302 2303 if (status == 0) { 2304 UT num_done; 2305 2306 num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done); 2307 #ifdef KMP_DEBUG 2308 { 2309 char *buff; 2310 // create format specifiers before the debug output 2311 buff = __kmp_str_format( 2312 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", 2313 traits_t<UT>::spec); 2314 KD_TRACE(100, (buff, gtid, sh->u.s.num_done)); 2315 __kmp_str_free(&buff); 2316 } 2317 #endif 2318 2319 if ((ST)num_done == th->th.th_team_nproc - 1) { 2320 #if (KMP_STATIC_STEAL_ENABLED) 2321 if (pr->schedule == kmp_sch_static_steal && 2322 traits_t<T>::type_size > 4) { 2323 int i; 2324 kmp_info_t **other_threads = team->t.t_threads; 2325 // loop complete, safe to destroy locks used for stealing 2326 for (i = 0; i < th->th.th_team_nproc; ++i) { 2327 kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock; 2328 KMP_ASSERT(lck != NULL); 2329 __kmp_destroy_lock(lck); 2330 __kmp_free(lck); 2331 other_threads[i]->th.th_dispatch->th_steal_lock = NULL; 2332 } 2333 } 2334 #endif 2335 /* NOTE: release this buffer to be reused */ 2336 2337 KMP_MB(); /* Flush all pending memory write invalidates. */ 2338 2339 sh->u.s.num_done = 0; 2340 sh->u.s.iteration = 0; 2341 2342 /* TODO replace with general release procedure? */ 2343 if (pr->ordered) { 2344 sh->u.s.ordered_iteration = 0; 2345 } 2346 2347 KMP_MB(); /* Flush all pending memory write invalidates. */ 2348 2349 sh->buffer_index += __kmp_dispatch_num_buffers; 2350 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", 2351 gtid, sh->buffer_index)); 2352 2353 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 2354 2355 } // if 2356 if (__kmp_env_consistency_check) { 2357 if (pr->pushed_ws != ct_none) { 2358 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); 2359 } 2360 } 2361 2362 th->th.th_dispatch->th_deo_fcn = NULL; 2363 th->th.th_dispatch->th_dxo_fcn = NULL; 2364 th->th.th_dispatch->th_dispatch_sh_current = NULL; 2365 th->th.th_dispatch->th_dispatch_pr_current = NULL; 2366 } // if (status == 0) 2367 #if KMP_OS_WINDOWS 2368 else if (last) { 2369 pr->u.p.last_upper = pr->u.p.ub; 2370 } 2371 #endif /* KMP_OS_WINDOWS */ 2372 if (p_last != NULL && status != 0) 2373 *p_last = last; 2374 } // if 2375 2376 #ifdef KMP_DEBUG 2377 { 2378 char *buff; 2379 // create format specifiers before the debug output 2380 buff = __kmp_str_format( 2381 "__kmp_dispatch_next: T#%%d normal case: " 2382 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n", 2383 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); 2384 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status)); 2385 __kmp_str_free(&buff); 2386 } 2387 #endif 2388 #if INCLUDE_SSC_MARKS 2389 SSC_MARK_DISPATCH_NEXT(); 2390 #endif 2391 OMPT_LOOP_END; 2392 return status; 2393 } 2394 2395 template <typename T> 2396 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid, 2397 kmp_int32 *plastiter, T *plower, T *pupper, 2398 typename traits_t<T>::signed_t incr) { 2399 typedef typename traits_t<T>::unsigned_t UT; 2400 typedef typename traits_t<T>::signed_t ST; 2401 kmp_uint32 team_id; 2402 kmp_uint32 nteams; 2403 UT trip_count; 2404 kmp_team_t *team; 2405 kmp_info_t *th; 2406 2407 KMP_DEBUG_ASSERT(plastiter && plower && pupper); 2408 KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid)); 2409 #ifdef KMP_DEBUG 2410 { 2411 char *buff; 2412 // create format specifiers before the debug output 2413 buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d " 2414 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n", 2415 traits_t<T>::spec, traits_t<T>::spec, 2416 traits_t<ST>::spec, traits_t<T>::spec); 2417 KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr)); 2418 __kmp_str_free(&buff); 2419 } 2420 #endif 2421 2422 if (__kmp_env_consistency_check) { 2423 if (incr == 0) { 2424 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, 2425 loc); 2426 } 2427 if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) { 2428 // The loop is illegal. 
2429 // Some zero-trip loops maintained by compiler, e.g.: 2430 // for(i=10;i<0;++i) // lower >= upper - run-time check 2431 // for(i=0;i>10;--i) // lower <= upper - run-time check 2432 // for(i=0;i>10;++i) // incr > 0 - compile-time check 2433 // for(i=10;i<0;--i) // incr < 0 - compile-time check 2434 // Compiler does not check the following illegal loops: 2435 // for(i=0;i<10;i+=incr) // where incr<0 2436 // for(i=10;i>0;i-=incr) // where incr<0 2437 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc); 2438 } 2439 } 2440 th = __kmp_threads[gtid]; 2441 team = th->th.th_team; 2442 #if OMP_40_ENABLED 2443 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct 2444 nteams = th->th.th_teams_size.nteams; 2445 #endif 2446 team_id = team->t.t_master_tid; 2447 KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc); 2448 2449 // compute global trip count 2450 if (incr == 1) { 2451 trip_count = *pupper - *plower + 1; 2452 } else if (incr == -1) { 2453 trip_count = *plower - *pupper + 1; 2454 } else if (incr > 0) { 2455 // upper-lower can exceed the limit of signed type 2456 trip_count = (UT)(*pupper - *plower) / incr + 1; 2457 } else { 2458 trip_count = (UT)(*plower - *pupper) / (-incr) + 1; 2459 } 2460 2461 if (trip_count <= nteams) { 2462 KMP_DEBUG_ASSERT( 2463 __kmp_static == kmp_sch_static_greedy || 2464 __kmp_static == 2465 kmp_sch_static_balanced); // Unknown static scheduling type. 2466 // only some teams get single iteration, others get nothing 2467 if (team_id < trip_count) { 2468 *pupper = *plower = *plower + team_id * incr; 2469 } else { 2470 *plower = *pupper + incr; // zero-trip loop 2471 } 2472 if (plastiter != NULL) 2473 *plastiter = (team_id == trip_count - 1); 2474 } else { 2475 if (__kmp_static == kmp_sch_static_balanced) { 2476 UT chunk = trip_count / nteams; 2477 UT extras = trip_count % nteams; 2478 *plower += 2479 incr * (team_id * chunk + (team_id < extras ? team_id : extras)); 2480 *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr); 2481 if (plastiter != NULL) 2482 *plastiter = (team_id == nteams - 1); 2483 } else { 2484 T chunk_inc_count = 2485 (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr; 2486 T upper = *pupper; 2487 KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy); 2488 // Unknown static scheduling type. 2489 *plower += team_id * chunk_inc_count; 2490 *pupper = *plower + chunk_inc_count - incr; 2491 // Check/correct bounds if needed 2492 if (incr > 0) { 2493 if (*pupper < *plower) 2494 *pupper = traits_t<T>::max_value; 2495 if (plastiter != NULL) 2496 *plastiter = *plower <= upper && *pupper > upper - incr; 2497 if (*pupper > upper) 2498 *pupper = upper; // tracker C73258 2499 } else { 2500 if (*pupper > *plower) 2501 *pupper = traits_t<T>::min_value; 2502 if (plastiter != NULL) 2503 *plastiter = *plower >= upper && *pupper < upper - incr; 2504 if (*pupper < upper) 2505 *pupper = upper; // tracker C73258 2506 } 2507 } 2508 } 2509 } 2510 2511 //----------------------------------------------------------------------------- 2512 // Dispatch routines 2513 // Transfer call to template< type T > 2514 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule, 2515 // T lb, T ub, ST st, ST chunk ) 2516 extern "C" { 2517 2518 /*! 
2519 @ingroup WORK_SHARING 2520 @{ 2521 @param loc Source location 2522 @param gtid Global thread id 2523 @param schedule Schedule type 2524 @param lb Lower bound 2525 @param ub Upper bound 2526 @param st Step (or increment if you prefer) 2527 @param chunk The chunk size to block with 2528 2529 This function prepares the runtime to start a dynamically scheduled for loop, 2530 saving the loop arguments. 2531 These functions are all identical apart from the types of the arguments. 2532 */ 2533 2534 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2535 enum sched_type schedule, kmp_int32 lb, 2536 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) { 2537 KMP_DEBUG_ASSERT(__kmp_init_serial); 2538 #if OMPT_SUPPORT && OMPT_OPTIONAL 2539 OMPT_STORE_RETURN_ADDRESS(gtid); 2540 #endif 2541 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2542 } 2543 /*! 2544 See @ref __kmpc_dispatch_init_4 2545 */ 2546 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2547 enum sched_type schedule, kmp_uint32 lb, 2548 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) { 2549 KMP_DEBUG_ASSERT(__kmp_init_serial); 2550 #if OMPT_SUPPORT && OMPT_OPTIONAL 2551 OMPT_STORE_RETURN_ADDRESS(gtid); 2552 #endif 2553 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2554 } 2555 2556 /*! 2557 See @ref __kmpc_dispatch_init_4 2558 */ 2559 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2560 enum sched_type schedule, kmp_int64 lb, 2561 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) { 2562 KMP_DEBUG_ASSERT(__kmp_init_serial); 2563 #if OMPT_SUPPORT && OMPT_OPTIONAL 2564 OMPT_STORE_RETURN_ADDRESS(gtid); 2565 #endif 2566 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2567 } 2568 2569 /*! 2570 See @ref __kmpc_dispatch_init_4 2571 */ 2572 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2573 enum sched_type schedule, kmp_uint64 lb, 2574 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) { 2575 KMP_DEBUG_ASSERT(__kmp_init_serial); 2576 #if OMPT_SUPPORT && OMPT_OPTIONAL 2577 OMPT_STORE_RETURN_ADDRESS(gtid); 2578 #endif 2579 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2580 } 2581 2582 /*! 2583 See @ref __kmpc_dispatch_init_4 2584 2585 The difference from the __kmpc_dispatch_init set of functions is that these 2586 functions are called for the composite distribute parallel for construct. 2587 Thus, before dispatching the regular iterations, the per-team iteration space has to be computed. 2588 2589 These functions are all identical apart from the types of the arguments.
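As an illustration only (not a prescribed lowering), a compiler could turn the composite construct into a single call to __kmpc_dist_dispatch_init_4() with the full loop bounds, after which each thread of the team obtains its chunks through the usual __kmpc_dispatch_next_4() loop.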
2590 */ 2591 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2592 enum sched_type schedule, kmp_int32 *p_last, 2593 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, 2594 kmp_int32 chunk) { 2595 KMP_DEBUG_ASSERT(__kmp_init_serial); 2596 #if OMPT_SUPPORT && OMPT_OPTIONAL 2597 OMPT_STORE_RETURN_ADDRESS(gtid); 2598 #endif 2599 __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st); 2600 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2601 } 2602 2603 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2604 enum sched_type schedule, kmp_int32 *p_last, 2605 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, 2606 kmp_int32 chunk) { 2607 KMP_DEBUG_ASSERT(__kmp_init_serial); 2608 #if OMPT_SUPPORT && OMPT_OPTIONAL 2609 OMPT_STORE_RETURN_ADDRESS(gtid); 2610 #endif 2611 __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st); 2612 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); 2613 } 2614 2615 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2616 enum sched_type schedule, kmp_int32 *p_last, 2617 kmp_int64 lb, kmp_int64 ub, kmp_int64 st, 2618 kmp_int64 chunk) { 2619 KMP_DEBUG_ASSERT(__kmp_init_serial); 2620 #if OMPT_SUPPORT && OMPT_OPTIONAL 2621 OMPT_STORE_RETURN_ADDRESS(gtid); 2622 #endif 2623 __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st); 2624 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2625 } 2626 2627 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2628 enum sched_type schedule, kmp_int32 *p_last, 2629 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, 2630 kmp_int64 chunk) { 2631 KMP_DEBUG_ASSERT(__kmp_init_serial); 2632 #if OMPT_SUPPORT && OMPT_OPTIONAL 2633 OMPT_STORE_RETURN_ADDRESS(gtid); 2634 #endif 2635 __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st); 2636 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); 2637 } 2638 2639 /*! 2640 @param loc Source code location 2641 @param gtid Global thread id 2642 @param p_last Pointer to a flag set to one if this is the last chunk or zero 2643 otherwise 2644 @param p_lb Pointer to the lower bound for the next chunk of work 2645 @param p_ub Pointer to the upper bound for the next chunk of work 2646 @param p_st Pointer to the stride for the next chunk of work 2647 @return one if there is work to be done, zero otherwise 2648 2649 Get the next dynamically allocated chunk of work for this thread. 2650 If there is no more work, then the lb,ub and stride need not be modified. 2651 */ 2652 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2653 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) { 2654 #if OMPT_SUPPORT && OMPT_OPTIONAL 2655 OMPT_STORE_RETURN_ADDRESS(gtid); 2656 #endif 2657 return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st 2658 #if OMPT_SUPPORT && OMPT_OPTIONAL 2659 , 2660 OMPT_LOAD_RETURN_ADDRESS(gtid) 2661 #endif 2662 ); 2663 } 2664 2665 /*! 2666 See @ref __kmpc_dispatch_next_4 2667 */ 2668 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2669 kmp_uint32 *p_lb, kmp_uint32 *p_ub, 2670 kmp_int32 *p_st) { 2671 #if OMPT_SUPPORT && OMPT_OPTIONAL 2672 OMPT_STORE_RETURN_ADDRESS(gtid); 2673 #endif 2674 return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st 2675 #if OMPT_SUPPORT && OMPT_OPTIONAL 2676 , 2677 OMPT_LOAD_RETURN_ADDRESS(gtid) 2678 #endif 2679 ); 2680 } 2681 2682 /*! 
2683 See @ref __kmpc_dispatch_next_4 2684 */ 2685 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2686 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) { 2687 #if OMPT_SUPPORT && OMPT_OPTIONAL 2688 OMPT_STORE_RETURN_ADDRESS(gtid); 2689 #endif 2690 return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st 2691 #if OMPT_SUPPORT && OMPT_OPTIONAL 2692 , 2693 OMPT_LOAD_RETURN_ADDRESS(gtid) 2694 #endif 2695 ); 2696 } 2697 2698 /*! 2699 See @ref __kmpc_dispatch_next_4 2700 */ 2701 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2702 kmp_uint64 *p_lb, kmp_uint64 *p_ub, 2703 kmp_int64 *p_st) { 2704 #if OMPT_SUPPORT && OMPT_OPTIONAL 2705 OMPT_STORE_RETURN_ADDRESS(gtid); 2706 #endif 2707 return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st 2708 #if OMPT_SUPPORT && OMPT_OPTIONAL 2709 , 2710 OMPT_LOAD_RETURN_ADDRESS(gtid) 2711 #endif 2712 ); 2713 } 2714 2715 /*! 2716 @param loc Source code location 2717 @param gtid Global thread id 2718 2719 Mark the end of a dynamic loop. 2720 */ 2721 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) { 2722 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2723 } 2724 2725 /*! 2726 See @ref __kmpc_dispatch_fini_4 2727 */ 2728 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) { 2729 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2730 } 2731 2732 /*! 2733 See @ref __kmpc_dispatch_fini_4 2734 */ 2735 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) { 2736 __kmp_dispatch_finish<kmp_uint32>(gtid, loc); 2737 } 2738 2739 /*! 2740 See @ref __kmpc_dispatch_fini_4 2741 */ 2742 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) { 2743 __kmp_dispatch_finish<kmp_uint64>(gtid, loc); 2744 } 2745 /*! @} */ 2746 2747 //----------------------------------------------------------------------------- 2748 // Non-template routines from kmp_dispatch.cpp used in other sources 2749 2750 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) { 2751 return value == checker; 2752 } 2753 2754 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) { 2755 return value != checker; 2756 } 2757 2758 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) { 2759 return value < checker; 2760 } 2761 2762 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) { 2763 return value >= checker; 2764 } 2765 2766 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) { 2767 return value <= checker; 2768 } 2769 2770 kmp_uint32 2771 __kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker, 2772 kmp_uint32 (*pred)(kmp_uint32, kmp_uint32), 2773 void *obj // Higher-level synchronization object, or NULL. 2774 ) { 2775 // note: we may not belong to a team at this point 2776 volatile kmp_uint32 *spin = spinner; 2777 kmp_uint32 check = checker; 2778 kmp_uint32 spins; 2779 kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred; 2780 kmp_uint32 r; 2781 2782 KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin)); 2783 KMP_INIT_YIELD(spins); 2784 // main wait spin loop 2785 while (!f(r = TCR_4(*spin), check)) { 2786 KMP_FSYNC_SPIN_PREPARE(obj); 2787 /* GEH - remove this since it was accidentally introduced when kmp_wait was 2788 split. 
It causes problems with infinite recursion because of exit lock */ 2789 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 2790 __kmp_abort_thread(); */ 2791 2792 /* if we have waited a bit, or are oversubscribed, yield */ 2793 /* pause is in the following code */ 2794 KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc); 2795 KMP_YIELD_SPIN(spins); 2796 } 2797 KMP_FSYNC_SPIN_ACQUIRED(obj); 2798 return r; 2799 } 2800 2801 void __kmp_wait_yield_4_ptr( 2802 void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32), 2803 void *obj // Higher-level synchronization object, or NULL. 2804 ) { 2805 // note: we may not belong to a team at this point 2806 void *spin = spinner; 2807 kmp_uint32 check = checker; 2808 kmp_uint32 spins; 2809 kmp_uint32 (*f)(void *, kmp_uint32) = pred; 2810 2811 KMP_FSYNC_SPIN_INIT(obj, spin); 2812 KMP_INIT_YIELD(spins); 2813 // main wait spin loop 2814 while (!f(spin, check)) { 2815 KMP_FSYNC_SPIN_PREPARE(obj); 2816 /* if we have waited a bit, or are oversubscribed, yield */ 2817 /* pause is in the following code */ 2818 KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc); 2819 KMP_YIELD_SPIN(spins); 2820 } 2821 KMP_FSYNC_SPIN_ACQUIRED(obj); 2822 } 2823 2824 } // extern "C" 2825 2826 #ifdef KMP_GOMP_COMPAT 2827 2828 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid, 2829 enum sched_type schedule, kmp_int32 lb, 2830 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk, 2831 int push_ws) { 2832 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, 2833 push_ws); 2834 } 2835 2836 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, 2837 enum sched_type schedule, kmp_uint32 lb, 2838 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk, 2839 int push_ws) { 2840 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, 2841 push_ws); 2842 } 2843 2844 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid, 2845 enum sched_type schedule, kmp_int64 lb, 2846 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk, 2847 int push_ws) { 2848 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, 2849 push_ws); 2850 } 2851 2852 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, 2853 enum sched_type schedule, kmp_uint64 lb, 2854 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk, 2855 int push_ws) { 2856 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, 2857 push_ws); 2858 } 2859 2860 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) { 2861 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc); 2862 } 2863 2864 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) { 2865 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc); 2866 } 2867 2868 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) { 2869 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc); 2870 } 2871 2872 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) { 2873 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc); 2874 } 2875 2876 #endif /* KMP_GOMP_COMPAT */ 2877 2878 /* ------------------------------------------------------------------------ */ 2879
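/* Illustrative sketch (editorial addition, not part of the library build):
   how compiler-generated code is expected to drive the dispatch entry points
   defined above for a dynamically scheduled loop. The microtask shape and the
   'loc' descriptor are assumptions made for this example; only the __kmpc_*
   calls and kmp_sch_dynamic_chunked are real names from the runtime.

   static void microtask(ident_t *loc, kmp_int32 gtid) {
     kmp_int32 lb, ub, st, last;
     // register the loop [0, 999], stride 1, chunk size 4 with the runtime
     __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, 0, 999, 1, 4);
     // grab chunks until __kmpc_dispatch_next_4 reports that no work is left;
     // the bounds returned in lb/ub are inclusive
     while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
       for (kmp_int32 i = lb; i <= ub; i += st) {
         // ... loop body for iteration i ...
       }
     }
   }
*/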