1 /* 2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch. 3 */ 4 5 6 //===----------------------------------------------------------------------===// 7 // 8 // The LLVM Compiler Infrastructure 9 // 10 // This file is dual licensed under the MIT and the University of Illinois Open 11 // Source Licenses. See LICENSE.txt for details. 12 // 13 //===----------------------------------------------------------------------===// 14 15 16 /* 17 * Dynamic scheduling initialization and dispatch. 18 * 19 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however 20 * it may change values between parallel regions. __kmp_max_nth 21 * is the largest value __kmp_nth may take, 1 is the smallest. 22 * 23 */ 24 25 /* ------------------------------------------------------------------------ */ 26 /* ------------------------------------------------------------------------ */ 27 28 // Need to raise Win version from XP to Vista here for support of InterlockedExchange64 29 #if defined(_WIN32_WINNT) && defined(_M_IX86) 30 #undef _WIN32_WINNT 31 #define _WIN32_WINNT 0x0502 32 #endif 33 34 #include "kmp.h" 35 #include "kmp_i18n.h" 36 #include "kmp_itt.h" 37 #include "kmp_str.h" 38 #include "kmp_error.h" 39 #include "kmp_stats.h" 40 #if KMP_OS_WINDOWS && KMP_ARCH_X86 41 #include <float.h> 42 #endif 43 44 #if OMPT_SUPPORT 45 #include "ompt-internal.h" 46 #include "ompt-specific.h" 47 #endif 48 49 /* ------------------------------------------------------------------------ */ 50 /* ------------------------------------------------------------------------ */ 51 52 // template for type limits 53 template< typename T > 54 struct i_maxmin { 55 static const T mx; 56 static const T mn; 57 }; 58 template<> 59 struct i_maxmin< int > { 60 static const int mx = 0x7fffffff; 61 static const int mn = 0x80000000; 62 }; 63 template<> 64 struct i_maxmin< unsigned int > { 65 static const unsigned int mx = 0xffffffff; 66 static const unsigned int mn = 0x00000000; 67 }; 68 template<> 69 struct i_maxmin< long long > { 70 static const long long mx = 0x7fffffffffffffffLL; 71 static const long long mn = 0x8000000000000000LL; 72 }; 73 template<> 74 struct i_maxmin< unsigned long long > { 75 static const unsigned long long mx = 0xffffffffffffffffLL; 76 static const unsigned long long mn = 0x0000000000000000LL; 77 }; 78 //------------------------------------------------------------------------- 79 80 #if KMP_STATIC_STEAL_ENABLED 81 82 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types 83 template< typename T > 84 struct dispatch_private_infoXX_template { 85 typedef typename traits_t< T >::unsigned_t UT; 86 typedef typename traits_t< T >::signed_t ST; 87 UT count; // unsigned 88 T ub; 89 /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */ 90 T lb; 91 ST st; // signed 92 UT tc; // unsigned 93 T static_steal_counter; // for static_steal only; maybe better to put after ub 94 95 /* parm[1-4] are used in different ways by different scheduling algorithms */ 96 97 // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on ) 98 // a) parm3 is properly aligned and 99 // b) all parm1-4 are in the same cache line. 100 // Because of parm1-4 are used together, performance seems to be better 101 // if they are in the same line (not measured though). 102 103 struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4 104 T parm1; 105 T parm2; 106 T parm3; 107 T parm4; 108 }; 109 110 UT ordered_lower; // unsigned 111 UT ordered_upper; // unsigned 112 #if KMP_OS_WINDOWS 113 T last_upper; 114 #endif /* KMP_OS_WINDOWS */ 115 }; 116 117 #else /* KMP_STATIC_STEAL_ENABLED */ 118 119 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types 120 template< typename T > 121 struct dispatch_private_infoXX_template { 122 typedef typename traits_t< T >::unsigned_t UT; 123 typedef typename traits_t< T >::signed_t ST; 124 T lb; 125 T ub; 126 ST st; // signed 127 UT tc; // unsigned 128 129 T parm1; 130 T parm2; 131 T parm3; 132 T parm4; 133 134 UT count; // unsigned 135 136 UT ordered_lower; // unsigned 137 UT ordered_upper; // unsigned 138 #if KMP_OS_WINDOWS 139 T last_upper; 140 #endif /* KMP_OS_WINDOWS */ 141 }; 142 143 #endif /* KMP_STATIC_STEAL_ENABLED */ 144 145 // replaces dispatch_private_info structure and dispatch_private_info_t type 146 template< typename T > 147 struct KMP_ALIGN_CACHE dispatch_private_info_template { 148 // duplicate alignment here, otherwise size of structure is not correct in our compiler 149 union KMP_ALIGN_CACHE private_info_tmpl { 150 dispatch_private_infoXX_template< T > p; 151 dispatch_private_info64_t p64; 152 } u; 153 enum sched_type schedule; /* scheduling algorithm */ 154 kmp_uint32 ordered; /* ordered clause specified */ 155 kmp_uint32 ordered_bumped; 156 kmp_int32 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making order 157 dispatch_private_info * next; /* stack of buffers for nest of serial regions */ 158 kmp_uint32 nomerge; /* don't merge iters if serialized */ 159 kmp_uint32 type_size; 160 enum cons_type pushed_ws; 161 }; 162 163 164 // replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types 165 template< typename UT > 166 struct dispatch_shared_infoXX_template { 167 /* chunk index under dynamic, number of idle threads under static-steal; 168 iteration index otherwise */ 169 volatile UT iteration; 170 volatile UT num_done; 171 volatile UT ordered_iteration; 172 UT ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size making ordered_iteration scalar 173 }; 174 175 // replaces dispatch_shared_info structure and dispatch_shared_info_t type 176 template< typename UT > 177 struct dispatch_shared_info_template { 178 // we need union here to keep the structure size 179 union shared_info_tmpl { 180 dispatch_shared_infoXX_template< UT > s; 181 dispatch_shared_info64_t s64; 182 } u; 183 volatile kmp_uint32 buffer_index; 184 #if OMP_45_ENABLED 185 volatile kmp_int32 doacross_buf_idx; // teamwise index 186 kmp_uint32 *doacross_flags; // array of iteration flags (0/1) 187 kmp_int32 doacross_num_done; // count finished threads 188 #endif 189 #if KMP_USE_HWLOC 190 // When linking with libhwloc, the ORDERED EPCC test slowsdown on big 191 // machines (> 48 cores). Performance analysis showed that a cache thrash 192 // was occurring and this padding helps alleviate the problem. 193 char padding[64]; 194 #endif 195 }; 196 197 /* ------------------------------------------------------------------------ */ 198 /* ------------------------------------------------------------------------ */ 199 200 #undef USE_TEST_LOCKS 201 202 // test_then_add template (general template should NOT be used) 203 template< typename T > 204 static __forceinline T 205 test_then_add( volatile T *p, T d ); 206 207 template<> 208 __forceinline kmp_int32 209 test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d ) 210 { 211 kmp_int32 r; 212 r = KMP_TEST_THEN_ADD32( p, d ); 213 return r; 214 } 215 216 template<> 217 __forceinline kmp_int64 218 test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d ) 219 { 220 kmp_int64 r; 221 r = KMP_TEST_THEN_ADD64( p, d ); 222 return r; 223 } 224 225 // test_then_inc_acq template (general template should NOT be used) 226 template< typename T > 227 static __forceinline T 228 test_then_inc_acq( volatile T *p ); 229 230 template<> 231 __forceinline kmp_int32 232 test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p ) 233 { 234 kmp_int32 r; 235 r = KMP_TEST_THEN_INC_ACQ32( p ); 236 return r; 237 } 238 239 template<> 240 __forceinline kmp_int64 241 test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p ) 242 { 243 kmp_int64 r; 244 r = KMP_TEST_THEN_INC_ACQ64( p ); 245 return r; 246 } 247 248 // test_then_inc template (general template should NOT be used) 249 template< typename T > 250 static __forceinline T 251 test_then_inc( volatile T *p ); 252 253 template<> 254 __forceinline kmp_int32 255 test_then_inc< kmp_int32 >( volatile kmp_int32 *p ) 256 { 257 kmp_int32 r; 258 r = KMP_TEST_THEN_INC32( p ); 259 return r; 260 } 261 262 template<> 263 __forceinline kmp_int64 264 test_then_inc< kmp_int64 >( volatile kmp_int64 *p ) 265 { 266 kmp_int64 r; 267 r = KMP_TEST_THEN_INC64( p ); 268 return r; 269 } 270 271 // compare_and_swap template (general template should NOT be used) 272 template< typename T > 273 static __forceinline kmp_int32 274 compare_and_swap( volatile T *p, T c, T s ); 275 276 template<> 277 __forceinline kmp_int32 278 compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s ) 279 { 280 return KMP_COMPARE_AND_STORE_REL32( p, c, s ); 281 } 282 283 template<> 284 __forceinline kmp_int32 285 compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s ) 286 { 287 return KMP_COMPARE_AND_STORE_REL64( p, c, s ); 288 } 289 290 /* 291 Spin wait loop that first does pause, then yield. 292 Waits until function returns non-zero when called with *spinner and check. 293 Does NOT put threads to sleep. 294 #if USE_ITT_BUILD 295 Arguments: 296 obj -- is higher-level synchronization object to report to ittnotify. It is used to report 297 locks consistently. For example, if lock is acquired immediately, its address is 298 reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, it lock cannot be acquired 299 immediately and lock routine calls to KMP_WAIT_YIELD(), the later should report the same 300 address, not an address of low-level spinner. 301 #endif // USE_ITT_BUILD 302 */ 303 template< typename UT > 304 // ToDo: make inline function (move to header file for icl) 305 static UT // unsigned 4- or 8-byte type 306 __kmp_wait_yield( volatile UT * spinner, 307 UT checker, 308 kmp_uint32 (* pred)( UT, UT ) 309 USE_ITT_BUILD_ARG(void * obj) // Higher-level synchronization object, or NULL. 310 ) 311 { 312 // note: we may not belong to a team at this point 313 register volatile UT * spin = spinner; 314 register UT check = checker; 315 register kmp_uint32 spins; 316 register kmp_uint32 (*f) ( UT, UT ) = pred; 317 register UT r; 318 319 KMP_FSYNC_SPIN_INIT( obj, (void*) spin ); 320 KMP_INIT_YIELD( spins ); 321 // main wait spin loop 322 while(!f(r = *spin, check)) 323 { 324 KMP_FSYNC_SPIN_PREPARE( obj ); 325 /* GEH - remove this since it was accidentally introduced when kmp_wait was split. 326 It causes problems with infinite recursion because of exit lock */ 327 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 328 __kmp_abort_thread(); */ 329 330 // if we are oversubscribed, 331 // or have waited a bit (and KMP_LIBRARY=throughput, then yield 332 // pause is in the following code 333 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc ); 334 KMP_YIELD_SPIN( spins ); 335 } 336 KMP_FSYNC_SPIN_ACQUIRED( obj ); 337 return r; 338 } 339 340 template< typename UT > 341 static kmp_uint32 __kmp_eq( UT value, UT checker) { 342 return value == checker; 343 } 344 345 template< typename UT > 346 static kmp_uint32 __kmp_neq( UT value, UT checker) { 347 return value != checker; 348 } 349 350 template< typename UT > 351 static kmp_uint32 __kmp_lt( UT value, UT checker) { 352 return value < checker; 353 } 354 355 template< typename UT > 356 static kmp_uint32 __kmp_ge( UT value, UT checker) { 357 return value >= checker; 358 } 359 360 template< typename UT > 361 static kmp_uint32 __kmp_le( UT value, UT checker) { 362 return value <= checker; 363 } 364 365 366 /* ------------------------------------------------------------------------ */ 367 /* ------------------------------------------------------------------------ */ 368 369 static void 370 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 371 { 372 kmp_info_t *th; 373 374 KMP_DEBUG_ASSERT( gtid_ref ); 375 376 if ( __kmp_env_consistency_check ) { 377 th = __kmp_threads[*gtid_ref]; 378 if ( th -> th.th_root -> r.r_active 379 && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) { 380 #if KMP_USE_DYNAMIC_LOCK 381 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 ); 382 #else 383 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL ); 384 #endif 385 } 386 } 387 } 388 389 template< typename UT > 390 static void 391 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 392 { 393 typedef typename traits_t< UT >::signed_t ST; 394 dispatch_private_info_template< UT > * pr; 395 396 int gtid = *gtid_ref; 397 // int cid = *cid_ref; 398 kmp_info_t *th = __kmp_threads[ gtid ]; 399 KMP_DEBUG_ASSERT( th -> th.th_dispatch ); 400 401 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) ); 402 if ( __kmp_env_consistency_check ) { 403 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 404 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 405 if ( pr -> pushed_ws != ct_none ) { 406 #if KMP_USE_DYNAMIC_LOCK 407 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 ); 408 #else 409 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL ); 410 #endif 411 } 412 } 413 414 if ( ! th -> th.th_team -> t.t_serialized ) { 415 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 416 ( th -> th.th_dispatch -> th_dispatch_sh_current ); 417 UT lower; 418 419 if ( ! __kmp_env_consistency_check ) { 420 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 421 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 422 } 423 lower = pr->u.p.ordered_lower; 424 425 #if ! defined( KMP_GOMP_COMPAT ) 426 if ( __kmp_env_consistency_check ) { 427 if ( pr->ordered_bumped ) { 428 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons; 429 __kmp_error_construct2( 430 kmp_i18n_msg_CnsMultipleNesting, 431 ct_ordered_in_pdo, loc_ref, 432 & p->stack_data[ p->w_top ] 433 ); 434 } 435 } 436 #endif /* !defined(KMP_GOMP_COMPAT) */ 437 438 KMP_MB(); 439 #ifdef KMP_DEBUG 440 { 441 const char * buff; 442 // create format specifiers before the debug output 443 buff = __kmp_str_format( 444 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n", 445 traits_t< UT >::spec, traits_t< UT >::spec ); 446 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 447 __kmp_str_free( &buff ); 448 } 449 #endif 450 451 __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 452 USE_ITT_BUILD_ARG( NULL ) 453 ); 454 KMP_MB(); /* is this necessary? */ 455 #ifdef KMP_DEBUG 456 { 457 const char * buff; 458 // create format specifiers before the debug output 459 buff = __kmp_str_format( 460 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n", 461 traits_t< UT >::spec, traits_t< UT >::spec ); 462 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 463 __kmp_str_free( &buff ); 464 } 465 #endif 466 } 467 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) ); 468 } 469 470 static void 471 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 472 { 473 kmp_info_t *th; 474 475 if ( __kmp_env_consistency_check ) { 476 th = __kmp_threads[*gtid_ref]; 477 if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) { 478 __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref ); 479 } 480 } 481 } 482 483 template< typename UT > 484 static void 485 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 486 { 487 typedef typename traits_t< UT >::signed_t ST; 488 dispatch_private_info_template< UT > * pr; 489 490 int gtid = *gtid_ref; 491 // int cid = *cid_ref; 492 kmp_info_t *th = __kmp_threads[ gtid ]; 493 KMP_DEBUG_ASSERT( th -> th.th_dispatch ); 494 495 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) ); 496 if ( __kmp_env_consistency_check ) { 497 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 498 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 499 if ( pr -> pushed_ws != ct_none ) { 500 __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref ); 501 } 502 } 503 504 if ( ! th -> th.th_team -> t.t_serialized ) { 505 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 506 ( th -> th.th_dispatch -> th_dispatch_sh_current ); 507 508 if ( ! __kmp_env_consistency_check ) { 509 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 510 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 511 } 512 513 KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration ); 514 #if ! defined( KMP_GOMP_COMPAT ) 515 if ( __kmp_env_consistency_check ) { 516 if ( pr->ordered_bumped != 0 ) { 517 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons; 518 /* How to test it? - OM */ 519 __kmp_error_construct2( 520 kmp_i18n_msg_CnsMultipleNesting, 521 ct_ordered_in_pdo, loc_ref, 522 & p->stack_data[ p->w_top ] 523 ); 524 } 525 } 526 #endif /* !defined(KMP_GOMP_COMPAT) */ 527 528 KMP_MB(); /* Flush all pending memory write invalidates. */ 529 530 pr->ordered_bumped += 1; 531 532 KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n", 533 gtid, pr->ordered_bumped ) ); 534 535 KMP_MB(); /* Flush all pending memory write invalidates. */ 536 537 /* TODO use general release procedure? */ 538 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration ); 539 540 KMP_MB(); /* Flush all pending memory write invalidates. */ 541 } 542 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) ); 543 } 544 545 /* Computes and returns x to the power of y, where y must a non-negative integer */ 546 template< typename UT > 547 static __forceinline long double 548 __kmp_pow(long double x, UT y) { 549 long double s=1.0L; 550 551 KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0); 552 //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned 553 while(y) { 554 if ( y & 1 ) 555 s *= x; 556 x *= x; 557 y >>= 1; 558 } 559 return s; 560 } 561 562 /* Computes and returns the number of unassigned iterations after idx chunks have been assigned 563 (the total number of unassigned iterations in chunks with index greater than or equal to idx). 564 __forceinline seems to be broken so that if we __forceinline this function, the behavior is wrong 565 (one of the unit tests, sch_guided_analytical_basic.cpp, fails) 566 */ 567 template< typename T > 568 static __inline typename traits_t< T >::unsigned_t 569 __kmp_dispatch_guided_remaining( 570 T tc, 571 typename traits_t< T >::floating_t base, 572 typename traits_t< T >::unsigned_t idx 573 ) { 574 /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at 575 least for ICL 8.1, long double arithmetic may not really have 576 long double precision, even with /Qlong_double. Currently, we 577 workaround that in the caller code, by manipulating the FPCW for 578 Windows* OS on IA-32 architecture. The lack of precision is not 579 expected to be a correctness issue, though. 580 */ 581 typedef typename traits_t< T >::unsigned_t UT; 582 583 long double x = tc * __kmp_pow< UT >(base, idx); 584 UT r = (UT) x; 585 if ( x == r ) 586 return r; 587 return r + 1; 588 } 589 590 // Parameters of the guided-iterative algorithm: 591 // p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic 592 // p3 = 1 / ( n * nproc ) // remaining iterations multiplier 593 // by default n = 2. For example with n = 3 the chunks distribution will be more flat. 594 // With n = 1 first chunk is the same as for static schedule, e.g. trip / nproc. 595 static int guided_int_param = 2; 596 static double guided_flt_param = 0.5;// = 1.0 / guided_int_param; 597 598 // UT - unsigned flavor of T, ST - signed flavor of T, 599 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8 600 template< typename T > 601 static void 602 __kmp_dispatch_init( 603 ident_t * loc, 604 int gtid, 605 enum sched_type schedule, 606 T lb, 607 T ub, 608 typename traits_t< T >::signed_t st, 609 typename traits_t< T >::signed_t chunk, 610 int push_ws 611 ) { 612 typedef typename traits_t< T >::unsigned_t UT; 613 typedef typename traits_t< T >::signed_t ST; 614 typedef typename traits_t< T >::floating_t DBL; 615 static const int ___kmp_size_type = sizeof( UT ); 616 617 int active; 618 T tc; 619 kmp_info_t * th; 620 kmp_team_t * team; 621 kmp_uint32 my_buffer_index; 622 dispatch_private_info_template< T > * pr; 623 dispatch_shared_info_template< UT > volatile * sh; 624 625 KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) ); 626 KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) ); 627 628 if ( ! TCR_4( __kmp_init_parallel ) ) 629 __kmp_parallel_initialize(); 630 631 #if INCLUDE_SSC_MARKS 632 SSC_MARK_DISPATCH_INIT(); 633 #endif 634 #ifdef KMP_DEBUG 635 { 636 const char * buff; 637 // create format specifiers before the debug output 638 buff = __kmp_str_format( 639 "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n", 640 traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 641 KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) ); 642 __kmp_str_free( &buff ); 643 } 644 #endif 645 /* setup data */ 646 th = __kmp_threads[ gtid ]; 647 team = th -> th.th_team; 648 active = ! team -> t.t_serialized; 649 th->th.th_ident = loc; 650 651 #if USE_ITT_BUILD 652 kmp_uint64 cur_chunk = chunk; 653 int itt_need_metadata_reporting = __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 654 KMP_MASTER_GTID(gtid) && 655 #if OMP_40_ENABLED 656 th->th.th_teams_microtask == NULL && 657 #endif 658 team->t.t_active_level == 1; 659 #endif 660 if ( ! active ) { 661 pr = reinterpret_cast< dispatch_private_info_template< T >* > 662 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */ 663 } else { 664 KMP_DEBUG_ASSERT( th->th.th_dispatch == 665 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 666 667 my_buffer_index = th->th.th_dispatch->th_disp_index ++; 668 669 /* What happens when number of threads changes, need to resize buffer? */ 670 pr = reinterpret_cast< dispatch_private_info_template< T > * > 671 ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % __kmp_dispatch_num_buffers ] ); 672 sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * > 673 ( &team -> t.t_disp_buffer[ my_buffer_index % __kmp_dispatch_num_buffers ] ); 674 } 675 676 #if ( KMP_STATIC_STEAL_ENABLED ) 677 if ( SCHEDULE_HAS_NONMONOTONIC(schedule) ) 678 // AC: we now have only one implementation of stealing, so use it 679 schedule = kmp_sch_static_steal; 680 else 681 #endif 682 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 683 684 /* Pick up the nomerge/ordered bits from the scheduling type */ 685 if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) { 686 pr->nomerge = TRUE; 687 schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); 688 } else { 689 pr->nomerge = FALSE; 690 } 691 pr->type_size = ___kmp_size_type; // remember the size of variables 692 if ( kmp_ord_lower & schedule ) { 693 pr->ordered = TRUE; 694 schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); 695 } else { 696 pr->ordered = FALSE; 697 } 698 699 if ( schedule == kmp_sch_static ) { 700 schedule = __kmp_static; 701 } else { 702 if ( schedule == kmp_sch_runtime ) { 703 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified) 704 schedule = team -> t.t_sched.r_sched_type; 705 // Detail the schedule if needed (global controls are differentiated appropriately) 706 if ( schedule == kmp_sch_guided_chunked ) { 707 schedule = __kmp_guided; 708 } else if ( schedule == kmp_sch_static ) { 709 schedule = __kmp_static; 710 } 711 // Use the chunk size specified by OMP_SCHEDULE (or default if not specified) 712 chunk = team -> t.t_sched.chunk; 713 #if USE_ITT_BUILD 714 cur_chunk = chunk; 715 #endif 716 #ifdef KMP_DEBUG 717 { 718 const char * buff; 719 // create format specifiers before the debug output 720 buff = __kmp_str_format( 721 "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n", 722 traits_t< ST >::spec ); 723 KD_TRACE(10, ( buff, gtid, schedule, chunk ) ); 724 __kmp_str_free( &buff ); 725 } 726 #endif 727 } else { 728 if ( schedule == kmp_sch_guided_chunked ) { 729 schedule = __kmp_guided; 730 } 731 if ( chunk <= 0 ) { 732 chunk = KMP_DEFAULT_CHUNK; 733 } 734 } 735 736 if ( schedule == kmp_sch_auto ) { 737 // mapping and differentiation: in the __kmp_do_serial_initialize() 738 schedule = __kmp_auto; 739 #ifdef KMP_DEBUG 740 { 741 const char * buff; 742 // create format specifiers before the debug output 743 buff = __kmp_str_format( 744 "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n", 745 traits_t< ST >::spec ); 746 KD_TRACE(10, ( buff, gtid, schedule, chunk ) ); 747 __kmp_str_free( &buff ); 748 } 749 #endif 750 } 751 752 /* guided analytical not safe for too many threads */ 753 if ( schedule == kmp_sch_guided_analytical_chunked && th->th.th_team_nproc > 1<<20 ) { 754 schedule = kmp_sch_guided_iterative_chunked; 755 KMP_WARNING( DispatchManyThreads ); 756 } 757 pr->u.p.parm1 = chunk; 758 } 759 KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper), 760 "unknown scheduling type" ); 761 762 pr->u.p.count = 0; 763 764 if ( __kmp_env_consistency_check ) { 765 if ( st == 0 ) { 766 __kmp_error_construct( 767 kmp_i18n_msg_CnsLoopIncrZeroProhibited, 768 ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc 769 ); 770 } 771 } 772 // compute trip count 773 if ( st == 1 ) { // most common case 774 if ( ub >= lb ) { 775 tc = ub - lb + 1; 776 } else { // ub < lb 777 tc = 0; // zero-trip 778 } 779 } else if ( st < 0 ) { 780 if ( lb >= ub ) { 781 // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B), 782 // where the division needs to be unsigned regardless of the result type 783 tc = (UT)(lb - ub) / (-st) + 1; 784 } else { // lb < ub 785 tc = 0; // zero-trip 786 } 787 } else { // st > 0 788 if ( ub >= lb ) { 789 // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B), 790 // where the division needs to be unsigned regardless of the result type 791 tc = (UT)(ub - lb) / st + 1; 792 } else { // ub < lb 793 tc = 0; // zero-trip 794 } 795 } 796 797 // Any half-decent optimizer will remove this test when the blocks are empty since the macros expand to nothing 798 // when statistics are disabled. 799 if (schedule == __kmp_static) 800 { 801 KMP_COUNT_BLOCK(OMP_FOR_static); 802 KMP_COUNT_VALUE(FOR_static_iterations, tc); 803 } 804 else 805 { 806 KMP_COUNT_BLOCK(OMP_FOR_dynamic); 807 KMP_COUNT_VALUE(FOR_dynamic_iterations, tc); 808 } 809 810 pr->u.p.lb = lb; 811 pr->u.p.ub = ub; 812 pr->u.p.st = st; 813 pr->u.p.tc = tc; 814 815 #if KMP_OS_WINDOWS 816 pr->u.p.last_upper = ub + st; 817 #endif /* KMP_OS_WINDOWS */ 818 819 /* NOTE: only the active parallel region(s) has active ordered sections */ 820 821 if ( active ) { 822 if ( pr->ordered == 0 ) { 823 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error; 824 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error; 825 } else { 826 pr->ordered_bumped = 0; 827 828 pr->u.p.ordered_lower = 1; 829 pr->u.p.ordered_upper = 0; 830 831 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >; 832 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >; 833 } 834 } 835 836 if ( __kmp_env_consistency_check ) { 837 enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo; 838 if ( push_ws ) { 839 __kmp_push_workshare( gtid, ws, loc ); 840 pr->pushed_ws = ws; 841 } else { 842 __kmp_check_workshare( gtid, ws, loc ); 843 pr->pushed_ws = ct_none; 844 } 845 } 846 847 switch ( schedule ) { 848 #if ( KMP_STATIC_STEAL_ENABLED ) 849 case kmp_sch_static_steal: 850 { 851 T nproc = th->th.th_team_nproc; 852 T ntc, init; 853 854 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) ); 855 856 ntc = (tc % chunk ? 1 : 0) + tc / chunk; 857 if ( nproc > 1 && ntc >= nproc ) { 858 T id = __kmp_tid_from_gtid(gtid); 859 T small_chunk, extras; 860 861 small_chunk = ntc / nproc; 862 extras = ntc % nproc; 863 864 init = id * small_chunk + ( id < extras ? id : extras ); 865 pr->u.p.count = init; 866 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 ); 867 868 pr->u.p.parm2 = lb; 869 //pr->pfields.parm3 = 0; // it's not used in static_steal 870 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid 871 pr->u.p.st = st; 872 if ( ___kmp_size_type > 4 ) { 873 // AC: TODO: check if 16-byte CAS available and use it to 874 // improve performance (probably wait for explicit request 875 // before spending time on this). 876 // For now use dynamically allocated per-thread lock, 877 // free memory in __kmp_dispatch_next when status==0. 878 KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL); 879 th->th.th_dispatch->th_steal_lock = 880 (kmp_lock_t*)__kmp_allocate(sizeof(kmp_lock_t)); 881 __kmp_init_lock(th->th.th_dispatch->th_steal_lock); 882 } 883 break; 884 } else { 885 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n", 886 gtid ) ); 887 schedule = kmp_sch_static_balanced; 888 /* too few iterations: fall-through to kmp_sch_static_balanced */ 889 } // if 890 /* FALL-THROUGH to static balanced */ 891 } // case 892 #endif 893 case kmp_sch_static_balanced: 894 { 895 T nproc = th->th.th_team_nproc; 896 T init, limit; 897 898 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n", 899 gtid ) ); 900 901 if ( nproc > 1 ) { 902 T id = __kmp_tid_from_gtid(gtid); 903 904 if ( tc < nproc ) { 905 if ( id < tc ) { 906 init = id; 907 limit = id; 908 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ 909 } else { 910 pr->u.p.count = 1; /* means no more chunks to execute */ 911 pr->u.p.parm1 = FALSE; 912 break; 913 } 914 } else { 915 T small_chunk = tc / nproc; 916 T extras = tc % nproc; 917 init = id * small_chunk + (id < extras ? id : extras); 918 limit = init + small_chunk - (id < extras ? 0 : 1); 919 pr->u.p.parm1 = (id == nproc - 1); 920 } 921 } else { 922 if ( tc > 0 ) { 923 init = 0; 924 limit = tc - 1; 925 pr->u.p.parm1 = TRUE; 926 } else { 927 // zero trip count 928 pr->u.p.count = 1; /* means no more chunks to execute */ 929 pr->u.p.parm1 = FALSE; 930 break; 931 } 932 } 933 #if USE_ITT_BUILD 934 // Calculate chunk for metadata report 935 if ( itt_need_metadata_reporting ) 936 cur_chunk = limit - init + 1; 937 #endif 938 if ( st == 1 ) { 939 pr->u.p.lb = lb + init; 940 pr->u.p.ub = lb + limit; 941 } else { 942 T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound 943 pr->u.p.lb = lb + init * st; 944 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly 945 if ( st > 0 ) { 946 pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp ); 947 } else { 948 pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp ); 949 } 950 } 951 if ( pr->ordered ) { 952 pr->u.p.ordered_lower = init; 953 pr->u.p.ordered_upper = limit; 954 } 955 break; 956 } // case 957 case kmp_sch_guided_iterative_chunked : 958 { 959 T nproc = th->th.th_team_nproc; 960 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid)); 961 962 if ( nproc > 1 ) { 963 if ( (2L * chunk + 1 ) * nproc >= tc ) { 964 /* chunk size too large, switch to dynamic */ 965 schedule = kmp_sch_dynamic_chunked; 966 } else { 967 // when remaining iters become less than parm2 - switch to dynamic 968 pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 ); 969 *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4 970 } 971 } else { 972 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid)); 973 schedule = kmp_sch_static_greedy; 974 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 975 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid)); 976 pr->u.p.parm1 = tc; 977 } // if 978 } // case 979 break; 980 case kmp_sch_guided_analytical_chunked: 981 { 982 T nproc = th->th.th_team_nproc; 983 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid)); 984 985 if ( nproc > 1 ) { 986 if ( (2L * chunk + 1 ) * nproc >= tc ) { 987 /* chunk size too large, switch to dynamic */ 988 schedule = kmp_sch_dynamic_chunked; 989 } else { 990 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 991 DBL x; 992 993 #if KMP_OS_WINDOWS && KMP_ARCH_X86 994 /* Linux* OS already has 64-bit computation by default for 995 long double, and on Windows* OS on Intel(R) 64, 996 /Qlong_double doesn't work. On Windows* OS 997 on IA-32 architecture, we need to set precision to 998 64-bit instead of the default 53-bit. Even though long 999 double doesn't work on Windows* OS on Intel(R) 64, the 1000 resulting lack of precision is not expected to impact 1001 the correctness of the algorithm, but this has not been 1002 mathematically proven. 1003 */ 1004 // save original FPCW and set precision to 64-bit, as 1005 // Windows* OS on IA-32 architecture defaults to 53-bit 1006 unsigned int oldFpcw = _control87(0,0); 1007 _control87(_PC_64,_MCW_PC); // 0,0x30000 1008 #endif 1009 /* value used for comparison in solver for cross-over point */ 1010 long double target = ((long double)chunk * 2 + 1) * nproc / tc; 1011 1012 /* crossover point--chunk indexes equal to or greater than 1013 this point switch to dynamic-style scheduling */ 1014 UT cross; 1015 1016 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 1017 x = (long double)1.0 - (long double)0.5 / nproc; 1018 1019 #ifdef KMP_DEBUG 1020 { // test natural alignment 1021 struct _test_a { 1022 char a; 1023 union { 1024 char b; 1025 DBL d; 1026 }; 1027 } t; 1028 ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; 1029 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment ); 1030 KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 ); 1031 } 1032 #endif // KMP_DEBUG 1033 1034 /* save the term in thread private dispatch structure */ 1035 *(DBL*)&pr->u.p.parm3 = x; 1036 1037 /* solve for the crossover point to the nearest integer i for which C_i <= chunk */ 1038 { 1039 UT left, right, mid; 1040 long double p; 1041 1042 /* estimate initial upper and lower bound */ 1043 1044 /* doesn't matter what value right is as long as it is positive, but 1045 it affects performance of the solver 1046 */ 1047 right = 229; 1048 p = __kmp_pow< UT >(x,right); 1049 if ( p > target ) { 1050 do{ 1051 p *= p; 1052 right <<= 1; 1053 } while(p>target && right < (1<<27)); 1054 left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */ 1055 } else { 1056 left = 0; 1057 } 1058 1059 /* bisection root-finding method */ 1060 while ( left + 1 < right ) { 1061 mid = (left + right) / 2; 1062 if ( __kmp_pow< UT >(x,mid) > target ) { 1063 left = mid; 1064 } else { 1065 right = mid; 1066 } 1067 } // while 1068 cross = right; 1069 } 1070 /* assert sanity of computed crossover point */ 1071 KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target); 1072 1073 /* save the crossover point in thread private dispatch structure */ 1074 pr->u.p.parm2 = cross; 1075 1076 // C75803 1077 #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) ) 1078 #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3) 1079 #else 1080 #define GUIDED_ANALYTICAL_WORKAROUND (x) 1081 #endif 1082 /* dynamic-style scheduling offset */ 1083 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk; 1084 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1085 // restore FPCW 1086 _control87(oldFpcw,_MCW_PC); 1087 #endif 1088 } // if 1089 } else { 1090 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n", 1091 gtid ) ); 1092 schedule = kmp_sch_static_greedy; 1093 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 1094 pr->u.p.parm1 = tc; 1095 } // if 1096 } // case 1097 break; 1098 case kmp_sch_static_greedy: 1099 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid)); 1100 pr->u.p.parm1 = ( th->th.th_team_nproc > 1 ) ? 1101 ( tc + th->th.th_team_nproc - 1 ) / th->th.th_team_nproc : 1102 tc; 1103 break; 1104 case kmp_sch_static_chunked : 1105 case kmp_sch_dynamic_chunked : 1106 if ( pr->u.p.parm1 <= 0 ) { 1107 pr->u.p.parm1 = KMP_DEFAULT_CHUNK; 1108 } 1109 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid)); 1110 break; 1111 case kmp_sch_trapezoidal : 1112 { 1113 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ 1114 1115 T parm1, parm2, parm3, parm4; 1116 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) ); 1117 1118 parm1 = chunk; 1119 1120 /* F : size of the first cycle */ 1121 parm2 = ( tc / (2 * th->th.th_team_nproc) ); 1122 1123 if ( parm2 < 1 ) { 1124 parm2 = 1; 1125 } 1126 1127 /* L : size of the last cycle. Make sure the last cycle 1128 * is not larger than the first cycle. 1129 */ 1130 if ( parm1 < 1 ) { 1131 parm1 = 1; 1132 } else if ( parm1 > parm2 ) { 1133 parm1 = parm2; 1134 } 1135 1136 /* N : number of cycles */ 1137 parm3 = ( parm2 + parm1 ); 1138 parm3 = ( 2 * tc + parm3 - 1) / parm3; 1139 1140 if ( parm3 < 2 ) { 1141 parm3 = 2; 1142 } 1143 1144 /* sigma : decreasing incr of the trapezoid */ 1145 parm4 = ( parm3 - 1 ); 1146 parm4 = ( parm2 - parm1 ) / parm4; 1147 1148 // pointless check, because parm4 >= 0 always 1149 //if ( parm4 < 0 ) { 1150 // parm4 = 0; 1151 //} 1152 1153 pr->u.p.parm1 = parm1; 1154 pr->u.p.parm2 = parm2; 1155 pr->u.p.parm3 = parm3; 1156 pr->u.p.parm4 = parm4; 1157 } // case 1158 break; 1159 1160 default: 1161 { 1162 __kmp_msg( 1163 kmp_ms_fatal, // Severity 1164 KMP_MSG( UnknownSchedTypeDetected ), // Primary message 1165 KMP_HNT( GetNewerLibrary ), // Hint 1166 __kmp_msg_null // Variadic argument list terminator 1167 ); 1168 } 1169 break; 1170 } // switch 1171 pr->schedule = schedule; 1172 if ( active ) { 1173 /* The name of this buffer should be my_buffer_index when it's free to use it */ 1174 1175 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n", 1176 gtid, my_buffer_index, sh->buffer_index) ); 1177 __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 > 1178 USE_ITT_BUILD_ARG( NULL ) 1179 ); 1180 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are 1181 // *always* 32-bit integers. 1182 KMP_MB(); /* is this necessary? */ 1183 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n", 1184 gtid, my_buffer_index, sh->buffer_index) ); 1185 1186 th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr; 1187 th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh; 1188 #if USE_ITT_BUILD 1189 if ( pr->ordered ) { 1190 __kmp_itt_ordered_init( gtid ); 1191 }; // if 1192 // Report loop metadata 1193 if ( itt_need_metadata_reporting ) { 1194 // Only report metadata by master of active team at level 1 1195 kmp_uint64 schedtype = 0; 1196 switch ( schedule ) { 1197 case kmp_sch_static_chunked: 1198 case kmp_sch_static_balanced:// Chunk is calculated in the switch above 1199 break; 1200 case kmp_sch_static_greedy: 1201 cur_chunk = pr->u.p.parm1; 1202 break; 1203 case kmp_sch_dynamic_chunked: 1204 schedtype = 1; 1205 break; 1206 case kmp_sch_guided_iterative_chunked: 1207 case kmp_sch_guided_analytical_chunked: 1208 schedtype = 2; 1209 break; 1210 default: 1211 // Should we put this case under "static"? 1212 // case kmp_sch_static_steal: 1213 schedtype = 3; 1214 break; 1215 } 1216 __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk); 1217 } 1218 #endif /* USE_ITT_BUILD */ 1219 }; // if 1220 1221 #ifdef KMP_DEBUG 1222 { 1223 const char * buff; 1224 // create format specifiers before the debug output 1225 buff = __kmp_str_format( 1226 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \ 1227 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \ 1228 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n", 1229 traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec, 1230 traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec, 1231 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec, 1232 traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec ); 1233 KD_TRACE(10, ( buff, 1234 gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub, 1235 pr->u.p.st, pr->u.p.tc, pr->u.p.count, 1236 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1, 1237 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) ); 1238 __kmp_str_free( &buff ); 1239 } 1240 #endif 1241 #if ( KMP_STATIC_STEAL_ENABLED ) 1242 // It cannot be guaranteed that after execution of a loop with some other schedule kind 1243 // all the parm3 variables will contain the same value. 1244 // Even if all parm3 will be the same, it still exists a bad case like using 0 and 1 1245 // rather than program life-time increment. 1246 // So the dedicated variable is required. The 'static_steal_counter' is used. 1247 if( schedule == kmp_sch_static_steal ) { 1248 // Other threads will inspect this variable when searching for a victim. 1249 // This is a flag showing that other threads may steal from this thread since then. 1250 volatile T * p = &pr->u.p.static_steal_counter; 1251 *p = *p + 1; 1252 } 1253 #endif // ( KMP_STATIC_STEAL_ENABLED ) 1254 1255 #if OMPT_SUPPORT && OMPT_TRACE 1256 if (ompt_enabled && 1257 ompt_callbacks.ompt_callback(ompt_event_loop_begin)) { 1258 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 1259 ompt_task_info_t *task_info = __ompt_get_taskinfo(0); 1260 ompt_callbacks.ompt_callback(ompt_event_loop_begin)( 1261 team_info->parallel_id, task_info->task_id, team_info->microtask); 1262 } 1263 #endif 1264 } 1265 1266 /* 1267 * For ordered loops, either __kmp_dispatch_finish() should be called after 1268 * every iteration, or __kmp_dispatch_finish_chunk() should be called after 1269 * every chunk of iterations. If the ordered section(s) were not executed 1270 * for this iteration (or every iteration in this chunk), we need to set the 1271 * ordered iteration counters so that the next thread can proceed. 1272 */ 1273 template< typename UT > 1274 static void 1275 __kmp_dispatch_finish( int gtid, ident_t *loc ) 1276 { 1277 typedef typename traits_t< UT >::signed_t ST; 1278 kmp_info_t *th = __kmp_threads[ gtid ]; 1279 1280 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) ); 1281 if ( ! th -> th.th_team -> t.t_serialized ) { 1282 1283 dispatch_private_info_template< UT > * pr = 1284 reinterpret_cast< dispatch_private_info_template< UT >* > 1285 ( th->th.th_dispatch->th_dispatch_pr_current ); 1286 dispatch_shared_info_template< UT > volatile * sh = 1287 reinterpret_cast< dispatch_shared_info_template< UT >volatile* > 1288 ( th->th.th_dispatch->th_dispatch_sh_current ); 1289 KMP_DEBUG_ASSERT( pr ); 1290 KMP_DEBUG_ASSERT( sh ); 1291 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1292 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1293 1294 if ( pr->ordered_bumped ) { 1295 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1296 gtid ) ); 1297 pr->ordered_bumped = 0; 1298 } else { 1299 UT lower = pr->u.p.ordered_lower; 1300 1301 #ifdef KMP_DEBUG 1302 { 1303 const char * buff; 1304 // create format specifiers before the debug output 1305 buff = __kmp_str_format( 1306 "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n", 1307 traits_t< UT >::spec, traits_t< UT >::spec ); 1308 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 1309 __kmp_str_free( &buff ); 1310 } 1311 #endif 1312 1313 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 1314 USE_ITT_BUILD_ARG(NULL) 1315 ); 1316 KMP_MB(); /* is this necessary? */ 1317 #ifdef KMP_DEBUG 1318 { 1319 const char * buff; 1320 // create format specifiers before the debug output 1321 buff = __kmp_str_format( 1322 "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n", 1323 traits_t< UT >::spec, traits_t< UT >::spec ); 1324 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 1325 __kmp_str_free( &buff ); 1326 } 1327 #endif 1328 1329 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration ); 1330 } // if 1331 } // if 1332 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) ); 1333 } 1334 1335 #ifdef KMP_GOMP_COMPAT 1336 1337 template< typename UT > 1338 static void 1339 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc ) 1340 { 1341 typedef typename traits_t< UT >::signed_t ST; 1342 kmp_info_t *th = __kmp_threads[ gtid ]; 1343 1344 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) ); 1345 if ( ! th -> th.th_team -> t.t_serialized ) { 1346 // int cid; 1347 dispatch_private_info_template< UT > * pr = 1348 reinterpret_cast< dispatch_private_info_template< UT >* > 1349 ( th->th.th_dispatch->th_dispatch_pr_current ); 1350 dispatch_shared_info_template< UT > volatile * sh = 1351 reinterpret_cast< dispatch_shared_info_template< UT >volatile* > 1352 ( th->th.th_dispatch->th_dispatch_sh_current ); 1353 KMP_DEBUG_ASSERT( pr ); 1354 KMP_DEBUG_ASSERT( sh ); 1355 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1356 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1357 1358 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) { 1359 UT lower = pr->u.p.ordered_lower; 1360 UT upper = pr->u.p.ordered_upper; 1361 UT inc = upper - lower + 1; 1362 1363 if ( pr->ordered_bumped == inc ) { 1364 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1365 gtid ) ); 1366 pr->ordered_bumped = 0; 1367 } else { 1368 inc -= pr->ordered_bumped; 1369 1370 #ifdef KMP_DEBUG 1371 { 1372 const char * buff; 1373 // create format specifiers before the debug output 1374 buff = __kmp_str_format( 1375 "__kmp_dispatch_finish_chunk: T#%%d before wait: " \ 1376 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n", 1377 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec ); 1378 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) ); 1379 __kmp_str_free( &buff ); 1380 } 1381 #endif 1382 1383 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 1384 USE_ITT_BUILD_ARG(NULL) 1385 ); 1386 1387 KMP_MB(); /* is this necessary? */ 1388 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n", 1389 gtid ) ); 1390 pr->ordered_bumped = 0; 1391 //!!!!! TODO check if the inc should be unsigned, or signed??? 1392 #ifdef KMP_DEBUG 1393 { 1394 const char * buff; 1395 // create format specifiers before the debug output 1396 buff = __kmp_str_format( 1397 "__kmp_dispatch_finish_chunk: T#%%d after wait: " \ 1398 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n", 1399 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec ); 1400 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) ); 1401 __kmp_str_free( &buff ); 1402 } 1403 #endif 1404 1405 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc); 1406 } 1407 // } 1408 } 1409 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) ); 1410 } 1411 1412 #endif /* KMP_GOMP_COMPAT */ 1413 1414 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 1415 * (no more work), then tell OMPT the loop is over. In some cases 1416 * kmp_dispatch_fini() is not called. */ 1417 #if OMPT_SUPPORT && OMPT_TRACE 1418 #define OMPT_LOOP_END \ 1419 if (status == 0) { \ 1420 if (ompt_enabled && \ 1421 ompt_callbacks.ompt_callback(ompt_event_loop_end)) { \ 1422 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \ 1423 ompt_task_info_t *task_info = __ompt_get_taskinfo(0); \ 1424 ompt_callbacks.ompt_callback(ompt_event_loop_end)( \ 1425 team_info->parallel_id, task_info->task_id); \ 1426 } \ 1427 } 1428 #else 1429 #define OMPT_LOOP_END // no-op 1430 #endif 1431 1432 template< typename T > 1433 static int 1434 __kmp_dispatch_next( 1435 ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st 1436 ) { 1437 1438 typedef typename traits_t< T >::unsigned_t UT; 1439 typedef typename traits_t< T >::signed_t ST; 1440 typedef typename traits_t< T >::floating_t DBL; 1441 #if ( KMP_STATIC_STEAL_ENABLED ) 1442 static const int ___kmp_size_type = sizeof( UT ); 1443 #endif 1444 1445 // This is potentially slightly misleading, schedule(runtime) will appear here even if the actual runtme schedule 1446 // is static. (Which points out a disadavantage of schedule(runtime): even when static scheduling is used it costs 1447 // more than a compile time choice to use static scheduling would.) 1448 KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling); 1449 1450 int status; 1451 dispatch_private_info_template< T > * pr; 1452 kmp_info_t * th = __kmp_threads[ gtid ]; 1453 kmp_team_t * team = th -> th.th_team; 1454 1455 KMP_DEBUG_ASSERT( p_lb && p_ub && p_st ); // AC: these cannot be NULL 1456 #ifdef KMP_DEBUG 1457 { 1458 const char * buff; 1459 // create format specifiers before the debug output 1460 buff = __kmp_str_format( 1461 "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n", 1462 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 1463 KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) ); 1464 __kmp_str_free( &buff ); 1465 } 1466 #endif 1467 1468 if ( team -> t.t_serialized ) { 1469 /* NOTE: serialize this dispatch becase we are not at the active level */ 1470 pr = reinterpret_cast< dispatch_private_info_template< T >* > 1471 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */ 1472 KMP_DEBUG_ASSERT( pr ); 1473 1474 if ( (status = (pr->u.p.tc != 0)) == 0 ) { 1475 *p_lb = 0; 1476 *p_ub = 0; 1477 // if ( p_last != NULL ) 1478 // *p_last = 0; 1479 if ( p_st != NULL ) 1480 *p_st = 0; 1481 if ( __kmp_env_consistency_check ) { 1482 if ( pr->pushed_ws != ct_none ) { 1483 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 1484 } 1485 } 1486 } else if ( pr->nomerge ) { 1487 kmp_int32 last; 1488 T start; 1489 UT limit, trip, init; 1490 ST incr; 1491 T chunk = pr->u.p.parm1; 1492 1493 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) ); 1494 1495 init = chunk * pr->u.p.count++; 1496 trip = pr->u.p.tc - 1; 1497 1498 if ( (status = (init <= trip)) == 0 ) { 1499 *p_lb = 0; 1500 *p_ub = 0; 1501 // if ( p_last != NULL ) 1502 // *p_last = 0; 1503 if ( p_st != NULL ) 1504 *p_st = 0; 1505 if ( __kmp_env_consistency_check ) { 1506 if ( pr->pushed_ws != ct_none ) { 1507 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 1508 } 1509 } 1510 } else { 1511 start = pr->u.p.lb; 1512 limit = chunk + init - 1; 1513 incr = pr->u.p.st; 1514 1515 if ( (last = (limit >= trip)) != 0 ) { 1516 limit = trip; 1517 #if KMP_OS_WINDOWS 1518 pr->u.p.last_upper = pr->u.p.ub; 1519 #endif /* KMP_OS_WINDOWS */ 1520 } 1521 if ( p_last != NULL ) 1522 *p_last = last; 1523 if ( p_st != NULL ) 1524 *p_st = incr; 1525 if ( incr == 1 ) { 1526 *p_lb = start + init; 1527 *p_ub = start + limit; 1528 } else { 1529 *p_lb = start + init * incr; 1530 *p_ub = start + limit * incr; 1531 } 1532 1533 if ( pr->ordered ) { 1534 pr->u.p.ordered_lower = init; 1535 pr->u.p.ordered_upper = limit; 1536 #ifdef KMP_DEBUG 1537 { 1538 const char * buff; 1539 // create format specifiers before the debug output 1540 buff = __kmp_str_format( 1541 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1542 traits_t< UT >::spec, traits_t< UT >::spec ); 1543 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1544 __kmp_str_free( &buff ); 1545 } 1546 #endif 1547 } // if 1548 } // if 1549 } else { 1550 pr->u.p.tc = 0; 1551 *p_lb = pr->u.p.lb; 1552 *p_ub = pr->u.p.ub; 1553 #if KMP_OS_WINDOWS 1554 pr->u.p.last_upper = *p_ub; 1555 #endif /* KMP_OS_WINDOWS */ 1556 if ( p_last != NULL ) 1557 *p_last = TRUE; 1558 if ( p_st != NULL ) 1559 *p_st = pr->u.p.st; 1560 } // if 1561 #ifdef KMP_DEBUG 1562 { 1563 const char * buff; 1564 // create format specifiers before the debug output 1565 buff = __kmp_str_format( 1566 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \ 1567 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", 1568 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 1569 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) ); 1570 __kmp_str_free( &buff ); 1571 } 1572 #endif 1573 #if INCLUDE_SSC_MARKS 1574 SSC_MARK_DISPATCH_NEXT(); 1575 #endif 1576 OMPT_LOOP_END; 1577 return status; 1578 } else { 1579 kmp_int32 last = 0; 1580 dispatch_shared_info_template< UT > *sh; 1581 T start; 1582 ST incr; 1583 UT limit, trip, init; 1584 1585 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1586 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1587 1588 pr = reinterpret_cast< dispatch_private_info_template< T >* > 1589 ( th->th.th_dispatch->th_dispatch_pr_current ); 1590 KMP_DEBUG_ASSERT( pr ); 1591 sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 1592 ( th->th.th_dispatch->th_dispatch_sh_current ); 1593 KMP_DEBUG_ASSERT( sh ); 1594 1595 if ( pr->u.p.tc == 0 ) { 1596 // zero trip count 1597 status = 0; 1598 } else { 1599 switch (pr->schedule) { 1600 #if ( KMP_STATIC_STEAL_ENABLED ) 1601 case kmp_sch_static_steal: 1602 { 1603 T chunk = pr->u.p.parm1; 1604 int nproc = th->th.th_team_nproc; 1605 1606 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) ); 1607 1608 trip = pr->u.p.tc - 1; 1609 1610 if ( ___kmp_size_type > 4 ) { 1611 // use lock for 8-byte and CAS for 4-byte induction 1612 // variable. TODO (optional): check and use 16-byte CAS 1613 kmp_lock_t * lck = th->th.th_dispatch->th_steal_lock; 1614 KMP_DEBUG_ASSERT(lck != NULL); 1615 if( pr->u.p.count < (UT)pr->u.p.ub ) { 1616 __kmp_acquire_lock(lck, gtid); 1617 // try to get own chunk of iterations 1618 init = ( pr->u.p.count )++; 1619 status = ( init < (UT)pr->u.p.ub ); 1620 __kmp_release_lock(lck, gtid); 1621 } else { 1622 status = 0; // no own chunks 1623 } 1624 if( !status ) { // try to steal 1625 kmp_info_t **other_threads = team->t.t_threads; 1626 int while_limit = nproc; // nproc attempts to find a victim 1627 int while_index = 0; 1628 // TODO: algorithm of searching for a victim 1629 // should be cleaned up and measured 1630 while ( ( !status ) && ( while_limit != ++while_index ) ) { 1631 T remaining; 1632 T victimIdx = pr->u.p.parm4; 1633 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1; 1634 dispatch_private_info_template< T > * victim = 1635 reinterpret_cast< dispatch_private_info_template< T >* > 1636 (other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current); 1637 while( ( victim == NULL || victim == pr || 1638 ( *(volatile T*)&victim->u.p.static_steal_counter != 1639 *(volatile T*)&pr->u.p.static_steal_counter ) ) && 1640 oldVictimIdx != victimIdx ) 1641 { 1642 victimIdx = (victimIdx + 1) % nproc; 1643 victim = reinterpret_cast< dispatch_private_info_template< T >* > 1644 (other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current); 1645 }; 1646 if( !victim || 1647 ( *(volatile T *)&victim->u.p.static_steal_counter != 1648 *(volatile T *)&pr->u.p.static_steal_counter ) ) 1649 { 1650 continue; // try once more (nproc attempts in total) 1651 // no victim is ready yet to participate in stealing 1652 // because all victims are still in kmp_init_dispatch 1653 } 1654 if( victim->u.p.count + 2 > (UT)victim->u.p.ub ) { 1655 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid 1656 continue; // not enough chunks to steal, goto next victim 1657 } 1658 1659 lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock; 1660 KMP_ASSERT(lck != NULL); 1661 __kmp_acquire_lock(lck, gtid); 1662 limit = victim->u.p.ub; // keep initial ub 1663 if( victim->u.p.count >= limit || 1664 (remaining = limit - victim->u.p.count) < 2 ) 1665 { 1666 __kmp_release_lock(lck, gtid); 1667 pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim 1668 continue; // not enough chunks to steal 1669 } 1670 // stealing succeded, reduce victim's ub by 1/4 of undone chunks or by 1 1671 if( remaining > 3 ) { 1672 init = ( victim->u.p.ub -= (remaining>>2) ); // steal 1/4 of remaining 1673 } else { 1674 init = ( victim->u.p.ub -= 1 ); // steal 1 chunk of 2 or 3 remaining 1675 } 1676 __kmp_release_lock(lck, gtid); 1677 1678 KMP_DEBUG_ASSERT(init + 1 <= limit); 1679 pr->u.p.parm4 = victimIdx; // remember victim to steal from 1680 status = 1; 1681 while_index = 0; 1682 // now update own count and ub with stolen range but init chunk 1683 __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid); 1684 pr->u.p.count = init + 1; 1685 pr->u.p.ub = limit; 1686 __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid); 1687 } // while (search for victim) 1688 } // if (try to find victim and steal) 1689 } else { 1690 // 4-byte induction variable, use 8-byte CAS for pair (count, ub) 1691 typedef union { 1692 struct { 1693 UT count; 1694 T ub; 1695 } p; 1696 kmp_int64 b; 1697 } union_i4; 1698 // All operations on 'count' or 'ub' must be combined atomically together. 1699 { 1700 union_i4 vold, vnew; 1701 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count); 1702 vnew = vold; 1703 vnew.p.count++; 1704 while( ! KMP_COMPARE_AND_STORE_ACQ64( 1705 ( volatile kmp_int64* )&pr->u.p.count, 1706 *VOLATILE_CAST(kmp_int64 *)&vold.b, 1707 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) { 1708 KMP_CPU_PAUSE(); 1709 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count); 1710 vnew = vold; 1711 vnew.p.count++; 1712 } 1713 vnew = vold; 1714 init = vnew.p.count; 1715 status = ( init < (UT)vnew.p.ub ) ; 1716 } 1717 1718 if( !status ) { 1719 kmp_info_t **other_threads = team->t.t_threads; 1720 int while_limit = nproc; // nproc attempts to find a victim 1721 int while_index = 0; 1722 1723 // TODO: algorithm of searching for a victim 1724 // should be cleaned up and measured 1725 while ( ( !status ) && ( while_limit != ++while_index ) ) { 1726 union_i4 vold, vnew; 1727 kmp_int32 remaining; 1728 T victimIdx = pr->u.p.parm4; 1729 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1; 1730 dispatch_private_info_template< T > * victim = 1731 reinterpret_cast< dispatch_private_info_template< T >* > 1732 (other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current); 1733 while( (victim == NULL || victim == pr || 1734 (*(volatile T*)&victim->u.p.static_steal_counter != 1735 *(volatile T*)&pr->u.p.static_steal_counter)) && 1736 oldVictimIdx != victimIdx ) 1737 { 1738 victimIdx = (victimIdx + 1) % nproc; 1739 victim = reinterpret_cast< dispatch_private_info_template< T >* > 1740 ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current ); 1741 }; 1742 if( !victim || 1743 ( *(volatile T *)&victim->u.p.static_steal_counter != 1744 *(volatile T *)&pr->u.p.static_steal_counter ) ) 1745 { 1746 continue; // try once more (nproc attempts in total) 1747 // no victim is ready yet to participate in stealing 1748 // because all victims are still in kmp_init_dispatch 1749 } 1750 pr->u.p.parm4 = victimIdx; // new victim found 1751 while( 1 ) { // CAS loop if victim has enough chunks to steal 1752 vold.b = *( volatile kmp_int64 * )( &victim->u.p.count ); 1753 vnew = vold; 1754 1755 KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip ); 1756 if ( vnew.p.count >= (UT)vnew.p.ub || 1757 (remaining = vnew.p.ub - vnew.p.count) < 2 ) 1758 { 1759 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id 1760 break; // not enough chunks to steal, goto next victim 1761 } 1762 if( remaining > 3 ) { 1763 vnew.p.ub -= (remaining>>2); // try to steal 1/4 of remaining 1764 } else { 1765 vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining 1766 } 1767 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1768 // TODO: Should this be acquire or release? 1769 if ( KMP_COMPARE_AND_STORE_ACQ64( 1770 ( volatile kmp_int64 * )&victim->u.p.count, 1771 *VOLATILE_CAST(kmp_int64 *)&vold.b, 1772 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) { 1773 // stealing succedded 1774 status = 1; 1775 while_index = 0; 1776 // now update own count and ub 1777 init = vnew.p.ub; 1778 vold.p.count = init + 1; 1779 #if KMP_ARCH_X86 1780 KMP_XCHG_FIXED64(( volatile kmp_int64 * )(&pr->u.p.count), vold.b); 1781 #else 1782 *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b; 1783 #endif 1784 break; 1785 } // if (check CAS result) 1786 KMP_CPU_PAUSE(); // CAS failed, repeate attempt 1787 } // while (try to steal from particular victim) 1788 } // while (search for victim) 1789 } // if (try to find victim and steal) 1790 } // if (4-byte induction variable) 1791 if ( !status ) { 1792 *p_lb = 0; 1793 *p_ub = 0; 1794 if ( p_st != NULL ) *p_st = 0; 1795 } else { 1796 start = pr->u.p.parm2; 1797 init *= chunk; 1798 limit = chunk + init - 1; 1799 incr = pr->u.p.st; 1800 1801 KMP_DEBUG_ASSERT(init <= trip); 1802 if ( (last = (limit >= trip)) != 0 ) 1803 limit = trip; 1804 if ( p_st != NULL ) *p_st = incr; 1805 1806 if ( incr == 1 ) { 1807 *p_lb = start + init; 1808 *p_ub = start + limit; 1809 } else { 1810 *p_lb = start + init * incr; 1811 *p_ub = start + limit * incr; 1812 } 1813 1814 if ( pr->ordered ) { 1815 pr->u.p.ordered_lower = init; 1816 pr->u.p.ordered_upper = limit; 1817 #ifdef KMP_DEBUG 1818 { 1819 const char * buff; 1820 // create format specifiers before the debug output 1821 buff = __kmp_str_format( 1822 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1823 traits_t< UT >::spec, traits_t< UT >::spec ); 1824 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1825 __kmp_str_free( &buff ); 1826 } 1827 #endif 1828 } // if 1829 } // if 1830 break; 1831 } // case 1832 #endif // ( KMP_STATIC_STEAL_ENABLED ) 1833 case kmp_sch_static_balanced: 1834 { 1835 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) ); 1836 if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */ 1837 pr->u.p.count = 1; 1838 *p_lb = pr->u.p.lb; 1839 *p_ub = pr->u.p.ub; 1840 last = pr->u.p.parm1; 1841 if ( p_st != NULL ) 1842 *p_st = pr->u.p.st; 1843 } else { /* no iterations to do */ 1844 pr->u.p.lb = pr->u.p.ub + pr->u.p.st; 1845 } 1846 if ( pr->ordered ) { 1847 #ifdef KMP_DEBUG 1848 { 1849 const char * buff; 1850 // create format specifiers before the debug output 1851 buff = __kmp_str_format( 1852 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1853 traits_t< UT >::spec, traits_t< UT >::spec ); 1854 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1855 __kmp_str_free( &buff ); 1856 } 1857 #endif 1858 } // if 1859 } // case 1860 break; 1861 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */ 1862 case kmp_sch_static_chunked: 1863 { 1864 T parm1; 1865 1866 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n", 1867 gtid ) ); 1868 parm1 = pr->u.p.parm1; 1869 1870 trip = pr->u.p.tc - 1; 1871 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid)); 1872 1873 if ( (status = (init <= trip)) != 0 ) { 1874 start = pr->u.p.lb; 1875 incr = pr->u.p.st; 1876 limit = parm1 + init - 1; 1877 1878 if ( (last = (limit >= trip)) != 0 ) 1879 limit = trip; 1880 1881 if ( p_st != NULL ) *p_st = incr; 1882 1883 pr->u.p.count += th->th.th_team_nproc; 1884 1885 if ( incr == 1 ) { 1886 *p_lb = start + init; 1887 *p_ub = start + limit; 1888 } 1889 else { 1890 *p_lb = start + init * incr; 1891 *p_ub = start + limit * incr; 1892 } 1893 1894 if ( pr->ordered ) { 1895 pr->u.p.ordered_lower = init; 1896 pr->u.p.ordered_upper = limit; 1897 #ifdef KMP_DEBUG 1898 { 1899 const char * buff; 1900 // create format specifiers before the debug output 1901 buff = __kmp_str_format( 1902 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1903 traits_t< UT >::spec, traits_t< UT >::spec ); 1904 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1905 __kmp_str_free( &buff ); 1906 } 1907 #endif 1908 } // if 1909 } // if 1910 } // case 1911 break; 1912 1913 case kmp_sch_dynamic_chunked: 1914 { 1915 T chunk = pr->u.p.parm1; 1916 1917 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", 1918 gtid ) ); 1919 1920 init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration ); 1921 trip = pr->u.p.tc - 1; 1922 1923 if ( (status = (init <= trip)) == 0 ) { 1924 *p_lb = 0; 1925 *p_ub = 0; 1926 if ( p_st != NULL ) *p_st = 0; 1927 } else { 1928 start = pr->u.p.lb; 1929 limit = chunk + init - 1; 1930 incr = pr->u.p.st; 1931 1932 if ( (last = (limit >= trip)) != 0 ) 1933 limit = trip; 1934 1935 if ( p_st != NULL ) *p_st = incr; 1936 1937 if ( incr == 1 ) { 1938 *p_lb = start + init; 1939 *p_ub = start + limit; 1940 } else { 1941 *p_lb = start + init * incr; 1942 *p_ub = start + limit * incr; 1943 } 1944 1945 if ( pr->ordered ) { 1946 pr->u.p.ordered_lower = init; 1947 pr->u.p.ordered_upper = limit; 1948 #ifdef KMP_DEBUG 1949 { 1950 const char * buff; 1951 // create format specifiers before the debug output 1952 buff = __kmp_str_format( 1953 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1954 traits_t< UT >::spec, traits_t< UT >::spec ); 1955 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1956 __kmp_str_free( &buff ); 1957 } 1958 #endif 1959 } // if 1960 } // if 1961 } // case 1962 break; 1963 1964 case kmp_sch_guided_iterative_chunked: 1965 { 1966 T chunkspec = pr->u.p.parm1; 1967 KD_TRACE(100, 1968 ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid)); 1969 trip = pr->u.p.tc; 1970 // Start atomic part of calculations 1971 while(1) { 1972 ST remaining; // signed, because can be < 0 1973 init = sh->u.s.iteration; // shared value 1974 remaining = trip - init; 1975 if ( remaining <= 0 ) { // AC: need to compare with 0 first 1976 // nothing to do, don't try atomic op 1977 status = 0; 1978 break; 1979 } 1980 if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default 1981 // use dynamic-style shcedule 1982 // atomically inrement iterations, get old value 1983 init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec ); 1984 remaining = trip - init; 1985 if (remaining <= 0) { 1986 status = 0; // all iterations got by other threads 1987 } else { 1988 // got some iterations to work on 1989 status = 1; 1990 if ( (T)remaining > chunkspec ) { 1991 limit = init + chunkspec - 1; 1992 } else { 1993 last = 1; // the last chunk 1994 limit = init + remaining - 1; 1995 } // if 1996 } // if 1997 break; 1998 } // if 1999 limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc 2000 if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) { 2001 // CAS was successful, chunk obtained 2002 status = 1; 2003 --limit; 2004 break; 2005 } // if 2006 } // while 2007 if ( status != 0 ) { 2008 start = pr->u.p.lb; 2009 incr = pr->u.p.st; 2010 if ( p_st != NULL ) 2011 *p_st = incr; 2012 *p_lb = start + init * incr; 2013 *p_ub = start + limit * incr; 2014 if ( pr->ordered ) { 2015 pr->u.p.ordered_lower = init; 2016 pr->u.p.ordered_upper = limit; 2017 #ifdef KMP_DEBUG 2018 { 2019 const char * buff; 2020 // create format specifiers before the debug output 2021 buff = __kmp_str_format( 2022 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 2023 traits_t< UT >::spec, traits_t< UT >::spec ); 2024 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 2025 __kmp_str_free( &buff ); 2026 } 2027 #endif 2028 } // if 2029 } else { 2030 *p_lb = 0; 2031 *p_ub = 0; 2032 if ( p_st != NULL ) 2033 *p_st = 0; 2034 } // if 2035 } // case 2036 break; 2037 2038 case kmp_sch_guided_analytical_chunked: 2039 { 2040 T chunkspec = pr->u.p.parm1; 2041 UT chunkIdx; 2042 #if KMP_OS_WINDOWS && KMP_ARCH_X86 2043 /* for storing original FPCW value for Windows* OS on 2044 IA-32 architecture 8-byte version */ 2045 unsigned int oldFpcw; 2046 unsigned int fpcwSet = 0; 2047 #endif 2048 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n", 2049 gtid ) ); 2050 2051 trip = pr->u.p.tc; 2052 2053 KMP_DEBUG_ASSERT(th->th.th_team_nproc > 1); 2054 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)th->th.th_team_nproc < trip); 2055 2056 while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */ 2057 chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration ); 2058 if ( chunkIdx >= (UT)pr->u.p.parm2 ) { 2059 --trip; 2060 /* use dynamic-style scheduling */ 2061 init = chunkIdx * chunkspec + pr->u.p.count; 2062 /* need to verify init > 0 in case of overflow in the above calculation */ 2063 if ( (status = (init > 0 && init <= trip)) != 0 ) { 2064 limit = init + chunkspec -1; 2065 2066 if ( (last = (limit >= trip)) != 0 ) 2067 limit = trip; 2068 } 2069 break; 2070 } else { 2071 /* use exponential-style scheduling */ 2072 /* The following check is to workaround the lack of long double precision on Windows* OS. 2073 This check works around the possible effect that init != 0 for chunkIdx == 0. 2074 */ 2075 #if KMP_OS_WINDOWS && KMP_ARCH_X86 2076 /* If we haven't already done so, save original 2077 FPCW and set precision to 64-bit, as Windows* OS 2078 on IA-32 architecture defaults to 53-bit */ 2079 if ( !fpcwSet ) { 2080 oldFpcw = _control87(0,0); 2081 _control87(_PC_64,_MCW_PC); 2082 fpcwSet = 0x30000; 2083 } 2084 #endif 2085 if ( chunkIdx ) { 2086 init = __kmp_dispatch_guided_remaining< T >( 2087 trip, *( DBL * )&pr->u.p.parm3, chunkIdx ); 2088 KMP_DEBUG_ASSERT(init); 2089 init = trip - init; 2090 } else 2091 init = 0; 2092 limit = trip - __kmp_dispatch_guided_remaining< T >( 2093 trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 ); 2094 KMP_ASSERT(init <= limit); 2095 if ( init < limit ) { 2096 KMP_DEBUG_ASSERT(limit <= trip); 2097 --limit; 2098 status = 1; 2099 break; 2100 } // if 2101 } // if 2102 } // while (1) 2103 #if KMP_OS_WINDOWS && KMP_ARCH_X86 2104 /* restore FPCW if necessary 2105 AC: check fpcwSet flag first because oldFpcw can be uninitialized here 2106 */ 2107 if ( fpcwSet && ( oldFpcw & fpcwSet ) ) 2108 _control87(oldFpcw,_MCW_PC); 2109 #endif 2110 if ( status != 0 ) { 2111 start = pr->u.p.lb; 2112 incr = pr->u.p.st; 2113 if ( p_st != NULL ) 2114 *p_st = incr; 2115 *p_lb = start + init * incr; 2116 *p_ub = start + limit * incr; 2117 if ( pr->ordered ) { 2118 pr->u.p.ordered_lower = init; 2119 pr->u.p.ordered_upper = limit; 2120 #ifdef KMP_DEBUG 2121 { 2122 const char * buff; 2123 // create format specifiers before the debug output 2124 buff = __kmp_str_format( 2125 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 2126 traits_t< UT >::spec, traits_t< UT >::spec ); 2127 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 2128 __kmp_str_free( &buff ); 2129 } 2130 #endif 2131 } 2132 } else { 2133 *p_lb = 0; 2134 *p_ub = 0; 2135 if ( p_st != NULL ) 2136 *p_st = 0; 2137 } 2138 } // case 2139 break; 2140 2141 case kmp_sch_trapezoidal: 2142 { 2143 UT index; 2144 T parm2 = pr->u.p.parm2; 2145 T parm3 = pr->u.p.parm3; 2146 T parm4 = pr->u.p.parm4; 2147 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n", 2148 gtid ) ); 2149 2150 index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration ); 2151 2152 init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2; 2153 trip = pr->u.p.tc - 1; 2154 2155 if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) { 2156 *p_lb = 0; 2157 *p_ub = 0; 2158 if ( p_st != NULL ) *p_st = 0; 2159 } else { 2160 start = pr->u.p.lb; 2161 limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1; 2162 incr = pr->u.p.st; 2163 2164 if ( (last = (limit >= trip)) != 0 ) 2165 limit = trip; 2166 2167 if ( p_st != NULL ) *p_st = incr; 2168 2169 if ( incr == 1 ) { 2170 *p_lb = start + init; 2171 *p_ub = start + limit; 2172 } else { 2173 *p_lb = start + init * incr; 2174 *p_ub = start + limit * incr; 2175 } 2176 2177 if ( pr->ordered ) { 2178 pr->u.p.ordered_lower = init; 2179 pr->u.p.ordered_upper = limit; 2180 #ifdef KMP_DEBUG 2181 { 2182 const char * buff; 2183 // create format specifiers before the debug output 2184 buff = __kmp_str_format( 2185 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 2186 traits_t< UT >::spec, traits_t< UT >::spec ); 2187 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 2188 __kmp_str_free( &buff ); 2189 } 2190 #endif 2191 } // if 2192 } // if 2193 } // case 2194 break; 2195 default: 2196 { 2197 status = 0; // to avoid complaints on uninitialized variable use 2198 __kmp_msg( 2199 kmp_ms_fatal, // Severity 2200 KMP_MSG( UnknownSchedTypeDetected ), // Primary message 2201 KMP_HNT( GetNewerLibrary ), // Hint 2202 __kmp_msg_null // Variadic argument list terminator 2203 ); 2204 } 2205 break; 2206 } // switch 2207 } // if tc == 0; 2208 2209 if ( status == 0 ) { 2210 UT num_done; 2211 2212 num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done ); 2213 #ifdef KMP_DEBUG 2214 { 2215 const char * buff; 2216 // create format specifiers before the debug output 2217 buff = __kmp_str_format( 2218 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", 2219 traits_t< UT >::spec ); 2220 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) ); 2221 __kmp_str_free( &buff ); 2222 } 2223 #endif 2224 2225 if ( (ST)num_done == th->th.th_team_nproc - 1 ) { 2226 #if ( KMP_STATIC_STEAL_ENABLED ) 2227 if( pr->schedule == kmp_sch_static_steal && ___kmp_size_type > 4 ) { 2228 int i; 2229 kmp_info_t **other_threads = team->t.t_threads; 2230 // loop complete, safe to destroy locks used for stealing 2231 for( i = 0; i < th->th.th_team_nproc; ++i ) { 2232 kmp_lock_t * lck = other_threads[i]->th.th_dispatch->th_steal_lock; 2233 KMP_ASSERT(lck != NULL); 2234 __kmp_destroy_lock( lck ); 2235 __kmp_free( lck ); 2236 other_threads[i]->th.th_dispatch->th_steal_lock = NULL; 2237 } 2238 } 2239 #endif 2240 /* NOTE: release this buffer to be reused */ 2241 2242 KMP_MB(); /* Flush all pending memory write invalidates. */ 2243 2244 sh->u.s.num_done = 0; 2245 sh->u.s.iteration = 0; 2246 2247 /* TODO replace with general release procedure? */ 2248 if ( pr->ordered ) { 2249 sh->u.s.ordered_iteration = 0; 2250 } 2251 2252 KMP_MB(); /* Flush all pending memory write invalidates. */ 2253 2254 sh -> buffer_index += __kmp_dispatch_num_buffers; 2255 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", 2256 gtid, sh->buffer_index) ); 2257 2258 KMP_MB(); /* Flush all pending memory write invalidates. */ 2259 2260 } // if 2261 if ( __kmp_env_consistency_check ) { 2262 if ( pr->pushed_ws != ct_none ) { 2263 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 2264 } 2265 } 2266 2267 th -> th.th_dispatch -> th_deo_fcn = NULL; 2268 th -> th.th_dispatch -> th_dxo_fcn = NULL; 2269 th -> th.th_dispatch -> th_dispatch_sh_current = NULL; 2270 th -> th.th_dispatch -> th_dispatch_pr_current = NULL; 2271 } // if (status == 0) 2272 #if KMP_OS_WINDOWS 2273 else if ( last ) { 2274 pr->u.p.last_upper = pr->u.p.ub; 2275 } 2276 #endif /* KMP_OS_WINDOWS */ 2277 if ( p_last != NULL && status != 0 ) 2278 *p_last = last; 2279 } // if 2280 2281 #ifdef KMP_DEBUG 2282 { 2283 const char * buff; 2284 // create format specifiers before the debug output 2285 buff = __kmp_str_format( 2286 "__kmp_dispatch_next: T#%%d normal case: " \ 2287 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n", 2288 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 2289 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) ); 2290 __kmp_str_free( &buff ); 2291 } 2292 #endif 2293 #if INCLUDE_SSC_MARKS 2294 SSC_MARK_DISPATCH_NEXT(); 2295 #endif 2296 OMPT_LOOP_END; 2297 return status; 2298 } 2299 2300 template< typename T > 2301 static void 2302 __kmp_dist_get_bounds( 2303 ident_t *loc, 2304 kmp_int32 gtid, 2305 kmp_int32 *plastiter, 2306 T *plower, 2307 T *pupper, 2308 typename traits_t< T >::signed_t incr 2309 ) { 2310 typedef typename traits_t< T >::unsigned_t UT; 2311 typedef typename traits_t< T >::signed_t ST; 2312 register kmp_uint32 team_id; 2313 register kmp_uint32 nteams; 2314 register UT trip_count; 2315 register kmp_team_t *team; 2316 kmp_info_t * th; 2317 2318 KMP_DEBUG_ASSERT( plastiter && plower && pupper ); 2319 KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid)); 2320 #ifdef KMP_DEBUG 2321 { 2322 const char * buff; 2323 // create format specifiers before the debug output 2324 buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\ 2325 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n", 2326 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec, 2327 traits_t< T >::spec ); 2328 KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) ); 2329 __kmp_str_free( &buff ); 2330 } 2331 #endif 2332 2333 if( __kmp_env_consistency_check ) { 2334 if( incr == 0 ) { 2335 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc ); 2336 } 2337 if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) { 2338 // The loop is illegal. 2339 // Some zero-trip loops maintained by compiler, e.g.: 2340 // for(i=10;i<0;++i) // lower >= upper - run-time check 2341 // for(i=0;i>10;--i) // lower <= upper - run-time check 2342 // for(i=0;i>10;++i) // incr > 0 - compile-time check 2343 // for(i=10;i<0;--i) // incr < 0 - compile-time check 2344 // Compiler does not check the following illegal loops: 2345 // for(i=0;i<10;i+=incr) // where incr<0 2346 // for(i=10;i>0;i-=incr) // where incr<0 2347 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc ); 2348 } 2349 } 2350 th = __kmp_threads[gtid]; 2351 team = th->th.th_team; 2352 #if OMP_40_ENABLED 2353 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct 2354 nteams = th->th.th_teams_size.nteams; 2355 #endif 2356 team_id = team->t.t_master_tid; 2357 KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc); 2358 2359 // compute global trip count 2360 if( incr == 1 ) { 2361 trip_count = *pupper - *plower + 1; 2362 } else if(incr == -1) { 2363 trip_count = *plower - *pupper + 1; 2364 } else if ( incr > 0 ) { 2365 // upper-lower can exceed the limit of signed type 2366 trip_count = (UT)(*pupper - *plower) / incr + 1; 2367 } else { 2368 trip_count = (UT)(*plower - *pupper) / ( -incr ) + 1; 2369 } 2370 2371 if( trip_count <= nteams ) { 2372 KMP_DEBUG_ASSERT( 2373 __kmp_static == kmp_sch_static_greedy || \ 2374 __kmp_static == kmp_sch_static_balanced 2375 ); // Unknown static scheduling type. 2376 // only some teams get single iteration, others get nothing 2377 if( team_id < trip_count ) { 2378 *pupper = *plower = *plower + team_id * incr; 2379 } else { 2380 *plower = *pupper + incr; // zero-trip loop 2381 } 2382 if( plastiter != NULL ) 2383 *plastiter = ( team_id == trip_count - 1 ); 2384 } else { 2385 if( __kmp_static == kmp_sch_static_balanced ) { 2386 register UT chunk = trip_count / nteams; 2387 register UT extras = trip_count % nteams; 2388 *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) ); 2389 *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr ); 2390 if( plastiter != NULL ) 2391 *plastiter = ( team_id == nteams - 1 ); 2392 } else { 2393 register T chunk_inc_count = 2394 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr; 2395 register T upper = *pupper; 2396 KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy ); 2397 // Unknown static scheduling type. 2398 *plower += team_id * chunk_inc_count; 2399 *pupper = *plower + chunk_inc_count - incr; 2400 // Check/correct bounds if needed 2401 if( incr > 0 ) { 2402 if( *pupper < *plower ) 2403 *pupper = i_maxmin< T >::mx; 2404 if( plastiter != NULL ) 2405 *plastiter = *plower <= upper && *pupper > upper - incr; 2406 if( *pupper > upper ) 2407 *pupper = upper; // tracker C73258 2408 } else { 2409 if( *pupper > *plower ) 2410 *pupper = i_maxmin< T >::mn; 2411 if( plastiter != NULL ) 2412 *plastiter = *plower >= upper && *pupper < upper - incr; 2413 if( *pupper < upper ) 2414 *pupper = upper; // tracker C73258 2415 } 2416 } 2417 } 2418 } 2419 2420 //----------------------------------------------------------------------------------------- 2421 // Dispatch routines 2422 // Transfer call to template< type T > 2423 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule, 2424 // T lb, T ub, ST st, ST chunk ) 2425 extern "C" { 2426 2427 /*! 2428 @ingroup WORK_SHARING 2429 @{ 2430 @param loc Source location 2431 @param gtid Global thread id 2432 @param schedule Schedule type 2433 @param lb Lower bound 2434 @param ub Upper bound 2435 @param st Step (or increment if you prefer) 2436 @param chunk The chunk size to block with 2437 2438 This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments. 2439 These functions are all identical apart from the types of the arguments. 2440 */ 2441 2442 void 2443 __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2444 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk ) 2445 { 2446 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2447 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2448 } 2449 /*! 2450 See @ref __kmpc_dispatch_init_4 2451 */ 2452 void 2453 __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2454 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk ) 2455 { 2456 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2457 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2458 } 2459 2460 /*! 2461 See @ref __kmpc_dispatch_init_4 2462 */ 2463 void 2464 __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2465 kmp_int64 lb, kmp_int64 ub, 2466 kmp_int64 st, kmp_int64 chunk ) 2467 { 2468 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2469 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2470 } 2471 2472 /*! 2473 See @ref __kmpc_dispatch_init_4 2474 */ 2475 void 2476 __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2477 kmp_uint64 lb, kmp_uint64 ub, 2478 kmp_int64 st, kmp_int64 chunk ) 2479 { 2480 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2481 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2482 } 2483 2484 /*! 2485 See @ref __kmpc_dispatch_init_4 2486 2487 Difference from __kmpc_dispatch_init set of functions is these functions 2488 are called for composite distribute parallel for construct. Thus before 2489 regular iterations dispatching we need to calc per-team iteration space. 2490 2491 These functions are all identical apart from the types of the arguments. 2492 */ 2493 void 2494 __kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2495 kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk ) 2496 { 2497 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2498 __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st ); 2499 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2500 } 2501 2502 void 2503 __kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2504 kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk ) 2505 { 2506 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2507 __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st ); 2508 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2509 } 2510 2511 void 2512 __kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2513 kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk ) 2514 { 2515 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2516 __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st ); 2517 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2518 } 2519 2520 void 2521 __kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2522 kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk ) 2523 { 2524 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2525 __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st ); 2526 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2527 } 2528 2529 /*! 2530 @param loc Source code location 2531 @param gtid Global thread id 2532 @param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise 2533 @param p_lb Pointer to the lower bound for the next chunk of work 2534 @param p_ub Pointer to the upper bound for the next chunk of work 2535 @param p_st Pointer to the stride for the next chunk of work 2536 @return one if there is work to be done, zero otherwise 2537 2538 Get the next dynamically allocated chunk of work for this thread. 2539 If there is no more work, then the lb,ub and stride need not be modified. 2540 */ 2541 int 2542 __kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2543 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st ) 2544 { 2545 return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st ); 2546 } 2547 2548 /*! 2549 See @ref __kmpc_dispatch_next_4 2550 */ 2551 int 2552 __kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2553 kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st ) 2554 { 2555 return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st ); 2556 } 2557 2558 /*! 2559 See @ref __kmpc_dispatch_next_4 2560 */ 2561 int 2562 __kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2563 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st ) 2564 { 2565 return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st ); 2566 } 2567 2568 /*! 2569 See @ref __kmpc_dispatch_next_4 2570 */ 2571 int 2572 __kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2573 kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st ) 2574 { 2575 return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st ); 2576 } 2577 2578 /*! 2579 @param loc Source code location 2580 @param gtid Global thread id 2581 2582 Mark the end of a dynamic loop. 2583 */ 2584 void 2585 __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid ) 2586 { 2587 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc ); 2588 } 2589 2590 /*! 2591 See @ref __kmpc_dispatch_fini_4 2592 */ 2593 void 2594 __kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid ) 2595 { 2596 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc ); 2597 } 2598 2599 /*! 2600 See @ref __kmpc_dispatch_fini_4 2601 */ 2602 void 2603 __kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid ) 2604 { 2605 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc ); 2606 } 2607 2608 /*! 2609 See @ref __kmpc_dispatch_fini_4 2610 */ 2611 void 2612 __kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid ) 2613 { 2614 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc ); 2615 } 2616 /*! @} */ 2617 2618 //----------------------------------------------------------------------------------------- 2619 //Non-template routines from kmp_dispatch.c used in other sources 2620 2621 kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) { 2622 return value == checker; 2623 } 2624 2625 kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) { 2626 return value != checker; 2627 } 2628 2629 kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) { 2630 return value < checker; 2631 } 2632 2633 kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) { 2634 return value >= checker; 2635 } 2636 2637 kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) { 2638 return value <= checker; 2639 } 2640 2641 kmp_uint32 2642 __kmp_wait_yield_4(volatile kmp_uint32 * spinner, 2643 kmp_uint32 checker, 2644 kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 ) 2645 , void * obj // Higher-level synchronization object, or NULL. 2646 ) 2647 { 2648 // note: we may not belong to a team at this point 2649 register volatile kmp_uint32 * spin = spinner; 2650 register kmp_uint32 check = checker; 2651 register kmp_uint32 spins; 2652 register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred; 2653 register kmp_uint32 r; 2654 2655 KMP_FSYNC_SPIN_INIT( obj, (void*) spin ); 2656 KMP_INIT_YIELD( spins ); 2657 // main wait spin loop 2658 while(!f(r = TCR_4(*spin), check)) { 2659 KMP_FSYNC_SPIN_PREPARE( obj ); 2660 /* GEH - remove this since it was accidentally introduced when kmp_wait was split. 2661 It causes problems with infinite recursion because of exit lock */ 2662 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 2663 __kmp_abort_thread(); */ 2664 2665 /* if we have waited a bit, or are oversubscribed, yield */ 2666 /* pause is in the following code */ 2667 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc ); 2668 KMP_YIELD_SPIN( spins ); 2669 } 2670 KMP_FSYNC_SPIN_ACQUIRED( obj ); 2671 return r; 2672 } 2673 2674 void 2675 __kmp_wait_yield_4_ptr(void *spinner, 2676 kmp_uint32 checker, 2677 kmp_uint32 (*pred)( void *, kmp_uint32 ), 2678 void *obj // Higher-level synchronization object, or NULL. 2679 ) 2680 { 2681 // note: we may not belong to a team at this point 2682 register void *spin = spinner; 2683 register kmp_uint32 check = checker; 2684 register kmp_uint32 spins; 2685 register kmp_uint32 (*f) ( void *, kmp_uint32 ) = pred; 2686 2687 KMP_FSYNC_SPIN_INIT( obj, spin ); 2688 KMP_INIT_YIELD( spins ); 2689 // main wait spin loop 2690 while ( !f( spin, check ) ) { 2691 KMP_FSYNC_SPIN_PREPARE( obj ); 2692 /* if we have waited a bit, or are oversubscribed, yield */ 2693 /* pause is in the following code */ 2694 KMP_YIELD( TCR_4( __kmp_nth ) > __kmp_avail_proc ); 2695 KMP_YIELD_SPIN( spins ); 2696 } 2697 KMP_FSYNC_SPIN_ACQUIRED( obj ); 2698 } 2699 2700 } // extern "C" 2701 2702 #ifdef KMP_GOMP_COMPAT 2703 2704 void 2705 __kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2706 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, 2707 kmp_int32 chunk, int push_ws ) 2708 { 2709 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, 2710 push_ws ); 2711 } 2712 2713 void 2714 __kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2715 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, 2716 kmp_int32 chunk, int push_ws ) 2717 { 2718 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, 2719 push_ws ); 2720 } 2721 2722 void 2723 __kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2724 kmp_int64 lb, kmp_int64 ub, kmp_int64 st, 2725 kmp_int64 chunk, int push_ws ) 2726 { 2727 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, 2728 push_ws ); 2729 } 2730 2731 void 2732 __kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2733 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, 2734 kmp_int64 chunk, int push_ws ) 2735 { 2736 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, 2737 push_ws ); 2738 } 2739 2740 void 2741 __kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid ) 2742 { 2743 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc ); 2744 } 2745 2746 void 2747 __kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid ) 2748 { 2749 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc ); 2750 } 2751 2752 void 2753 __kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid ) 2754 { 2755 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc ); 2756 } 2757 2758 void 2759 __kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid ) 2760 { 2761 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc ); 2762 } 2763 2764 #endif /* KMP_GOMP_COMPAT */ 2765 2766 /* ------------------------------------------------------------------------ */ 2767 /* ------------------------------------------------------------------------ */ 2768 2769