/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 */


//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


/*
 * Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
 *       it may change values between parallel regions.  __kmp_max_nth
 *       is the largest value __kmp_nth may take, 1 is the smallest.
 *
 */

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

// Need to raise Win version from XP to Vista here for support of InterlockedExchange64
#if defined(_WIN32_WINNT) && defined(_M_IX86)
#undef _WIN32_WINNT
#define _WIN32_WINNT 0x0502
#endif

#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_str.h"
#include "kmp_error.h"
#include "kmp_stats.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
#include <float.h>
#endif

#if OMPT_SUPPORT
#include "ompt-internal.h"
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#if KMP_STATIC_STEAL_ENABLED

// replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
template< typename T >
struct dispatch_private_infoXX_template {
    typedef typename traits_t< T >::unsigned_t  UT;
    typedef typename traits_t< T >::signed_t    ST;
    UT count;                // unsigned
    T  ub;
    /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
    T  lb;
    ST st;                   // signed
    UT tc;                   // unsigned
    T  static_steal_counter; // for static_steal only; maybe better to put after ub

    /* parm[1-4] are used in different ways by different scheduling algorithms */

    // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
    //    a) parm3 is properly aligned and
    //    b) all parm1-4 are in the same cache line.
    // Because parm1-4 are used together, performance seems to be better
    // if they are in the same cache line (not measured though).
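    // For example, with T = kmp_int64 the four parms below occupy 32 bytes, so 32-byte
    // alignment keeps them within a single cache line (assuming the usual 64-byte lines).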

    struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
        T  parm1;
        T  parm2;
        T  parm3;
        T  parm4;
    };

    UT ordered_lower; // unsigned
    UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
    T  last_upper;
#endif /* KMP_OS_WINDOWS */
};

#else /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
template< typename T >
struct dispatch_private_infoXX_template {
    typedef typename traits_t< T >::unsigned_t  UT;
    typedef typename traits_t< T >::signed_t    ST;
    T  lb;
    T  ub;
    ST st; // signed
    UT tc; // unsigned

    T  parm1;
    T  parm2;
    T  parm3;
    T  parm4;

    UT count; // unsigned

    UT ordered_lower; // unsigned
    UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
    T  last_upper;
#endif /* KMP_OS_WINDOWS */
};

#endif /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info structure and dispatch_private_info_t type
template< typename T >
struct KMP_ALIGN_CACHE dispatch_private_info_template {
    // duplicate alignment here, otherwise size of structure is not correct in our compiler
    union KMP_ALIGN_CACHE private_info_tmpl {
        dispatch_private_infoXX_template< T > p;
        dispatch_private_info64_t             p64;
    } u;
    enum sched_type schedule;  /* scheduling algorithm */
    kmp_uint32      ordered;   /* ordered clause specified */
    kmp_uint32      ordered_bumped;
    kmp_int32       ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
    dispatch_private_info * next; /* stack of buffers for nest of serial regions */
    kmp_uint32      nomerge;   /* don't merge iters if serialized */
    kmp_uint32      type_size;
    enum cons_type  pushed_ws;
};


// replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
template< typename UT >
struct dispatch_shared_infoXX_template {
    /* chunk index under dynamic, number of idle threads under static-steal;
       iteration index otherwise */
    volatile UT iteration;
    volatile UT num_done;
    volatile UT ordered_iteration;
    UT ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size making ordered_iteration scalar
};

// replaces dispatch_shared_info structure and dispatch_shared_info_t type
template< typename UT >
struct dispatch_shared_info_template {
    // we need union here to keep the structure size
    union shared_info_tmpl {
        dispatch_shared_infoXX_template< UT > s;
        dispatch_shared_info64_t              s64;
    } u;
    volatile kmp_uint32 buffer_index;
#if OMP_45_ENABLED
    volatile kmp_int32  doacross_buf_idx;  // teamwise index
    kmp_uint32         *doacross_flags;    // array of iteration flags (0/1)
    kmp_int32           doacross_num_done; // count finished threads
#endif
#if KMP_USE_HWLOC
    // When linking with libhwloc, the ORDERED EPCC test slows down on big
    // machines (> 48 cores). Performance analysis showed that a cache thrash
    // was occurring and this padding helps alleviate the problem.
    char padding[64];
#endif
};

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#undef USE_TEST_LOCKS

// test_then_add template (general template should NOT be used)
template< typename T >
static __forceinline T
test_then_add( volatile T *p, T d );

template<>
__forceinline kmp_int32
test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
{
    kmp_int32 r;
    r = KMP_TEST_THEN_ADD32( p, d );
    return r;
}

template<>
__forceinline kmp_int64
test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
{
    kmp_int64 r;
    r = KMP_TEST_THEN_ADD64( p, d );
    return r;
}

// test_then_inc_acq template (general template should NOT be used)
template< typename T >
static __forceinline T
test_then_inc_acq( volatile T *p );

template<>
__forceinline kmp_int32
test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
{
    kmp_int32 r;
    r = KMP_TEST_THEN_INC_ACQ32( p );
    return r;
}

template<>
__forceinline kmp_int64
test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
{
    kmp_int64 r;
    r = KMP_TEST_THEN_INC_ACQ64( p );
    return r;
}

// test_then_inc template (general template should NOT be used)
template< typename T >
static __forceinline T
test_then_inc( volatile T *p );

template<>
__forceinline kmp_int32
test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
{
    kmp_int32 r;
    r = KMP_TEST_THEN_INC32( p );
    return r;
}

template<>
__forceinline kmp_int64
test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
{
    kmp_int64 r;
    r = KMP_TEST_THEN_INC64( p );
    return r;
}

// compare_and_swap template (general template should NOT be used)
template< typename T >
static __forceinline kmp_int32
compare_and_swap( volatile T *p, T c, T s );

template<>
__forceinline kmp_int32
compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
{
    return KMP_COMPARE_AND_STORE_REL32( p, c, s );
}

template<>
__forceinline kmp_int32
compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
{
    return KMP_COMPARE_AND_STORE_REL64( p, c, s );
}

/*
    Spin wait loop that first does pause, then yield.
    Waits until the predicate returns non-zero when called with *spinner and check.
    Does NOT put threads to sleep.
#if USE_ITT_BUILD
    Arguments:
        obj -- is higher-level synchronization object to report to ittnotify. It is used to report
            locks consistently. For example, if lock is acquired immediately, its address is
            reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
            immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the same
            address, not an address of low-level spinner.
#endif // USE_ITT_BUILD
*/
template< typename UT >
// ToDo: make inline function (move to header file for icl)
static UT  // unsigned 4- or 8-byte type
__kmp_wait_yield( volatile UT * spinner,
                  UT            checker,
                  kmp_uint32 (* pred)( UT, UT )
                  USE_ITT_BUILD_ARG(void * obj)    // Higher-level synchronization object, or NULL.
                )
{
    // note: we may not belong to a team at this point
    register volatile UT     * spin  = spinner;
    register          UT       check = checker;
    register          kmp_uint32   spins;
    register          kmp_uint32 (*f) ( UT, UT ) = pred;
    register          UT       r;

    KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
    KMP_INIT_YIELD( spins );
    // main wait spin loop
    while(!f(r = *spin, check))
    {
        KMP_FSYNC_SPIN_PREPARE( obj );
        /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
           It causes problems with infinite recursion because of exit lock */
        /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
            __kmp_abort_thread(); */

        // if we are oversubscribed,
        // or have waited a bit (and KMP_LIBRARY=throughput), then yield
        // pause is in the following code
        KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
        KMP_YIELD_SPIN( spins );
    }
    KMP_FSYNC_SPIN_ACQUIRED( obj );
    return r;
}

template< typename UT >
static kmp_uint32 __kmp_eq( UT value, UT checker) {
    return value == checker;
}

template< typename UT >
static kmp_uint32 __kmp_neq( UT value, UT checker) {
    return value != checker;
}

template< typename UT >
static kmp_uint32 __kmp_lt( UT value, UT checker) {
    return value < checker;
}

template< typename UT >
static kmp_uint32 __kmp_ge( UT value, UT checker) {
    return value >= checker;
}

template< typename UT >
static kmp_uint32 __kmp_le( UT value, UT checker) {
    return value <= checker;
}


/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

static void
__kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
{
    kmp_info_t *th;

    KMP_DEBUG_ASSERT( gtid_ref );

    if ( __kmp_env_consistency_check ) {
        th = __kmp_threads[*gtid_ref];
        if ( th -> th.th_root -> r.r_active
          && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
#if KMP_USE_DYNAMIC_LOCK
            __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 );
#else
            __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
#endif
        }
    }
}

template< typename UT >
static void
__kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
{
    typedef typename traits_t< UT >::signed_t    ST;
    dispatch_private_info_template< UT > * pr;

    int gtid = *gtid_ref;
    // int  cid = *cid_ref;
    kmp_info_t *th = __kmp_threads[ gtid ];
    KMP_DEBUG_ASSERT( th -> th.th_dispatch );

    KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
    if ( __kmp_env_consistency_check ) {
        pr = reinterpret_cast< dispatch_private_info_template< UT >* >
            ( th -> th.th_dispatch -> th_dispatch_pr_current );
        if ( pr -> pushed_ws != ct_none ) {
#if KMP_USE_DYNAMIC_LOCK
            __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 );
#else
            __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
#endif
        }
    }

    if ( ! th -> th.th_team -> t.t_serialized ) {
        dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
            ( th -> th.th_dispatch -> th_dispatch_sh_current );
        UT  lower;

        if ( !
__kmp_env_consistency_check ) { 392 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 393 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 394 } 395 lower = pr->u.p.ordered_lower; 396 397 #if ! defined( KMP_GOMP_COMPAT ) 398 if ( __kmp_env_consistency_check ) { 399 if ( pr->ordered_bumped ) { 400 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons; 401 __kmp_error_construct2( 402 kmp_i18n_msg_CnsMultipleNesting, 403 ct_ordered_in_pdo, loc_ref, 404 & p->stack_data[ p->w_top ] 405 ); 406 } 407 } 408 #endif /* !defined(KMP_GOMP_COMPAT) */ 409 410 KMP_MB(); 411 #ifdef KMP_DEBUG 412 { 413 const char * buff; 414 // create format specifiers before the debug output 415 buff = __kmp_str_format( 416 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n", 417 traits_t< UT >::spec, traits_t< UT >::spec ); 418 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 419 __kmp_str_free( &buff ); 420 } 421 #endif 422 423 __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 424 USE_ITT_BUILD_ARG( NULL ) 425 ); 426 KMP_MB(); /* is this necessary? */ 427 #ifdef KMP_DEBUG 428 { 429 const char * buff; 430 // create format specifiers before the debug output 431 buff = __kmp_str_format( 432 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n", 433 traits_t< UT >::spec, traits_t< UT >::spec ); 434 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 435 __kmp_str_free( &buff ); 436 } 437 #endif 438 } 439 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) ); 440 } 441 442 static void 443 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 444 { 445 kmp_info_t *th; 446 447 if ( __kmp_env_consistency_check ) { 448 th = __kmp_threads[*gtid_ref]; 449 if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) { 450 __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref ); 451 } 452 } 453 } 454 455 template< typename UT > 456 static void 457 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 458 { 459 typedef typename traits_t< UT >::signed_t ST; 460 dispatch_private_info_template< UT > * pr; 461 462 int gtid = *gtid_ref; 463 // int cid = *cid_ref; 464 kmp_info_t *th = __kmp_threads[ gtid ]; 465 KMP_DEBUG_ASSERT( th -> th.th_dispatch ); 466 467 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) ); 468 if ( __kmp_env_consistency_check ) { 469 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 470 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 471 if ( pr -> pushed_ws != ct_none ) { 472 __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref ); 473 } 474 } 475 476 if ( ! th -> th.th_team -> t.t_serialized ) { 477 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 478 ( th -> th.th_dispatch -> th_dispatch_sh_current ); 479 480 if ( ! __kmp_env_consistency_check ) { 481 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 482 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 483 } 484 485 KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration ); 486 #if ! defined( KMP_GOMP_COMPAT ) 487 if ( __kmp_env_consistency_check ) { 488 if ( pr->ordered_bumped != 0 ) { 489 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons; 490 /* How to test it? 
 - OM */
                __kmp_error_construct2(
                    kmp_i18n_msg_CnsMultipleNesting,
                    ct_ordered_in_pdo, loc_ref,
                    & p->stack_data[ p->w_top ]
                );
            }
        }
#endif /* !defined(KMP_GOMP_COMPAT) */

        KMP_MB();  /* Flush all pending memory write invalidates. */

        pr->ordered_bumped += 1;

        KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
                        gtid, pr->ordered_bumped ) );

        KMP_MB();  /* Flush all pending memory write invalidates. */

        /* TODO use general release procedure? */
        test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );

        KMP_MB();  /* Flush all pending memory write invalidates. */
    }
    KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
}

/* Computes and returns x to the power of y, where y must be a non-negative integer */
template< typename UT >
static __forceinline long double
__kmp_pow(long double x, UT y) {
    long double s=1.0L;

    KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
    //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
    while(y) {
        if ( y & 1 )
            s *= x;
        x *= x;
        y >>= 1;
    }
    return s;
}

/* Computes and returns the number of unassigned iterations after idx chunks have been assigned
   (the total number of unassigned iterations in chunks with index greater than or equal to idx).
   __forceinline seems to be broken so that if we __forceinline this function, the behavior is wrong
   (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
*/
template< typename T >
static __inline typename traits_t< T >::unsigned_t
__kmp_dispatch_guided_remaining(
    T                                  tc,
    typename traits_t< T >::floating_t base,
    typename traits_t< T >::unsigned_t idx
) {
    /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
       least for ICL 8.1, long double arithmetic may not really have
       long double precision, even with /Qlong_double.  Currently, we
       work around that in the caller code, by manipulating the FPCW for
       Windows* OS on IA-32 architecture.  The lack of precision is not
       expected to be a correctness issue, though.
    */
    typedef typename traits_t< T >::unsigned_t  UT;

    long double x = tc * __kmp_pow< UT >(base, idx);
    UT r = (UT) x;
    if ( x == r )
        return r;
    return r + 1;
}

// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
//   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// by default n = 2. For example with n = 3 the chunk distribution will be flatter.
// With n = 1 first chunk is the same as for static schedule, e.g. trip / nproc.
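// Illustrative sketch (exposition only, never compiled; the demo_* name is hypothetical):
// shows how p2/p3 above shape the guided-iterative chunk sequence computed in
// __kmp_dispatch_init()/__kmp_dispatch_next().
#if 0
static void demo_guided_iterative_chunks(kmp_uint32 trip, kmp_uint32 nproc, kmp_uint32 chunk) {
    // The literals 2 and 0.5 mirror guided_int_param and guided_flt_param defined just below.
    kmp_uint32 p2 = 2 * nproc * (chunk + 1); // point of switching to dynamic
    double     p3 = 0.5 / nproc;             // remaining-iterations multiplier
    kmp_uint32 remaining = trip;
    // e.g. trip = 1000, nproc = 4, chunk = 1: p2 = 16, p3 = 0.125, so the grabs are
    // 125, 109, 95, ... (about remaining/8 each) until fewer than 16 iterations remain.
    while (remaining >= p2) {
        kmp_uint32 grab = (kmp_uint32)(remaining * p3); // size of the next chunk handed out
        remaining -= grab;
    }
    // once remaining < p2 the real code falls back to plain chunk-sized dynamic grabs
}
#endif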
567 static int guided_int_param = 2; 568 static double guided_flt_param = 0.5;// = 1.0 / guided_int_param; 569 570 // UT - unsigned flavor of T, ST - signed flavor of T, 571 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8 572 template< typename T > 573 static void 574 __kmp_dispatch_init( 575 ident_t * loc, 576 int gtid, 577 enum sched_type schedule, 578 T lb, 579 T ub, 580 typename traits_t< T >::signed_t st, 581 typename traits_t< T >::signed_t chunk, 582 int push_ws 583 ) { 584 typedef typename traits_t< T >::unsigned_t UT; 585 typedef typename traits_t< T >::signed_t ST; 586 typedef typename traits_t< T >::floating_t DBL; 587 588 int active; 589 T tc; 590 kmp_info_t * th; 591 kmp_team_t * team; 592 kmp_uint32 my_buffer_index; 593 dispatch_private_info_template< T > * pr; 594 dispatch_shared_info_template< UT > volatile * sh; 595 596 KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) ); 597 KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) ); 598 599 if ( ! TCR_4( __kmp_init_parallel ) ) 600 __kmp_parallel_initialize(); 601 602 #if INCLUDE_SSC_MARKS 603 SSC_MARK_DISPATCH_INIT(); 604 #endif 605 #ifdef KMP_DEBUG 606 { 607 const char * buff; 608 // create format specifiers before the debug output 609 buff = __kmp_str_format( 610 "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n", 611 traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 612 KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) ); 613 __kmp_str_free( &buff ); 614 } 615 #endif 616 /* setup data */ 617 th = __kmp_threads[ gtid ]; 618 team = th -> th.th_team; 619 active = ! team -> t.t_serialized; 620 th->th.th_ident = loc; 621 622 #if USE_ITT_BUILD 623 kmp_uint64 cur_chunk = chunk; 624 int itt_need_metadata_reporting = __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 625 KMP_MASTER_GTID(gtid) && 626 #if OMP_40_ENABLED 627 th->th.th_teams_microtask == NULL && 628 #endif 629 team->t.t_active_level == 1; 630 #endif 631 if ( ! active ) { 632 pr = reinterpret_cast< dispatch_private_info_template< T >* > 633 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */ 634 } else { 635 KMP_DEBUG_ASSERT( th->th.th_dispatch == 636 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 637 638 my_buffer_index = th->th.th_dispatch->th_disp_index ++; 639 640 /* What happens when number of threads changes, need to resize buffer? 
*/ 641 pr = reinterpret_cast< dispatch_private_info_template< T > * > 642 ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % __kmp_dispatch_num_buffers ] ); 643 sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * > 644 ( &team -> t.t_disp_buffer[ my_buffer_index % __kmp_dispatch_num_buffers ] ); 645 } 646 647 #if ( KMP_STATIC_STEAL_ENABLED ) 648 if ( SCHEDULE_HAS_NONMONOTONIC(schedule) ) 649 // AC: we now have only one implementation of stealing, so use it 650 schedule = kmp_sch_static_steal; 651 else 652 #endif 653 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 654 655 /* Pick up the nomerge/ordered bits from the scheduling type */ 656 if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) { 657 pr->nomerge = TRUE; 658 schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); 659 } else { 660 pr->nomerge = FALSE; 661 } 662 pr->type_size = traits_t<T>::type_size; // remember the size of variables 663 if ( kmp_ord_lower & schedule ) { 664 pr->ordered = TRUE; 665 schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); 666 } else { 667 pr->ordered = FALSE; 668 } 669 670 if ( schedule == kmp_sch_static ) { 671 schedule = __kmp_static; 672 } else { 673 if ( schedule == kmp_sch_runtime ) { 674 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified) 675 schedule = team -> t.t_sched.r_sched_type; 676 // Detail the schedule if needed (global controls are differentiated appropriately) 677 if ( schedule == kmp_sch_guided_chunked ) { 678 schedule = __kmp_guided; 679 } else if ( schedule == kmp_sch_static ) { 680 schedule = __kmp_static; 681 } 682 // Use the chunk size specified by OMP_SCHEDULE (or default if not specified) 683 chunk = team -> t.t_sched.chunk; 684 #if USE_ITT_BUILD 685 cur_chunk = chunk; 686 #endif 687 #ifdef KMP_DEBUG 688 { 689 const char * buff; 690 // create format specifiers before the debug output 691 buff = __kmp_str_format( 692 "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n", 693 traits_t< ST >::spec ); 694 KD_TRACE(10, ( buff, gtid, schedule, chunk ) ); 695 __kmp_str_free( &buff ); 696 } 697 #endif 698 } else { 699 if ( schedule == kmp_sch_guided_chunked ) { 700 schedule = __kmp_guided; 701 } 702 if ( chunk <= 0 ) { 703 chunk = KMP_DEFAULT_CHUNK; 704 } 705 } 706 707 if ( schedule == kmp_sch_auto ) { 708 // mapping and differentiation: in the __kmp_do_serial_initialize() 709 schedule = __kmp_auto; 710 #ifdef KMP_DEBUG 711 { 712 const char * buff; 713 // create format specifiers before the debug output 714 buff = __kmp_str_format( 715 "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n", 716 traits_t< ST >::spec ); 717 KD_TRACE(10, ( buff, gtid, schedule, chunk ) ); 718 __kmp_str_free( &buff ); 719 } 720 #endif 721 } 722 723 /* guided analytical not safe for too many threads */ 724 if ( schedule == kmp_sch_guided_analytical_chunked && th->th.th_team_nproc > 1<<20 ) { 725 schedule = kmp_sch_guided_iterative_chunked; 726 KMP_WARNING( DispatchManyThreads ); 727 } 728 pr->u.p.parm1 = chunk; 729 } 730 KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper), 731 "unknown scheduling type" ); 732 733 pr->u.p.count = 0; 734 735 if ( __kmp_env_consistency_check ) { 736 if ( st == 0 ) { 737 __kmp_error_construct( 738 kmp_i18n_msg_CnsLoopIncrZeroProhibited, 739 ( pr->ordered ? 
ct_pdo_ordered : ct_pdo ), loc 740 ); 741 } 742 } 743 // compute trip count 744 if ( st == 1 ) { // most common case 745 if ( ub >= lb ) { 746 tc = ub - lb + 1; 747 } else { // ub < lb 748 tc = 0; // zero-trip 749 } 750 } else if ( st < 0 ) { 751 if ( lb >= ub ) { 752 // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B), 753 // where the division needs to be unsigned regardless of the result type 754 tc = (UT)(lb - ub) / (-st) + 1; 755 } else { // lb < ub 756 tc = 0; // zero-trip 757 } 758 } else { // st > 0 759 if ( ub >= lb ) { 760 // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B), 761 // where the division needs to be unsigned regardless of the result type 762 tc = (UT)(ub - lb) / st + 1; 763 } else { // ub < lb 764 tc = 0; // zero-trip 765 } 766 } 767 768 // Any half-decent optimizer will remove this test when the blocks are empty since the macros expand to nothing 769 // when statistics are disabled. 770 if (schedule == __kmp_static) 771 { 772 KMP_COUNT_BLOCK(OMP_FOR_static); 773 KMP_COUNT_VALUE(FOR_static_iterations, tc); 774 } 775 else 776 { 777 KMP_COUNT_BLOCK(OMP_FOR_dynamic); 778 KMP_COUNT_VALUE(FOR_dynamic_iterations, tc); 779 } 780 781 pr->u.p.lb = lb; 782 pr->u.p.ub = ub; 783 pr->u.p.st = st; 784 pr->u.p.tc = tc; 785 786 #if KMP_OS_WINDOWS 787 pr->u.p.last_upper = ub + st; 788 #endif /* KMP_OS_WINDOWS */ 789 790 /* NOTE: only the active parallel region(s) has active ordered sections */ 791 792 if ( active ) { 793 if ( pr->ordered == 0 ) { 794 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error; 795 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error; 796 } else { 797 pr->ordered_bumped = 0; 798 799 pr->u.p.ordered_lower = 1; 800 pr->u.p.ordered_upper = 0; 801 802 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >; 803 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >; 804 } 805 } 806 807 if ( __kmp_env_consistency_check ) { 808 enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo; 809 if ( push_ws ) { 810 __kmp_push_workshare( gtid, ws, loc ); 811 pr->pushed_ws = ws; 812 } else { 813 __kmp_check_workshare( gtid, ws, loc ); 814 pr->pushed_ws = ct_none; 815 } 816 } 817 818 switch ( schedule ) { 819 #if ( KMP_STATIC_STEAL_ENABLED ) 820 case kmp_sch_static_steal: 821 { 822 T nproc = th->th.th_team_nproc; 823 T ntc, init; 824 825 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) ); 826 827 ntc = (tc % chunk ? 1 : 0) + tc / chunk; 828 if ( nproc > 1 && ntc >= nproc ) { 829 T id = __kmp_tid_from_gtid(gtid); 830 T small_chunk, extras; 831 832 small_chunk = ntc / nproc; 833 extras = ntc % nproc; 834 835 init = id * small_chunk + ( id < extras ? id : extras ); 836 pr->u.p.count = init; 837 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 ); 838 839 pr->u.p.parm2 = lb; 840 //pr->pfields.parm3 = 0; // it's not used in static_steal 841 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid 842 pr->u.p.st = st; 843 if ( traits_t<T>::type_size > 4 ) { 844 // AC: TODO: check if 16-byte CAS available and use it to 845 // improve performance (probably wait for explicit request 846 // before spending time on this). 847 // For now use dynamically allocated per-thread lock, 848 // free memory in __kmp_dispatch_next when status==0. 
849 KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL); 850 th->th.th_dispatch->th_steal_lock = 851 (kmp_lock_t*)__kmp_allocate(sizeof(kmp_lock_t)); 852 __kmp_init_lock(th->th.th_dispatch->th_steal_lock); 853 } 854 break; 855 } else { 856 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n", 857 gtid ) ); 858 schedule = kmp_sch_static_balanced; 859 /* too few iterations: fall-through to kmp_sch_static_balanced */ 860 } // if 861 /* FALL-THROUGH to static balanced */ 862 } // case 863 #endif 864 case kmp_sch_static_balanced: 865 { 866 T nproc = th->th.th_team_nproc; 867 T init, limit; 868 869 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n", 870 gtid ) ); 871 872 if ( nproc > 1 ) { 873 T id = __kmp_tid_from_gtid(gtid); 874 875 if ( tc < nproc ) { 876 if ( id < tc ) { 877 init = id; 878 limit = id; 879 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ 880 } else { 881 pr->u.p.count = 1; /* means no more chunks to execute */ 882 pr->u.p.parm1 = FALSE; 883 break; 884 } 885 } else { 886 T small_chunk = tc / nproc; 887 T extras = tc % nproc; 888 init = id * small_chunk + (id < extras ? id : extras); 889 limit = init + small_chunk - (id < extras ? 0 : 1); 890 pr->u.p.parm1 = (id == nproc - 1); 891 } 892 } else { 893 if ( tc > 0 ) { 894 init = 0; 895 limit = tc - 1; 896 pr->u.p.parm1 = TRUE; 897 } else { 898 // zero trip count 899 pr->u.p.count = 1; /* means no more chunks to execute */ 900 pr->u.p.parm1 = FALSE; 901 break; 902 } 903 } 904 #if USE_ITT_BUILD 905 // Calculate chunk for metadata report 906 if ( itt_need_metadata_reporting ) 907 cur_chunk = limit - init + 1; 908 #endif 909 if ( st == 1 ) { 910 pr->u.p.lb = lb + init; 911 pr->u.p.ub = lb + limit; 912 } else { 913 T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound 914 pr->u.p.lb = lb + init * st; 915 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly 916 if ( st > 0 ) { 917 pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp ); 918 } else { 919 pr->u.p.ub = ( ub_tmp + st < ub ? 
ub : ub_tmp ); 920 } 921 } 922 if ( pr->ordered ) { 923 pr->u.p.ordered_lower = init; 924 pr->u.p.ordered_upper = limit; 925 } 926 break; 927 } // case 928 case kmp_sch_guided_iterative_chunked : 929 { 930 T nproc = th->th.th_team_nproc; 931 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid)); 932 933 if ( nproc > 1 ) { 934 if ( (2L * chunk + 1 ) * nproc >= tc ) { 935 /* chunk size too large, switch to dynamic */ 936 schedule = kmp_sch_dynamic_chunked; 937 } else { 938 // when remaining iters become less than parm2 - switch to dynamic 939 pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 ); 940 *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4 941 } 942 } else { 943 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid)); 944 schedule = kmp_sch_static_greedy; 945 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 946 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid)); 947 pr->u.p.parm1 = tc; 948 } // if 949 } // case 950 break; 951 case kmp_sch_guided_analytical_chunked: 952 { 953 T nproc = th->th.th_team_nproc; 954 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid)); 955 956 if ( nproc > 1 ) { 957 if ( (2L * chunk + 1 ) * nproc >= tc ) { 958 /* chunk size too large, switch to dynamic */ 959 schedule = kmp_sch_dynamic_chunked; 960 } else { 961 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 962 DBL x; 963 964 #if KMP_OS_WINDOWS && KMP_ARCH_X86 965 /* Linux* OS already has 64-bit computation by default for 966 long double, and on Windows* OS on Intel(R) 64, 967 /Qlong_double doesn't work. On Windows* OS 968 on IA-32 architecture, we need to set precision to 969 64-bit instead of the default 53-bit. Even though long 970 double doesn't work on Windows* OS on Intel(R) 64, the 971 resulting lack of precision is not expected to impact 972 the correctness of the algorithm, but this has not been 973 mathematically proven. 
974 */ 975 // save original FPCW and set precision to 64-bit, as 976 // Windows* OS on IA-32 architecture defaults to 53-bit 977 unsigned int oldFpcw = _control87(0,0); 978 _control87(_PC_64,_MCW_PC); // 0,0x30000 979 #endif 980 /* value used for comparison in solver for cross-over point */ 981 long double target = ((long double)chunk * 2 + 1) * nproc / tc; 982 983 /* crossover point--chunk indexes equal to or greater than 984 this point switch to dynamic-style scheduling */ 985 UT cross; 986 987 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 988 x = (long double)1.0 - (long double)0.5 / nproc; 989 990 #ifdef KMP_DEBUG 991 { // test natural alignment 992 struct _test_a { 993 char a; 994 union { 995 char b; 996 DBL d; 997 }; 998 } t; 999 ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; 1000 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment ); 1001 KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 ); 1002 } 1003 #endif // KMP_DEBUG 1004 1005 /* save the term in thread private dispatch structure */ 1006 *(DBL*)&pr->u.p.parm3 = x; 1007 1008 /* solve for the crossover point to the nearest integer i for which C_i <= chunk */ 1009 { 1010 UT left, right, mid; 1011 long double p; 1012 1013 /* estimate initial upper and lower bound */ 1014 1015 /* doesn't matter what value right is as long as it is positive, but 1016 it affects performance of the solver 1017 */ 1018 right = 229; 1019 p = __kmp_pow< UT >(x,right); 1020 if ( p > target ) { 1021 do{ 1022 p *= p; 1023 right <<= 1; 1024 } while(p>target && right < (1<<27)); 1025 left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */ 1026 } else { 1027 left = 0; 1028 } 1029 1030 /* bisection root-finding method */ 1031 while ( left + 1 < right ) { 1032 mid = (left + right) / 2; 1033 if ( __kmp_pow< UT >(x,mid) > target ) { 1034 left = mid; 1035 } else { 1036 right = mid; 1037 } 1038 } // while 1039 cross = right; 1040 } 1041 /* assert sanity of computed crossover point */ 1042 KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target); 1043 1044 /* save the crossover point in thread private dispatch structure */ 1045 pr->u.p.parm2 = cross; 1046 1047 // C75803 1048 #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) ) 1049 #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3) 1050 #else 1051 #define GUIDED_ANALYTICAL_WORKAROUND (x) 1052 #endif 1053 /* dynamic-style scheduling offset */ 1054 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk; 1055 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1056 // restore FPCW 1057 _control87(oldFpcw,_MCW_PC); 1058 #endif 1059 } // if 1060 } else { 1061 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n", 1062 gtid ) ); 1063 schedule = kmp_sch_static_greedy; 1064 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 1065 pr->u.p.parm1 = tc; 1066 } // if 1067 } // case 1068 break; 1069 case kmp_sch_static_greedy: 1070 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid)); 1071 pr->u.p.parm1 = ( th->th.th_team_nproc > 1 ) ? 
1072 ( tc + th->th.th_team_nproc - 1 ) / th->th.th_team_nproc : 1073 tc; 1074 break; 1075 case kmp_sch_static_chunked : 1076 case kmp_sch_dynamic_chunked : 1077 if ( pr->u.p.parm1 <= 0 ) { 1078 pr->u.p.parm1 = KMP_DEFAULT_CHUNK; 1079 } 1080 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid)); 1081 break; 1082 case kmp_sch_trapezoidal : 1083 { 1084 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ 1085 1086 T parm1, parm2, parm3, parm4; 1087 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) ); 1088 1089 parm1 = chunk; 1090 1091 /* F : size of the first cycle */ 1092 parm2 = ( tc / (2 * th->th.th_team_nproc) ); 1093 1094 if ( parm2 < 1 ) { 1095 parm2 = 1; 1096 } 1097 1098 /* L : size of the last cycle. Make sure the last cycle 1099 * is not larger than the first cycle. 1100 */ 1101 if ( parm1 < 1 ) { 1102 parm1 = 1; 1103 } else if ( parm1 > parm2 ) { 1104 parm1 = parm2; 1105 } 1106 1107 /* N : number of cycles */ 1108 parm3 = ( parm2 + parm1 ); 1109 parm3 = ( 2 * tc + parm3 - 1) / parm3; 1110 1111 if ( parm3 < 2 ) { 1112 parm3 = 2; 1113 } 1114 1115 /* sigma : decreasing incr of the trapezoid */ 1116 parm4 = ( parm3 - 1 ); 1117 parm4 = ( parm2 - parm1 ) / parm4; 1118 1119 // pointless check, because parm4 >= 0 always 1120 //if ( parm4 < 0 ) { 1121 // parm4 = 0; 1122 //} 1123 1124 pr->u.p.parm1 = parm1; 1125 pr->u.p.parm2 = parm2; 1126 pr->u.p.parm3 = parm3; 1127 pr->u.p.parm4 = parm4; 1128 } // case 1129 break; 1130 1131 default: 1132 { 1133 __kmp_msg( 1134 kmp_ms_fatal, // Severity 1135 KMP_MSG( UnknownSchedTypeDetected ), // Primary message 1136 KMP_HNT( GetNewerLibrary ), // Hint 1137 __kmp_msg_null // Variadic argument list terminator 1138 ); 1139 } 1140 break; 1141 } // switch 1142 pr->schedule = schedule; 1143 if ( active ) { 1144 /* The name of this buffer should be my_buffer_index when it's free to use it */ 1145 1146 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n", 1147 gtid, my_buffer_index, sh->buffer_index) ); 1148 __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 > 1149 USE_ITT_BUILD_ARG( NULL ) 1150 ); 1151 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are 1152 // *always* 32-bit integers. 1153 KMP_MB(); /* is this necessary? */ 1154 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n", 1155 gtid, my_buffer_index, sh->buffer_index) ); 1156 1157 th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr; 1158 th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh; 1159 #if USE_ITT_BUILD 1160 if ( pr->ordered ) { 1161 __kmp_itt_ordered_init( gtid ); 1162 }; // if 1163 // Report loop metadata 1164 if ( itt_need_metadata_reporting ) { 1165 // Only report metadata by master of active team at level 1 1166 kmp_uint64 schedtype = 0; 1167 switch ( schedule ) { 1168 case kmp_sch_static_chunked: 1169 case kmp_sch_static_balanced:// Chunk is calculated in the switch above 1170 break; 1171 case kmp_sch_static_greedy: 1172 cur_chunk = pr->u.p.parm1; 1173 break; 1174 case kmp_sch_dynamic_chunked: 1175 schedtype = 1; 1176 break; 1177 case kmp_sch_guided_iterative_chunked: 1178 case kmp_sch_guided_analytical_chunked: 1179 schedtype = 2; 1180 break; 1181 default: 1182 // Should we put this case under "static"? 
1183 // case kmp_sch_static_steal: 1184 schedtype = 3; 1185 break; 1186 } 1187 __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk); 1188 } 1189 #endif /* USE_ITT_BUILD */ 1190 }; // if 1191 1192 #ifdef KMP_DEBUG 1193 { 1194 const char * buff; 1195 // create format specifiers before the debug output 1196 buff = __kmp_str_format( 1197 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \ 1198 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \ 1199 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n", 1200 traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec, 1201 traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec, 1202 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec, 1203 traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec ); 1204 KD_TRACE(10, ( buff, 1205 gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub, 1206 pr->u.p.st, pr->u.p.tc, pr->u.p.count, 1207 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1, 1208 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) ); 1209 __kmp_str_free( &buff ); 1210 } 1211 #endif 1212 #if ( KMP_STATIC_STEAL_ENABLED ) 1213 // It cannot be guaranteed that after execution of a loop with some other schedule kind 1214 // all the parm3 variables will contain the same value. 1215 // Even if all parm3 will be the same, it still exists a bad case like using 0 and 1 1216 // rather than program life-time increment. 1217 // So the dedicated variable is required. The 'static_steal_counter' is used. 1218 if( schedule == kmp_sch_static_steal ) { 1219 // Other threads will inspect this variable when searching for a victim. 1220 // This is a flag showing that other threads may steal from this thread since then. 1221 volatile T * p = &pr->u.p.static_steal_counter; 1222 *p = *p + 1; 1223 } 1224 #endif // ( KMP_STATIC_STEAL_ENABLED ) 1225 1226 #if OMPT_SUPPORT && OMPT_TRACE 1227 if (ompt_enabled && 1228 ompt_callbacks.ompt_callback(ompt_event_loop_begin)) { 1229 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 1230 ompt_task_info_t *task_info = __ompt_get_taskinfo(0); 1231 ompt_callbacks.ompt_callback(ompt_event_loop_begin)( 1232 team_info->parallel_id, task_info->task_id, team_info->microtask); 1233 } 1234 #endif 1235 } 1236 1237 /* 1238 * For ordered loops, either __kmp_dispatch_finish() should be called after 1239 * every iteration, or __kmp_dispatch_finish_chunk() should be called after 1240 * every chunk of iterations. If the ordered section(s) were not executed 1241 * for this iteration (or every iteration in this chunk), we need to set the 1242 * ordered iteration counters so that the next thread can proceed. 1243 */ 1244 template< typename UT > 1245 static void 1246 __kmp_dispatch_finish( int gtid, ident_t *loc ) 1247 { 1248 typedef typename traits_t< UT >::signed_t ST; 1249 kmp_info_t *th = __kmp_threads[ gtid ]; 1250 1251 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) ); 1252 if ( ! 
th -> th.th_team -> t.t_serialized ) { 1253 1254 dispatch_private_info_template< UT > * pr = 1255 reinterpret_cast< dispatch_private_info_template< UT >* > 1256 ( th->th.th_dispatch->th_dispatch_pr_current ); 1257 dispatch_shared_info_template< UT > volatile * sh = 1258 reinterpret_cast< dispatch_shared_info_template< UT >volatile* > 1259 ( th->th.th_dispatch->th_dispatch_sh_current ); 1260 KMP_DEBUG_ASSERT( pr ); 1261 KMP_DEBUG_ASSERT( sh ); 1262 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1263 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1264 1265 if ( pr->ordered_bumped ) { 1266 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1267 gtid ) ); 1268 pr->ordered_bumped = 0; 1269 } else { 1270 UT lower = pr->u.p.ordered_lower; 1271 1272 #ifdef KMP_DEBUG 1273 { 1274 const char * buff; 1275 // create format specifiers before the debug output 1276 buff = __kmp_str_format( 1277 "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n", 1278 traits_t< UT >::spec, traits_t< UT >::spec ); 1279 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 1280 __kmp_str_free( &buff ); 1281 } 1282 #endif 1283 1284 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 1285 USE_ITT_BUILD_ARG(NULL) 1286 ); 1287 KMP_MB(); /* is this necessary? */ 1288 #ifdef KMP_DEBUG 1289 { 1290 const char * buff; 1291 // create format specifiers before the debug output 1292 buff = __kmp_str_format( 1293 "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n", 1294 traits_t< UT >::spec, traits_t< UT >::spec ); 1295 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 1296 __kmp_str_free( &buff ); 1297 } 1298 #endif 1299 1300 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration ); 1301 } // if 1302 } // if 1303 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) ); 1304 } 1305 1306 #ifdef KMP_GOMP_COMPAT 1307 1308 template< typename UT > 1309 static void 1310 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc ) 1311 { 1312 typedef typename traits_t< UT >::signed_t ST; 1313 kmp_info_t *th = __kmp_threads[ gtid ]; 1314 1315 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) ); 1316 if ( ! 
th -> th.th_team -> t.t_serialized ) { 1317 // int cid; 1318 dispatch_private_info_template< UT > * pr = 1319 reinterpret_cast< dispatch_private_info_template< UT >* > 1320 ( th->th.th_dispatch->th_dispatch_pr_current ); 1321 dispatch_shared_info_template< UT > volatile * sh = 1322 reinterpret_cast< dispatch_shared_info_template< UT >volatile* > 1323 ( th->th.th_dispatch->th_dispatch_sh_current ); 1324 KMP_DEBUG_ASSERT( pr ); 1325 KMP_DEBUG_ASSERT( sh ); 1326 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1327 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1328 1329 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) { 1330 UT lower = pr->u.p.ordered_lower; 1331 UT upper = pr->u.p.ordered_upper; 1332 UT inc = upper - lower + 1; 1333 1334 if ( pr->ordered_bumped == inc ) { 1335 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1336 gtid ) ); 1337 pr->ordered_bumped = 0; 1338 } else { 1339 inc -= pr->ordered_bumped; 1340 1341 #ifdef KMP_DEBUG 1342 { 1343 const char * buff; 1344 // create format specifiers before the debug output 1345 buff = __kmp_str_format( 1346 "__kmp_dispatch_finish_chunk: T#%%d before wait: " \ 1347 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n", 1348 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec ); 1349 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) ); 1350 __kmp_str_free( &buff ); 1351 } 1352 #endif 1353 1354 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 1355 USE_ITT_BUILD_ARG(NULL) 1356 ); 1357 1358 KMP_MB(); /* is this necessary? */ 1359 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n", 1360 gtid ) ); 1361 pr->ordered_bumped = 0; 1362 //!!!!! TODO check if the inc should be unsigned, or signed??? 1363 #ifdef KMP_DEBUG 1364 { 1365 const char * buff; 1366 // create format specifiers before the debug output 1367 buff = __kmp_str_format( 1368 "__kmp_dispatch_finish_chunk: T#%%d after wait: " \ 1369 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n", 1370 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec ); 1371 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) ); 1372 __kmp_str_free( &buff ); 1373 } 1374 #endif 1375 1376 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc); 1377 } 1378 // } 1379 } 1380 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) ); 1381 } 1382 1383 #endif /* KMP_GOMP_COMPAT */ 1384 1385 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 1386 * (no more work), then tell OMPT the loop is over. In some cases 1387 * kmp_dispatch_fini() is not called. 
 */
#if OMPT_SUPPORT && OMPT_TRACE
#define OMPT_LOOP_END                                                        \
    if (status == 0) {                                                       \
        if (ompt_enabled &&                                                  \
            ompt_callbacks.ompt_callback(ompt_event_loop_end)) {             \
            ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);      \
            ompt_task_info_t *task_info = __ompt_get_taskinfo(0);            \
            ompt_callbacks.ompt_callback(ompt_event_loop_end)(               \
                team_info->parallel_id, task_info->task_id);                 \
        }                                                                    \
    }
#else
#define OMPT_LOOP_END // no-op
#endif

template< typename T >
static int
__kmp_dispatch_next(
    ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
) {

    typedef typename traits_t< T >::unsigned_t  UT;
    typedef typename traits_t< T >::signed_t    ST;
    typedef typename traits_t< T >::floating_t  DBL;

    // This is potentially slightly misleading, schedule(runtime) will appear here even if the actual runtime schedule
    // is static. (Which points out a disadvantage of schedule(runtime): even when static scheduling is used it costs
    // more than a compile time choice to use static scheduling would.)
    KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling);

    int status;
    dispatch_private_info_template< T > * pr;
    kmp_info_t * th   = __kmp_threads[ gtid ];
    kmp_team_t * team = th -> th.th_team;

    KMP_DEBUG_ASSERT( p_lb && p_ub && p_st ); // AC: these cannot be NULL
#ifdef KMP_DEBUG
    {
        const char * buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
            traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
        KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ?
*p_st : 0, p_last ) ); 1432 __kmp_str_free( &buff ); 1433 } 1434 #endif 1435 1436 if ( team -> t.t_serialized ) { 1437 /* NOTE: serialize this dispatch becase we are not at the active level */ 1438 pr = reinterpret_cast< dispatch_private_info_template< T >* > 1439 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */ 1440 KMP_DEBUG_ASSERT( pr ); 1441 1442 if ( (status = (pr->u.p.tc != 0)) == 0 ) { 1443 *p_lb = 0; 1444 *p_ub = 0; 1445 // if ( p_last != NULL ) 1446 // *p_last = 0; 1447 if ( p_st != NULL ) 1448 *p_st = 0; 1449 if ( __kmp_env_consistency_check ) { 1450 if ( pr->pushed_ws != ct_none ) { 1451 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 1452 } 1453 } 1454 } else if ( pr->nomerge ) { 1455 kmp_int32 last; 1456 T start; 1457 UT limit, trip, init; 1458 ST incr; 1459 T chunk = pr->u.p.parm1; 1460 1461 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) ); 1462 1463 init = chunk * pr->u.p.count++; 1464 trip = pr->u.p.tc - 1; 1465 1466 if ( (status = (init <= trip)) == 0 ) { 1467 *p_lb = 0; 1468 *p_ub = 0; 1469 // if ( p_last != NULL ) 1470 // *p_last = 0; 1471 if ( p_st != NULL ) 1472 *p_st = 0; 1473 if ( __kmp_env_consistency_check ) { 1474 if ( pr->pushed_ws != ct_none ) { 1475 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 1476 } 1477 } 1478 } else { 1479 start = pr->u.p.lb; 1480 limit = chunk + init - 1; 1481 incr = pr->u.p.st; 1482 1483 if ( (last = (limit >= trip)) != 0 ) { 1484 limit = trip; 1485 #if KMP_OS_WINDOWS 1486 pr->u.p.last_upper = pr->u.p.ub; 1487 #endif /* KMP_OS_WINDOWS */ 1488 } 1489 if ( p_last != NULL ) 1490 *p_last = last; 1491 if ( p_st != NULL ) 1492 *p_st = incr; 1493 if ( incr == 1 ) { 1494 *p_lb = start + init; 1495 *p_ub = start + limit; 1496 } else { 1497 *p_lb = start + init * incr; 1498 *p_ub = start + limit * incr; 1499 } 1500 1501 if ( pr->ordered ) { 1502 pr->u.p.ordered_lower = init; 1503 pr->u.p.ordered_upper = limit; 1504 #ifdef KMP_DEBUG 1505 { 1506 const char * buff; 1507 // create format specifiers before the debug output 1508 buff = __kmp_str_format( 1509 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1510 traits_t< UT >::spec, traits_t< UT >::spec ); 1511 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1512 __kmp_str_free( &buff ); 1513 } 1514 #endif 1515 } // if 1516 } // if 1517 } else { 1518 pr->u.p.tc = 0; 1519 *p_lb = pr->u.p.lb; 1520 *p_ub = pr->u.p.ub; 1521 #if KMP_OS_WINDOWS 1522 pr->u.p.last_upper = *p_ub; 1523 #endif /* KMP_OS_WINDOWS */ 1524 if ( p_last != NULL ) 1525 *p_last = TRUE; 1526 if ( p_st != NULL ) 1527 *p_st = pr->u.p.st; 1528 } // if 1529 #ifdef KMP_DEBUG 1530 { 1531 const char * buff; 1532 // create format specifiers before the debug output 1533 buff = __kmp_str_format( 1534 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \ 1535 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", 1536 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 1537 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) ); 1538 __kmp_str_free( &buff ); 1539 } 1540 #endif 1541 #if INCLUDE_SSC_MARKS 1542 SSC_MARK_DISPATCH_NEXT(); 1543 #endif 1544 OMPT_LOOP_END; 1545 return status; 1546 } else { 1547 kmp_int32 last = 0; 1548 dispatch_shared_info_template< UT > *sh; 1549 T start; 1550 ST incr; 1551 UT limit, trip, init; 1552 1553 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1554 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1555 1556 pr = reinterpret_cast< 
 dispatch_private_info_template< T >* >
            ( th->th.th_dispatch->th_dispatch_pr_current );
        KMP_DEBUG_ASSERT( pr );
        sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
            ( th->th.th_dispatch->th_dispatch_sh_current );
        KMP_DEBUG_ASSERT( sh );

        if ( pr->u.p.tc == 0 ) {
            // zero trip count
            status = 0;
        } else {
            switch (pr->schedule) {
#if ( KMP_STATIC_STEAL_ENABLED )
            case kmp_sch_static_steal:
                {
                    T chunk = pr->u.p.parm1;
                    int nproc = th->th.th_team_nproc;

                    KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );

                    trip = pr->u.p.tc - 1;

                    if ( traits_t<T>::type_size > 4 ) {
                        // use lock for 8-byte and CAS for 4-byte induction
                        // variable. TODO (optional): check and use 16-byte CAS
                        kmp_lock_t * lck = th->th.th_dispatch->th_steal_lock;
                        KMP_DEBUG_ASSERT(lck != NULL);
                        if( pr->u.p.count < (UT)pr->u.p.ub ) {
                            __kmp_acquire_lock(lck, gtid);
                            // try to get own chunk of iterations
                            init = ( pr->u.p.count )++;
                            status = ( init < (UT)pr->u.p.ub );
                            __kmp_release_lock(lck, gtid);
                        } else {
                            status = 0; // no own chunks
                        }
                        if( !status ) { // try to steal
                            kmp_info_t **other_threads = team->t.t_threads;
                            int while_limit = nproc; // nproc attempts to find a victim
                            int while_index = 0;
                            // TODO: algorithm of searching for a victim
                            // should be cleaned up and measured
                            while ( ( !status ) && ( while_limit != ++while_index ) ) {
                                T remaining;
                                T victimIdx    = pr->u.p.parm4;
                                T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
                                dispatch_private_info_template< T > * victim =
                                    reinterpret_cast< dispatch_private_info_template< T >* >
                                    (other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current);
                                while( ( victim == NULL || victim == pr ||
                                    ( *(volatile T*)&victim->u.p.static_steal_counter !=
                                      *(volatile T*)&pr->u.p.static_steal_counter ) ) &&
                                    oldVictimIdx != victimIdx )
                                {
                                    victimIdx = (victimIdx + 1) % nproc;
                                    victim = reinterpret_cast< dispatch_private_info_template< T >* >
                                        (other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current);
                                };
                                if( !victim ||
                                    ( *(volatile T *)&victim->u.p.static_steal_counter !=
                                      *(volatile T *)&pr->u.p.static_steal_counter ) )
                                {
                                    continue; // try once more (nproc attempts in total)
                                    // no victim is ready yet to participate in stealing
                                    // because all victims are still in kmp_init_dispatch
                                }
                                if( victim->u.p.count + 2 > (UT)victim->u.p.ub ) {
                                    pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
                                    continue; // not enough chunks to steal, goto next victim
                                }

                                lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
                                KMP_ASSERT(lck != NULL);
                                __kmp_acquire_lock(lck, gtid);
                                limit = victim->u.p.ub; // keep initial ub
                                if( victim->u.p.count >= limit ||
                                    (remaining = limit - victim->u.p.count) < 2 )
                                {
                                    __kmp_release_lock(lck, gtid);
                                    pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
                                    continue; // not enough chunks to steal
                                }
                                // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or by 1
                                if( remaining > 3 ) {
                                    init = ( victim->u.p.ub -= (remaining>>2) ); // steal 1/4 of remaining
                                } else {
                                    init = ( victim->u.p.ub -= 1 ); // steal 1 chunk of 2 or 3 remaining
                                }
                                __kmp_release_lock(lck, gtid);

                                KMP_DEBUG_ASSERT(init + 1 <= limit);
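                                // The thief now owns chunk indices [init, limit): chunk 'init' is
                                // returned from this call, and the count/ub recorded below make
                                // later calls hand out init+1 .. limit-1.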
pr->u.p.parm4 = victimIdx; // remember victim to steal from 1648 status = 1; 1649 while_index = 0; 1650 // now update own count and ub with stolen range but init chunk 1651 __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid); 1652 pr->u.p.count = init + 1; 1653 pr->u.p.ub = limit; 1654 __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid); 1655 } // while (search for victim) 1656 } // if (try to find victim and steal) 1657 } else { 1658 // 4-byte induction variable, use 8-byte CAS for pair (count, ub) 1659 typedef union { 1660 struct { 1661 UT count; 1662 T ub; 1663 } p; 1664 kmp_int64 b; 1665 } union_i4; 1666 // All operations on 'count' or 'ub' must be combined atomically together. 1667 { 1668 union_i4 vold, vnew; 1669 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count); 1670 vnew = vold; 1671 vnew.p.count++; 1672 while( ! KMP_COMPARE_AND_STORE_ACQ64( 1673 ( volatile kmp_int64* )&pr->u.p.count, 1674 *VOLATILE_CAST(kmp_int64 *)&vold.b, 1675 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) { 1676 KMP_CPU_PAUSE(); 1677 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count); 1678 vnew = vold; 1679 vnew.p.count++; 1680 } 1681 vnew = vold; 1682 init = vnew.p.count; 1683 status = ( init < (UT)vnew.p.ub ) ; 1684 } 1685 1686 if( !status ) { 1687 kmp_info_t **other_threads = team->t.t_threads; 1688 int while_limit = nproc; // nproc attempts to find a victim 1689 int while_index = 0; 1690 1691 // TODO: algorithm of searching for a victim 1692 // should be cleaned up and measured 1693 while ( ( !status ) && ( while_limit != ++while_index ) ) { 1694 union_i4 vold, vnew; 1695 kmp_int32 remaining; 1696 T victimIdx = pr->u.p.parm4; 1697 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1; 1698 dispatch_private_info_template< T > * victim = 1699 reinterpret_cast< dispatch_private_info_template< T >* > 1700 (other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current); 1701 while( (victim == NULL || victim == pr || 1702 (*(volatile T*)&victim->u.p.static_steal_counter != 1703 *(volatile T*)&pr->u.p.static_steal_counter)) && 1704 oldVictimIdx != victimIdx ) 1705 { 1706 victimIdx = (victimIdx + 1) % nproc; 1707 victim = reinterpret_cast< dispatch_private_info_template< T >* > 1708 ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current ); 1709 }; 1710 if( !victim || 1711 ( *(volatile T *)&victim->u.p.static_steal_counter != 1712 *(volatile T *)&pr->u.p.static_steal_counter ) ) 1713 { 1714 continue; // try once more (nproc attempts in total) 1715 // no victim is ready yet to participate in stealing 1716 // because all victims are still in kmp_init_dispatch 1717 } 1718 pr->u.p.parm4 = victimIdx; // new victim found 1719 while( 1 ) { // CAS loop if victim has enough chunks to steal 1720 vold.b = *( volatile kmp_int64 * )( &victim->u.p.count ); 1721 vnew = vold; 1722 1723 KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip ); 1724 if ( vnew.p.count >= (UT)vnew.p.ub || 1725 (remaining = vnew.p.ub - vnew.p.count) < 2 ) 1726 { 1727 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id 1728 break; // not enough chunks to steal, goto next victim 1729 } 1730 if( remaining > 3 ) { 1731 vnew.p.ub -= (remaining>>2); // try to steal 1/4 of remaining 1732 } else { 1733 vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining 1734 } 1735 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1736 // TODO: Should this be acquire or release? 
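                                    // On success the CAS below swaps the victim's packed (count, ub)
                                    // pair atomically, so the thief takes over chunk indices
                                    // [vnew.p.ub, vold.p.ub); its own count/ub are updated just after.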
                                if ( KMP_COMPARE_AND_STORE_ACQ64(
                                        ( volatile kmp_int64 * )&victim->u.p.count,
                                        *VOLATILE_CAST(kmp_int64 *)&vold.b,
                                        *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
                                    // stealing succeeded
                                    status = 1;
                                    while_index = 0;
                                    // now update own count and ub
                                    init = vnew.p.ub;
                                    vold.p.count = init + 1;
#if KMP_ARCH_X86
                                    KMP_XCHG_FIXED64(( volatile kmp_int64 * )(&pr->u.p.count), vold.b);
#else
                                    *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
#endif
                                    break;
                                } // if (check CAS result)
                                KMP_CPU_PAUSE(); // CAS failed, repeat attempt
                            } // while (try to steal from particular victim)
                        } // while (search for victim)
                    } // if (try to find victim and steal)
                } // if (4-byte induction variable)
                if ( !status ) {
                    *p_lb = 0;
                    *p_ub = 0;
                    if ( p_st != NULL ) *p_st = 0;
                } else {
                    start = pr->u.p.parm2;
                    init *= chunk;
                    limit = chunk + init - 1;
                    incr = pr->u.p.st;

                    KMP_DEBUG_ASSERT(init <= trip);
                    if ( (last = (limit >= trip)) != 0 )
                        limit = trip;
                    if ( p_st != NULL ) *p_st = incr;

                    if ( incr == 1 ) {
                        *p_lb = start + init;
                        *p_ub = start + limit;
                    } else {
                        *p_lb = start + init * incr;
                        *p_ub = start + limit * incr;
                    }

                    if ( pr->ordered ) {
                        pr->u.p.ordered_lower = init;
                        pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
                        {
                            const char * buff;
                            // create format specifiers before the debug output
                            buff = __kmp_str_format(
                                "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                                traits_t< UT >::spec, traits_t< UT >::spec );
                            KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                            __kmp_str_free( &buff );
                        }
#endif
                    } // if
                } // if
                break;
            } // case
#endif // ( KMP_STATIC_STEAL_ENABLED )
        case kmp_sch_static_balanced:
            {
                KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
                if ( (status = !pr->u.p.count) != 0 ) {  /* check if thread has any iteration to do */
                    pr->u.p.count = 1;
                    *p_lb = pr->u.p.lb;
                    *p_ub = pr->u.p.ub;
                    last = pr->u.p.parm1;
                    if ( p_st != NULL )
                        *p_st = pr->u.p.st;
                } else {  /* no iterations to do */
                    pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
                }
                if ( pr->ordered ) {
#ifdef KMP_DEBUG
                    {
                        const char * buff;
                        // create format specifiers before the debug output
                        buff = __kmp_str_format(
                            "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                            traits_t< UT >::spec, traits_t< UT >::spec );
                        KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                        __kmp_str_free( &buff );
                    }
#endif
                } // if
            } // case
            break;
        case kmp_sch_static_greedy:  /* original code for kmp_sch_static_greedy was merged here */
        case kmp_sch_static_chunked:
            {
                T parm1;

                KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
                               gtid ) );
                parm1 = pr->u.p.parm1;

                trip = pr->u.p.tc - 1;
                init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));

                if ( (status = (init <= trip)) != 0 ) {
                    start = pr->u.p.lb;
                    incr = pr->u.p.st;
                    limit = parm1 + init - 1;

                    if ( (last = (limit >= trip)) != 0 )
                        limit = trip;

                    if ( p_st != NULL ) *p_st = incr;

                    pr->u.p.count += th->th.th_team_nproc;

                    if ( incr == 1 ) {
                        *p_lb = start + init;
                        *p_ub = start + limit;
                    }
                    else {
                        *p_lb = start + init * incr;
                        *p_ub = start + limit * incr;
                    }

                    if ( pr->ordered ) {
                        pr->u.p.ordered_lower = init;
                        pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
                        {
                            const char * buff;
                            // create format specifiers before the debug output
                            buff = __kmp_str_format(
                                "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                                traits_t< UT >::spec, traits_t< UT >::spec );
                            KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                            __kmp_str_free( &buff );
                        }
#endif
                    } // if
                } // if
            } // case
            break;

        case kmp_sch_dynamic_chunked:
            {
                T chunk = pr->u.p.parm1;

                KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
                               gtid ) );

                init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
                trip = pr->u.p.tc - 1;

                if ( (status = (init <= trip)) == 0 ) {
                    *p_lb = 0;
                    *p_ub = 0;
                    if ( p_st != NULL ) *p_st = 0;
                } else {
                    start = pr->u.p.lb;
                    limit = chunk + init - 1;
                    incr = pr->u.p.st;

                    if ( (last = (limit >= trip)) != 0 )
                        limit = trip;

                    if ( p_st != NULL ) *p_st = incr;

                    if ( incr == 1 ) {
                        *p_lb = start + init;
                        *p_ub = start + limit;
                    } else {
                        *p_lb = start + init * incr;
                        *p_ub = start + limit * incr;
                    }

                    if ( pr->ordered ) {
                        pr->u.p.ordered_lower = init;
                        pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
                        {
                            const char * buff;
                            // create format specifiers before the debug output
                            buff = __kmp_str_format(
                                "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                                traits_t< UT >::spec, traits_t< UT >::spec );
                            KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                            __kmp_str_free( &buff );
                        }
#endif
                    } // if
                } // if
            } // case
            break;

        case kmp_sch_guided_iterative_chunked:
            {
                T chunkspec = pr->u.p.parm1;
                KD_TRACE(100,
                    ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
                trip = pr->u.p.tc;
                // Start atomic part of calculations
                while(1) {
                    ST remaining;             // signed, because can be < 0
                    init = sh->u.s.iteration; // shared value
                    remaining = trip - init;
                    if ( remaining <= 0 ) {   // AC: need to compare with 0 first
                        // nothing to do, don't try atomic op
                        status = 0;
                        break;
                    }
                    if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
                        // use dynamic-style schedule
                        // atomically increment iterations, get old value
                        init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
                        remaining = trip - init;
                        if (remaining <= 0) {
                            status = 0; // all iterations were taken by other threads
                        } else {
                            // got some iterations to work on
                            status = 1;
                            if ( (T)remaining > chunkspec ) {
                                limit = init + chunkspec - 1;
                            } else {
                                last = 1;   // the last chunk
                                limit = init + remaining - 1;
                            } // if
                        } // if
                        break;
                    } // if
                    limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
                    if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
                        // CAS was successful, chunk obtained
                        status = 1;
                        --limit;
                        break;
                    } // if
                } // while
                if ( status != 0 ) {
                    start = pr->u.p.lb;
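                    // [init, limit] is the closed range of logical iteration indices obtained
                    // above; it is translated back into user-space loop bounds below as
                    // lb + index * st.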
incr = pr->u.p.st; 1978 if ( p_st != NULL ) 1979 *p_st = incr; 1980 *p_lb = start + init * incr; 1981 *p_ub = start + limit * incr; 1982 if ( pr->ordered ) { 1983 pr->u.p.ordered_lower = init; 1984 pr->u.p.ordered_upper = limit; 1985 #ifdef KMP_DEBUG 1986 { 1987 const char * buff; 1988 // create format specifiers before the debug output 1989 buff = __kmp_str_format( 1990 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1991 traits_t< UT >::spec, traits_t< UT >::spec ); 1992 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1993 __kmp_str_free( &buff ); 1994 } 1995 #endif 1996 } // if 1997 } else { 1998 *p_lb = 0; 1999 *p_ub = 0; 2000 if ( p_st != NULL ) 2001 *p_st = 0; 2002 } // if 2003 } // case 2004 break; 2005 2006 case kmp_sch_guided_analytical_chunked: 2007 { 2008 T chunkspec = pr->u.p.parm1; 2009 UT chunkIdx; 2010 #if KMP_OS_WINDOWS && KMP_ARCH_X86 2011 /* for storing original FPCW value for Windows* OS on 2012 IA-32 architecture 8-byte version */ 2013 unsigned int oldFpcw; 2014 unsigned int fpcwSet = 0; 2015 #endif 2016 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n", 2017 gtid ) ); 2018 2019 trip = pr->u.p.tc; 2020 2021 KMP_DEBUG_ASSERT(th->th.th_team_nproc > 1); 2022 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)th->th.th_team_nproc < trip); 2023 2024 while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */ 2025 chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration ); 2026 if ( chunkIdx >= (UT)pr->u.p.parm2 ) { 2027 --trip; 2028 /* use dynamic-style scheduling */ 2029 init = chunkIdx * chunkspec + pr->u.p.count; 2030 /* need to verify init > 0 in case of overflow in the above calculation */ 2031 if ( (status = (init > 0 && init <= trip)) != 0 ) { 2032 limit = init + chunkspec -1; 2033 2034 if ( (last = (limit >= trip)) != 0 ) 2035 limit = trip; 2036 } 2037 break; 2038 } else { 2039 /* use exponential-style scheduling */ 2040 /* The following check is to workaround the lack of long double precision on Windows* OS. 2041 This check works around the possible effect that init != 0 for chunkIdx == 0. 
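       That is, the chunkIdx == 0 case is special-cased below (init is simply set to 0)
       so that rounding in the long double computation cannot produce a non-zero
       starting offset for the very first chunk.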
2042 */ 2043 #if KMP_OS_WINDOWS && KMP_ARCH_X86 2044 /* If we haven't already done so, save original 2045 FPCW and set precision to 64-bit, as Windows* OS 2046 on IA-32 architecture defaults to 53-bit */ 2047 if ( !fpcwSet ) { 2048 oldFpcw = _control87(0,0); 2049 _control87(_PC_64,_MCW_PC); 2050 fpcwSet = 0x30000; 2051 } 2052 #endif 2053 if ( chunkIdx ) { 2054 init = __kmp_dispatch_guided_remaining< T >( 2055 trip, *( DBL * )&pr->u.p.parm3, chunkIdx ); 2056 KMP_DEBUG_ASSERT(init); 2057 init = trip - init; 2058 } else 2059 init = 0; 2060 limit = trip - __kmp_dispatch_guided_remaining< T >( 2061 trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 ); 2062 KMP_ASSERT(init <= limit); 2063 if ( init < limit ) { 2064 KMP_DEBUG_ASSERT(limit <= trip); 2065 --limit; 2066 status = 1; 2067 break; 2068 } // if 2069 } // if 2070 } // while (1) 2071 #if KMP_OS_WINDOWS && KMP_ARCH_X86 2072 /* restore FPCW if necessary 2073 AC: check fpcwSet flag first because oldFpcw can be uninitialized here 2074 */ 2075 if ( fpcwSet && ( oldFpcw & fpcwSet ) ) 2076 _control87(oldFpcw,_MCW_PC); 2077 #endif 2078 if ( status != 0 ) { 2079 start = pr->u.p.lb; 2080 incr = pr->u.p.st; 2081 if ( p_st != NULL ) 2082 *p_st = incr; 2083 *p_lb = start + init * incr; 2084 *p_ub = start + limit * incr; 2085 if ( pr->ordered ) { 2086 pr->u.p.ordered_lower = init; 2087 pr->u.p.ordered_upper = limit; 2088 #ifdef KMP_DEBUG 2089 { 2090 const char * buff; 2091 // create format specifiers before the debug output 2092 buff = __kmp_str_format( 2093 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 2094 traits_t< UT >::spec, traits_t< UT >::spec ); 2095 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 2096 __kmp_str_free( &buff ); 2097 } 2098 #endif 2099 } 2100 } else { 2101 *p_lb = 0; 2102 *p_ub = 0; 2103 if ( p_st != NULL ) 2104 *p_st = 0; 2105 } 2106 } // case 2107 break; 2108 2109 case kmp_sch_trapezoidal: 2110 { 2111 UT index; 2112 T parm2 = pr->u.p.parm2; 2113 T parm3 = pr->u.p.parm3; 2114 T parm4 = pr->u.p.parm4; 2115 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n", 2116 gtid ) ); 2117 2118 index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration ); 2119 2120 init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2; 2121 trip = pr->u.p.tc - 1; 2122 2123 if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) { 2124 *p_lb = 0; 2125 *p_ub = 0; 2126 if ( p_st != NULL ) *p_st = 0; 2127 } else { 2128 start = pr->u.p.lb; 2129 limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1; 2130 incr = pr->u.p.st; 2131 2132 if ( (last = (limit >= trip)) != 0 ) 2133 limit = trip; 2134 2135 if ( p_st != NULL ) *p_st = incr; 2136 2137 if ( incr == 1 ) { 2138 *p_lb = start + init; 2139 *p_ub = start + limit; 2140 } else { 2141 *p_lb = start + init * incr; 2142 *p_ub = start + limit * incr; 2143 } 2144 2145 if ( pr->ordered ) { 2146 pr->u.p.ordered_lower = init; 2147 pr->u.p.ordered_upper = limit; 2148 #ifdef KMP_DEBUG 2149 { 2150 const char * buff; 2151 // create format specifiers before the debug output 2152 buff = __kmp_str_format( 2153 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 2154 traits_t< UT >::spec, traits_t< UT >::spec ); 2155 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 2156 __kmp_str_free( &buff ); 2157 } 2158 #endif 2159 } // if 2160 } // if 2161 } // case 2162 break; 2163 default: 2164 { 2165 status = 0; // to avoid complaints on uninitialized variable use 2166 __kmp_msg( 2167 kmp_ms_fatal, // Severity 2168 
KMP_MSG( UnknownSchedTypeDetected ), // Primary message 2169 KMP_HNT( GetNewerLibrary ), // Hint 2170 __kmp_msg_null // Variadic argument list terminator 2171 ); 2172 } 2173 break; 2174 } // switch 2175 } // if tc == 0; 2176 2177 if ( status == 0 ) { 2178 UT num_done; 2179 2180 num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done ); 2181 #ifdef KMP_DEBUG 2182 { 2183 const char * buff; 2184 // create format specifiers before the debug output 2185 buff = __kmp_str_format( 2186 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", 2187 traits_t< UT >::spec ); 2188 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) ); 2189 __kmp_str_free( &buff ); 2190 } 2191 #endif 2192 2193 if ( (ST)num_done == th->th.th_team_nproc - 1 ) { 2194 #if ( KMP_STATIC_STEAL_ENABLED ) 2195 if( pr->schedule == kmp_sch_static_steal && traits_t<T>::type_size > 4 ) { 2196 int i; 2197 kmp_info_t **other_threads = team->t.t_threads; 2198 // loop complete, safe to destroy locks used for stealing 2199 for( i = 0; i < th->th.th_team_nproc; ++i ) { 2200 kmp_lock_t * lck = other_threads[i]->th.th_dispatch->th_steal_lock; 2201 KMP_ASSERT(lck != NULL); 2202 __kmp_destroy_lock( lck ); 2203 __kmp_free( lck ); 2204 other_threads[i]->th.th_dispatch->th_steal_lock = NULL; 2205 } 2206 } 2207 #endif 2208 /* NOTE: release this buffer to be reused */ 2209 2210 KMP_MB(); /* Flush all pending memory write invalidates. */ 2211 2212 sh->u.s.num_done = 0; 2213 sh->u.s.iteration = 0; 2214 2215 /* TODO replace with general release procedure? */ 2216 if ( pr->ordered ) { 2217 sh->u.s.ordered_iteration = 0; 2218 } 2219 2220 KMP_MB(); /* Flush all pending memory write invalidates. */ 2221 2222 sh -> buffer_index += __kmp_dispatch_num_buffers; 2223 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", 2224 gtid, sh->buffer_index) ); 2225 2226 KMP_MB(); /* Flush all pending memory write invalidates. */ 2227 2228 } // if 2229 if ( __kmp_env_consistency_check ) { 2230 if ( pr->pushed_ws != ct_none ) { 2231 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 2232 } 2233 } 2234 2235 th -> th.th_dispatch -> th_deo_fcn = NULL; 2236 th -> th.th_dispatch -> th_dxo_fcn = NULL; 2237 th -> th.th_dispatch -> th_dispatch_sh_current = NULL; 2238 th -> th.th_dispatch -> th_dispatch_pr_current = NULL; 2239 } // if (status == 0) 2240 #if KMP_OS_WINDOWS 2241 else if ( last ) { 2242 pr->u.p.last_upper = pr->u.p.ub; 2243 } 2244 #endif /* KMP_OS_WINDOWS */ 2245 if ( p_last != NULL && status != 0 ) 2246 *p_last = last; 2247 } // if 2248 2249 #ifdef KMP_DEBUG 2250 { 2251 const char * buff; 2252 // create format specifiers before the debug output 2253 buff = __kmp_str_format( 2254 "__kmp_dispatch_next: T#%%d normal case: " \ 2255 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n", 2256 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 2257 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? 
*p_st : 0, p_last, status ) ); 2258 __kmp_str_free( &buff ); 2259 } 2260 #endif 2261 #if INCLUDE_SSC_MARKS 2262 SSC_MARK_DISPATCH_NEXT(); 2263 #endif 2264 OMPT_LOOP_END; 2265 return status; 2266 } 2267 2268 template< typename T > 2269 static void 2270 __kmp_dist_get_bounds( 2271 ident_t *loc, 2272 kmp_int32 gtid, 2273 kmp_int32 *plastiter, 2274 T *plower, 2275 T *pupper, 2276 typename traits_t< T >::signed_t incr 2277 ) { 2278 typedef typename traits_t< T >::unsigned_t UT; 2279 typedef typename traits_t< T >::signed_t ST; 2280 register kmp_uint32 team_id; 2281 register kmp_uint32 nteams; 2282 register UT trip_count; 2283 register kmp_team_t *team; 2284 kmp_info_t * th; 2285 2286 KMP_DEBUG_ASSERT( plastiter && plower && pupper ); 2287 KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid)); 2288 #ifdef KMP_DEBUG 2289 { 2290 const char * buff; 2291 // create format specifiers before the debug output 2292 buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\ 2293 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n", 2294 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec, 2295 traits_t< T >::spec ); 2296 KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) ); 2297 __kmp_str_free( &buff ); 2298 } 2299 #endif 2300 2301 if( __kmp_env_consistency_check ) { 2302 if( incr == 0 ) { 2303 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc ); 2304 } 2305 if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) { 2306 // The loop is illegal. 2307 // Some zero-trip loops maintained by compiler, e.g.: 2308 // for(i=10;i<0;++i) // lower >= upper - run-time check 2309 // for(i=0;i>10;--i) // lower <= upper - run-time check 2310 // for(i=0;i>10;++i) // incr > 0 - compile-time check 2311 // for(i=10;i<0;--i) // incr < 0 - compile-time check 2312 // Compiler does not check the following illegal loops: 2313 // for(i=0;i<10;i+=incr) // where incr<0 2314 // for(i=10;i>0;i-=incr) // where incr<0 2315 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc ); 2316 } 2317 } 2318 th = __kmp_threads[gtid]; 2319 team = th->th.th_team; 2320 #if OMP_40_ENABLED 2321 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct 2322 nteams = th->th.th_teams_size.nteams; 2323 #endif 2324 team_id = team->t.t_master_tid; 2325 KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc); 2326 2327 // compute global trip count 2328 if( incr == 1 ) { 2329 trip_count = *pupper - *plower + 1; 2330 } else if(incr == -1) { 2331 trip_count = *plower - *pupper + 1; 2332 } else if ( incr > 0 ) { 2333 // upper-lower can exceed the limit of signed type 2334 trip_count = (UT)(*pupper - *plower) / incr + 1; 2335 } else { 2336 trip_count = (UT)(*plower - *pupper) / ( -incr ) + 1; 2337 } 2338 2339 if( trip_count <= nteams ) { 2340 KMP_DEBUG_ASSERT( 2341 __kmp_static == kmp_sch_static_greedy || \ 2342 __kmp_static == kmp_sch_static_balanced 2343 ); // Unknown static scheduling type. 2344 // only some teams get single iteration, others get nothing 2345 if( team_id < trip_count ) { 2346 *pupper = *plower = *plower + team_id * incr; 2347 } else { 2348 *plower = *pupper + incr; // zero-trip loop 2349 } 2350 if( plastiter != NULL ) 2351 *plastiter = ( team_id == trip_count - 1 ); 2352 } else { 2353 if( __kmp_static == kmp_sch_static_balanced ) { 2354 register UT chunk = trip_count / nteams; 2355 register UT extras = trip_count % nteams; 2356 *plower += incr * ( team_id * chunk + ( team_id < extras ? 
team_id : extras ) ); 2357 *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr ); 2358 if( plastiter != NULL ) 2359 *plastiter = ( team_id == nteams - 1 ); 2360 } else { 2361 register T chunk_inc_count = 2362 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr; 2363 register T upper = *pupper; 2364 KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy ); 2365 // Unknown static scheduling type. 2366 *plower += team_id * chunk_inc_count; 2367 *pupper = *plower + chunk_inc_count - incr; 2368 // Check/correct bounds if needed 2369 if( incr > 0 ) { 2370 if( *pupper < *plower ) 2371 *pupper = traits_t<T>::max_value; 2372 if( plastiter != NULL ) 2373 *plastiter = *plower <= upper && *pupper > upper - incr; 2374 if( *pupper > upper ) 2375 *pupper = upper; // tracker C73258 2376 } else { 2377 if( *pupper > *plower ) 2378 *pupper = traits_t<T>::min_value; 2379 if( plastiter != NULL ) 2380 *plastiter = *plower >= upper && *pupper < upper - incr; 2381 if( *pupper < upper ) 2382 *pupper = upper; // tracker C73258 2383 } 2384 } 2385 } 2386 } 2387 2388 //----------------------------------------------------------------------------------------- 2389 // Dispatch routines 2390 // Transfer call to template< type T > 2391 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule, 2392 // T lb, T ub, ST st, ST chunk ) 2393 extern "C" { 2394 2395 /*! 2396 @ingroup WORK_SHARING 2397 @{ 2398 @param loc Source location 2399 @param gtid Global thread id 2400 @param schedule Schedule type 2401 @param lb Lower bound 2402 @param ub Upper bound 2403 @param st Step (or increment if you prefer) 2404 @param chunk The chunk size to block with 2405 2406 This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments. 2407 These functions are all identical apart from the types of the arguments. 2408 */ 2409 2410 void 2411 __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2412 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk ) 2413 { 2414 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2415 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2416 } 2417 /*! 2418 See @ref __kmpc_dispatch_init_4 2419 */ 2420 void 2421 __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2422 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk ) 2423 { 2424 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2425 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2426 } 2427 2428 /*! 2429 See @ref __kmpc_dispatch_init_4 2430 */ 2431 void 2432 __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2433 kmp_int64 lb, kmp_int64 ub, 2434 kmp_int64 st, kmp_int64 chunk ) 2435 { 2436 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2437 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2438 } 2439 2440 /*! 2441 See @ref __kmpc_dispatch_init_4 2442 */ 2443 void 2444 __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2445 kmp_uint64 lb, kmp_uint64 ub, 2446 kmp_int64 st, kmp_int64 chunk ) 2447 { 2448 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2449 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2450 } 2451 2452 /*! 2453 See @ref __kmpc_dispatch_init_4 2454 2455 Difference from __kmpc_dispatch_init set of functions is these functions 2456 are called for composite distribute parallel for construct. 
Thus, before the regular iterations are dispatched, the per-team iteration space
has to be computed.

These functions are all identical apart from the types of the arguments.
*/
void
__kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
    kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
    __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

void
__kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
    kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
    __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

void
__kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
    kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
    __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

void
__kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
    kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
    __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

/*!
@param loc Source code location
@param gtid Global thread id
@param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
@param p_lb Pointer to the lower bound for the next chunk of work
@param p_ub Pointer to the upper bound for the next chunk of work
@param p_st Pointer to the stride for the next chunk of work
@return one if there is work to be done, zero otherwise

Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then the lb, ub and stride need not be modified.
*/
int
__kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                        kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
{
    return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int
__kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                         kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
{
    return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int
__kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                        kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
{
    return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

/*!
2537 See @ref __kmpc_dispatch_next_4 2538 */ 2539 int 2540 __kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2541 kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st ) 2542 { 2543 return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st ); 2544 } 2545 2546 /*! 2547 @param loc Source code location 2548 @param gtid Global thread id 2549 2550 Mark the end of a dynamic loop. 2551 */ 2552 void 2553 __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid ) 2554 { 2555 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc ); 2556 } 2557 2558 /*! 2559 See @ref __kmpc_dispatch_fini_4 2560 */ 2561 void 2562 __kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid ) 2563 { 2564 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc ); 2565 } 2566 2567 /*! 2568 See @ref __kmpc_dispatch_fini_4 2569 */ 2570 void 2571 __kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid ) 2572 { 2573 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc ); 2574 } 2575 2576 /*! 2577 See @ref __kmpc_dispatch_fini_4 2578 */ 2579 void 2580 __kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid ) 2581 { 2582 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc ); 2583 } 2584 /*! @} */ 2585 2586 //----------------------------------------------------------------------------------------- 2587 //Non-template routines from kmp_dispatch.cpp used in other sources 2588 2589 kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) { 2590 return value == checker; 2591 } 2592 2593 kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) { 2594 return value != checker; 2595 } 2596 2597 kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) { 2598 return value < checker; 2599 } 2600 2601 kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) { 2602 return value >= checker; 2603 } 2604 2605 kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) { 2606 return value <= checker; 2607 } 2608 2609 kmp_uint32 2610 __kmp_wait_yield_4(volatile kmp_uint32 * spinner, 2611 kmp_uint32 checker, 2612 kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 ) 2613 , void * obj // Higher-level synchronization object, or NULL. 2614 ) 2615 { 2616 // note: we may not belong to a team at this point 2617 register volatile kmp_uint32 * spin = spinner; 2618 register kmp_uint32 check = checker; 2619 register kmp_uint32 spins; 2620 register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred; 2621 register kmp_uint32 r; 2622 2623 KMP_FSYNC_SPIN_INIT( obj, (void*) spin ); 2624 KMP_INIT_YIELD( spins ); 2625 // main wait spin loop 2626 while(!f(r = TCR_4(*spin), check)) { 2627 KMP_FSYNC_SPIN_PREPARE( obj ); 2628 /* GEH - remove this since it was accidentally introduced when kmp_wait was split. 2629 It causes problems with infinite recursion because of exit lock */ 2630 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 2631 __kmp_abort_thread(); */ 2632 2633 /* if we have waited a bit, or are oversubscribed, yield */ 2634 /* pause is in the following code */ 2635 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc ); 2636 KMP_YIELD_SPIN( spins ); 2637 } 2638 KMP_FSYNC_SPIN_ACQUIRED( obj ); 2639 return r; 2640 } 2641 2642 void 2643 __kmp_wait_yield_4_ptr(void *spinner, 2644 kmp_uint32 checker, 2645 kmp_uint32 (*pred)( void *, kmp_uint32 ), 2646 void *obj // Higher-level synchronization object, or NULL. 
2647 ) 2648 { 2649 // note: we may not belong to a team at this point 2650 register void *spin = spinner; 2651 register kmp_uint32 check = checker; 2652 register kmp_uint32 spins; 2653 register kmp_uint32 (*f) ( void *, kmp_uint32 ) = pred; 2654 2655 KMP_FSYNC_SPIN_INIT( obj, spin ); 2656 KMP_INIT_YIELD( spins ); 2657 // main wait spin loop 2658 while ( !f( spin, check ) ) { 2659 KMP_FSYNC_SPIN_PREPARE( obj ); 2660 /* if we have waited a bit, or are oversubscribed, yield */ 2661 /* pause is in the following code */ 2662 KMP_YIELD( TCR_4( __kmp_nth ) > __kmp_avail_proc ); 2663 KMP_YIELD_SPIN( spins ); 2664 } 2665 KMP_FSYNC_SPIN_ACQUIRED( obj ); 2666 } 2667 2668 } // extern "C" 2669 2670 #ifdef KMP_GOMP_COMPAT 2671 2672 void 2673 __kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2674 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, 2675 kmp_int32 chunk, int push_ws ) 2676 { 2677 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, 2678 push_ws ); 2679 } 2680 2681 void 2682 __kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2683 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, 2684 kmp_int32 chunk, int push_ws ) 2685 { 2686 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, 2687 push_ws ); 2688 } 2689 2690 void 2691 __kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2692 kmp_int64 lb, kmp_int64 ub, kmp_int64 st, 2693 kmp_int64 chunk, int push_ws ) 2694 { 2695 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, 2696 push_ws ); 2697 } 2698 2699 void 2700 __kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2701 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, 2702 kmp_int64 chunk, int push_ws ) 2703 { 2704 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, 2705 push_ws ); 2706 } 2707 2708 void 2709 __kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid ) 2710 { 2711 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc ); 2712 } 2713 2714 void 2715 __kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid ) 2716 { 2717 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc ); 2718 } 2719 2720 void 2721 __kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid ) 2722 { 2723 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc ); 2724 } 2725 2726 void 2727 __kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid ) 2728 { 2729 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc ); 2730 } 2731 2732 #endif /* KMP_GOMP_COMPAT */ 2733 2734 /* ------------------------------------------------------------------------ */ 2735 /* ------------------------------------------------------------------------ */ 2736 2737
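/*
    Usage sketch (illustrative only, not part of the runtime): for a loop such as

        #pragma omp for schedule(dynamic, chunk)
        for ( i = lb; i <= ub; i += st ) body( i );

    a compiler typically lowers the worksharing construct into calls to the dispatch
    entry points defined above, roughly:

        __kmpc_dispatch_init_4( &loc, gtid, kmp_sch_dynamic_chunked, lb, ub, st, chunk );
        while ( __kmpc_dispatch_next_4( &loc, gtid, &last, &lb, &ub, &st ) ) {
            for ( i = lb; i <= ub; i += st )
                body( i );
        }

    Here body(), lb, ub, st and chunk are placeholders. The _8/_4u/_8u variants are
    selected according to the type of the loop variable, and for ordered loops the
    corresponding __kmpc_dispatch_fini_* entry point is additionally called at the
    end of each chunk. Exact code generation differs between compilers; this is only
    a sketch of the calling sequence.
*/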