/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 */


//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


/*
 * Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
 *       it may change values between parallel regions.  __kmp_max_nth
 *       is the largest value __kmp_nth may take, 1 is the smallest.
 *
 */

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

// Need to raise Win version from XP to Vista here for support of InterlockedExchange64
#if defined(_WIN32_WINNT) && defined(_M_IX86)
#undef _WIN32_WINNT
#define _WIN32_WINNT 0x0502
#endif

#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_str.h"
#include "kmp_error.h"
#include "kmp_stats.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
#include <float.h>
#endif

#if OMPT_SUPPORT
#include "ompt-internal.h"
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#if KMP_STATIC_STEAL_ENABLED

// replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
template< typename T >
struct dispatch_private_infoXX_template {
    typedef typename traits_t< T >::unsigned_t  UT;
    typedef typename traits_t< T >::signed_t    ST;
    UT count;                // unsigned
    T  ub;
    /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
    T  lb;
    ST st;                   // signed
    UT tc;                   // unsigned
    T  static_steal_counter; // for static_steal only; maybe better to put after ub

    /* parm[1-4] are used in different ways by different scheduling algorithms */

    // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
    //    a) parm3 is properly aligned and
    //    b) all parm1-4 are in the same cache line.
    // Because parm1-4 are used together, performance seems to be better
    // if they are in the same line (not measured though).

    struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
        T  parm1;
        T  parm2;
        T  parm3;
        T  parm4;
    };

    UT ordered_lower; // unsigned
    UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
    T  last_upper;
#endif /* KMP_OS_WINDOWS */
};

#else /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
template< typename T >
struct dispatch_private_infoXX_template {
    typedef typename traits_t< T >::unsigned_t  UT;
    typedef typename traits_t< T >::signed_t    ST;
    T  lb;
    T  ub;
    ST st;  // signed
    UT tc;  // unsigned

    T  parm1;
    T  parm2;
    T  parm3;
    T  parm4;

    UT count; // unsigned

    UT ordered_lower; // unsigned
    UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
    T  last_upper;
#endif /* KMP_OS_WINDOWS */
};

#endif /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info structure and dispatch_private_info_t type
template< typename T >
struct KMP_ALIGN_CACHE dispatch_private_info_template {
    // duplicate alignment here, otherwise size of structure is not correct in our compiler
    union KMP_ALIGN_CACHE private_info_tmpl {
        dispatch_private_infoXX_template< T > p;
        dispatch_private_info64_t             p64;
    } u;
    enum sched_type schedule;   /* scheduling algorithm */
    kmp_uint32      ordered;    /* ordered clause specified */
    kmp_uint32      ordered_bumped;
    kmp_int32       ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making order
    dispatch_private_info * next; /* stack of buffers for nest of serial regions */
    kmp_uint32      nomerge;    /* don't merge iters if serialized */
    kmp_uint32      type_size;
    enum cons_type  pushed_ws;
};


// replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
template< typename UT >
struct dispatch_shared_infoXX_template {
    /* chunk index under dynamic, number of idle threads under static-steal;
       iteration index otherwise */
    volatile UT iteration;
    volatile UT num_done;
    volatile UT ordered_iteration;
    UT          ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size making ordered_iteration scalar
};

// replaces dispatch_shared_info structure and dispatch_shared_info_t type
template< typename UT >
struct dispatch_shared_info_template {
    // we need union here to keep the structure size
    union shared_info_tmpl {
        dispatch_shared_infoXX_template< UT > s;
        dispatch_shared_info64_t              s64;
    } u;
    volatile kmp_uint32 buffer_index;
#if OMP_45_ENABLED
    volatile kmp_int32  doacross_buf_idx;  // teamwise index
    kmp_uint32         *doacross_flags;    // array of iteration flags (0/1)
    kmp_int32           doacross_num_done; // count finished threads
#endif
#if KMP_USE_HWLOC
    // When linking with libhwloc, the ORDERED EPCC test slows down on big
    // machines (> 48 cores). Performance analysis showed that a cache thrash
    // was occurring and this padding helps alleviate the problem.
    char padding[64];
#endif
};

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#undef USE_TEST_LOCKS

// test_then_add template (general template should NOT be used)
template< typename T >
static __forceinline T
test_then_add( volatile T *p, T d );

template<>
__forceinline kmp_int32
test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
{
    kmp_int32 r;
    r = KMP_TEST_THEN_ADD32( p, d );
    return r;
}

template<>
__forceinline kmp_int64
test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
{
    kmp_int64 r;
    r = KMP_TEST_THEN_ADD64( p, d );
    return r;
}

// test_then_inc_acq template (general template should NOT be used)
template< typename T >
static __forceinline T
test_then_inc_acq( volatile T *p );

template<>
__forceinline kmp_int32
test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
{
    kmp_int32 r;
    r = KMP_TEST_THEN_INC_ACQ32( p );
    return r;
}

template<>
__forceinline kmp_int64
test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
{
    kmp_int64 r;
    r = KMP_TEST_THEN_INC_ACQ64( p );
    return r;
}

// test_then_inc template (general template should NOT be used)
template< typename T >
static __forceinline T
test_then_inc( volatile T *p );

template<>
__forceinline kmp_int32
test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
{
    kmp_int32 r;
    r = KMP_TEST_THEN_INC32( p );
    return r;
}

template<>
__forceinline kmp_int64
test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
{
    kmp_int64 r;
    r = KMP_TEST_THEN_INC64( p );
    return r;
}

// compare_and_swap template (general template should NOT be used)
template< typename T >
static __forceinline kmp_int32
compare_and_swap( volatile T *p, T c, T s );

template<>
__forceinline kmp_int32
compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
{
    return KMP_COMPARE_AND_STORE_REL32( p, c, s );
}

template<>
__forceinline kmp_int32
compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
{
    return KMP_COMPARE_AND_STORE_REL64( p, c, s );
}

/*
    Spin wait loop that first does pause, then yield.
    Waits until the predicate returns non-zero when called with *spinner and check.
    Does NOT put threads to sleep.
#if USE_ITT_BUILD
    Arguments:
        obj -- the higher-level synchronization object to report to ittnotify. It is used to report
            locks consistently. For example, if the lock is acquired immediately, its address is
            reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
            immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the same
            address, not the address of the low-level spinner.
#endif // USE_ITT_BUILD
*/
template< typename UT >
// ToDo: make inline function (move to header file for icl)
static UT  // unsigned 4- or 8-byte type
__kmp_wait_yield( volatile UT * spinner,
                  UT            checker,
                  kmp_uint32 (* pred)( UT, UT )
                  USE_ITT_BUILD_ARG(void * obj)    // Higher-level synchronization object, or NULL.
282 ) 283 { 284 // note: we may not belong to a team at this point 285 register volatile UT * spin = spinner; 286 register UT check = checker; 287 register kmp_uint32 spins; 288 register kmp_uint32 (*f) ( UT, UT ) = pred; 289 register UT r; 290 291 KMP_FSYNC_SPIN_INIT( obj, (void*) spin ); 292 KMP_INIT_YIELD( spins ); 293 // main wait spin loop 294 while(!f(r = *spin, check)) 295 { 296 KMP_FSYNC_SPIN_PREPARE( obj ); 297 /* GEH - remove this since it was accidentally introduced when kmp_wait was split. 298 It causes problems with infinite recursion because of exit lock */ 299 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 300 __kmp_abort_thread(); */ 301 302 // if we are oversubscribed, 303 // or have waited a bit (and KMP_LIBRARY=throughput, then yield 304 // pause is in the following code 305 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc ); 306 KMP_YIELD_SPIN( spins ); 307 } 308 KMP_FSYNC_SPIN_ACQUIRED( obj ); 309 return r; 310 } 311 312 template< typename UT > 313 static kmp_uint32 __kmp_eq( UT value, UT checker) { 314 return value == checker; 315 } 316 317 template< typename UT > 318 static kmp_uint32 __kmp_neq( UT value, UT checker) { 319 return value != checker; 320 } 321 322 template< typename UT > 323 static kmp_uint32 __kmp_lt( UT value, UT checker) { 324 return value < checker; 325 } 326 327 template< typename UT > 328 static kmp_uint32 __kmp_ge( UT value, UT checker) { 329 return value >= checker; 330 } 331 332 template< typename UT > 333 static kmp_uint32 __kmp_le( UT value, UT checker) { 334 return value <= checker; 335 } 336 337 338 /* ------------------------------------------------------------------------ */ 339 /* ------------------------------------------------------------------------ */ 340 341 static void 342 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 343 { 344 kmp_info_t *th; 345 346 KMP_DEBUG_ASSERT( gtid_ref ); 347 348 if ( __kmp_env_consistency_check ) { 349 th = __kmp_threads[*gtid_ref]; 350 if ( th -> th.th_root -> r.r_active 351 && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) { 352 #if KMP_USE_DYNAMIC_LOCK 353 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 ); 354 #else 355 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL ); 356 #endif 357 } 358 } 359 } 360 361 template< typename UT > 362 static void 363 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 364 { 365 typedef typename traits_t< UT >::signed_t ST; 366 dispatch_private_info_template< UT > * pr; 367 368 int gtid = *gtid_ref; 369 // int cid = *cid_ref; 370 kmp_info_t *th = __kmp_threads[ gtid ]; 371 KMP_DEBUG_ASSERT( th -> th.th_dispatch ); 372 373 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) ); 374 if ( __kmp_env_consistency_check ) { 375 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 376 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 377 if ( pr -> pushed_ws != ct_none ) { 378 #if KMP_USE_DYNAMIC_LOCK 379 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 ); 380 #else 381 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL ); 382 #endif 383 } 384 } 385 386 if ( ! th -> th.th_team -> t.t_serialized ) { 387 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 388 ( th -> th.th_dispatch -> th_dispatch_sh_current ); 389 UT lower; 390 391 if ( ! 
__kmp_env_consistency_check ) { 392 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 393 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 394 } 395 lower = pr->u.p.ordered_lower; 396 397 #if ! defined( KMP_GOMP_COMPAT ) 398 if ( __kmp_env_consistency_check ) { 399 if ( pr->ordered_bumped ) { 400 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons; 401 __kmp_error_construct2( 402 kmp_i18n_msg_CnsMultipleNesting, 403 ct_ordered_in_pdo, loc_ref, 404 & p->stack_data[ p->w_top ] 405 ); 406 } 407 } 408 #endif /* !defined(KMP_GOMP_COMPAT) */ 409 410 KMP_MB(); 411 #ifdef KMP_DEBUG 412 { 413 const char * buff; 414 // create format specifiers before the debug output 415 buff = __kmp_str_format( 416 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n", 417 traits_t< UT >::spec, traits_t< UT >::spec ); 418 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 419 __kmp_str_free( &buff ); 420 } 421 #endif 422 423 __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 424 USE_ITT_BUILD_ARG( NULL ) 425 ); 426 KMP_MB(); /* is this necessary? */ 427 #ifdef KMP_DEBUG 428 { 429 const char * buff; 430 // create format specifiers before the debug output 431 buff = __kmp_str_format( 432 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n", 433 traits_t< UT >::spec, traits_t< UT >::spec ); 434 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 435 __kmp_str_free( &buff ); 436 } 437 #endif 438 } 439 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) ); 440 } 441 442 static void 443 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 444 { 445 kmp_info_t *th; 446 447 if ( __kmp_env_consistency_check ) { 448 th = __kmp_threads[*gtid_ref]; 449 if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) { 450 __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref ); 451 } 452 } 453 } 454 455 template< typename UT > 456 static void 457 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 458 { 459 typedef typename traits_t< UT >::signed_t ST; 460 dispatch_private_info_template< UT > * pr; 461 462 int gtid = *gtid_ref; 463 // int cid = *cid_ref; 464 kmp_info_t *th = __kmp_threads[ gtid ]; 465 KMP_DEBUG_ASSERT( th -> th.th_dispatch ); 466 467 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) ); 468 if ( __kmp_env_consistency_check ) { 469 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 470 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 471 if ( pr -> pushed_ws != ct_none ) { 472 __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref ); 473 } 474 } 475 476 if ( ! th -> th.th_team -> t.t_serialized ) { 477 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 478 ( th -> th.th_dispatch -> th_dispatch_sh_current ); 479 480 if ( ! __kmp_env_consistency_check ) { 481 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 482 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 483 } 484 485 KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration ); 486 #if ! defined( KMP_GOMP_COMPAT ) 487 if ( __kmp_env_consistency_check ) { 488 if ( pr->ordered_bumped != 0 ) { 489 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons; 490 /* How to test it? 
                                   - OM */
                __kmp_error_construct2(
                    kmp_i18n_msg_CnsMultipleNesting,
                    ct_ordered_in_pdo, loc_ref,
                    & p->stack_data[ p->w_top ]
                );
            }
        }
#endif /* !defined(KMP_GOMP_COMPAT) */

        KMP_MB();  /* Flush all pending memory write invalidates. */

        pr->ordered_bumped += 1;

        KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
                        gtid, pr->ordered_bumped ) );

        KMP_MB();  /* Flush all pending memory write invalidates. */

        /* TODO use general release procedure? */
        test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );

        KMP_MB();  /* Flush all pending memory write invalidates. */
    }
    KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
}

/* Computes and returns x to the power of y, where y must be a non-negative integer */
template< typename UT >
static __forceinline long double
__kmp_pow(long double x, UT y) {
    long double s = 1.0L;

    KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
    //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
    while(y) {
        if ( y & 1 )
            s *= x;
        x *= x;
        y >>= 1;
    }
    return s;
}

/* Computes and returns the number of unassigned iterations after idx chunks have been assigned
   (the total number of unassigned iterations in chunks with index greater than or equal to idx).
   __forceinline seems to be broken, so that if we __forceinline this function the behavior is wrong
   (one of the unit tests, sch_guided_analytical_basic.cpp, fails).
*/
template< typename T >
static __inline typename traits_t< T >::unsigned_t
__kmp_dispatch_guided_remaining(
    T                                  tc,
    typename traits_t< T >::floating_t base,
    typename traits_t< T >::unsigned_t idx
) {
    /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
       least for ICL 8.1, long double arithmetic may not really have
       long double precision, even with /Qlong_double. Currently, we
       work around that in the caller code, by manipulating the FPCW for
       Windows* OS on IA-32 architecture. The lack of precision is not
       expected to be a correctness issue, though.
    */
    typedef typename traits_t< T >::unsigned_t UT;

    long double x = tc * __kmp_pow< UT >(base, idx);
    UT r = (UT) x;
    if ( x == r )
        return r;
    return r + 1;
}

// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
//   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// By default n = 2. For example, with n = 3 the chunk distribution will be flatter;
// with n = 1 the first chunk is the same as for a static schedule, i.e. trip / nproc.
// (An illustrative sketch of these parameters follows below.)
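// The block below is an illustrative sketch only and is kept out of the build
// with #if 0: it plugs sample numbers into the p2/p3 formulas documented above
// and into the ceil(tc * base^idx) estimate computed by
// __kmp_dispatch_guided_remaining(). The function name example_guided_parameters
// and the sample nproc/chunk/trip-count values are assumptions made for this
// sketch; they are not part of the runtime.
#if 0
#include <cmath>
#include <cstdio>

static void example_guided_parameters() {
    const int           n     = 2;      // default guided_int_param
    const unsigned      nproc = 8;      // sample team size (assumed)
    const unsigned      chunk = 4;      // sample chunk size (assumed)
    const unsigned long tc    = 100000; // sample trip count (assumed)

    // p2: once this few iterations remain, the algorithm switches to dynamic
    const unsigned long p2 = (unsigned long)n * nproc * ( chunk + 1 ); // 2*8*5 = 80
    // p3: fraction of the remaining iterations handed out per chunk
    const double        p3 = 1.0 / ( (double)n * nproc );              // 1/16 = 0.0625

    // With the default n == 2 the remaining work shrinks by the factor
    // (1 - 0.5/nproc) per chunk -- the same term the guided-analytical code
    // later computes as x; ceil(tc * base^idx) mirrors __kmp_dispatch_guided_remaining().
    const long double base = 1.0L - 0.5L / nproc;
    for ( unsigned idx = 0; idx <= 4; ++idx ) {
        unsigned long remaining =
            (unsigned long) std::ceil( tc * std::pow( base, (long double) idx ) );
        std::printf( "after %u chunks: ~%lu iterations remain (p2=%lu, p3=%g)\n",
                     idx, remaining, p2, p3 );
    }
}
#endif // illustrative sketch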
567 static int guided_int_param = 2; 568 static double guided_flt_param = 0.5;// = 1.0 / guided_int_param; 569 570 // UT - unsigned flavor of T, ST - signed flavor of T, 571 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8 572 template< typename T > 573 static void 574 __kmp_dispatch_init( 575 ident_t * loc, 576 int gtid, 577 enum sched_type schedule, 578 T lb, 579 T ub, 580 typename traits_t< T >::signed_t st, 581 typename traits_t< T >::signed_t chunk, 582 int push_ws 583 ) { 584 typedef typename traits_t< T >::unsigned_t UT; 585 typedef typename traits_t< T >::signed_t ST; 586 typedef typename traits_t< T >::floating_t DBL; 587 588 int active; 589 T tc; 590 kmp_info_t * th; 591 kmp_team_t * team; 592 kmp_uint32 my_buffer_index; 593 dispatch_private_info_template< T > * pr; 594 dispatch_shared_info_template< UT > volatile * sh; 595 596 KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) ); 597 KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) ); 598 599 if ( ! TCR_4( __kmp_init_parallel ) ) 600 __kmp_parallel_initialize(); 601 602 #if INCLUDE_SSC_MARKS 603 SSC_MARK_DISPATCH_INIT(); 604 #endif 605 #ifdef KMP_DEBUG 606 { 607 const char * buff; 608 // create format specifiers before the debug output 609 buff = __kmp_str_format( 610 "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n", 611 traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 612 KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) ); 613 __kmp_str_free( &buff ); 614 } 615 #endif 616 /* setup data */ 617 th = __kmp_threads[ gtid ]; 618 team = th -> th.th_team; 619 active = ! team -> t.t_serialized; 620 th->th.th_ident = loc; 621 622 #if USE_ITT_BUILD 623 kmp_uint64 cur_chunk = chunk; 624 int itt_need_metadata_reporting = __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 625 KMP_MASTER_GTID(gtid) && 626 #if OMP_40_ENABLED 627 th->th.th_teams_microtask == NULL && 628 #endif 629 team->t.t_active_level == 1; 630 #endif 631 if ( ! active ) { 632 pr = reinterpret_cast< dispatch_private_info_template< T >* > 633 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */ 634 } else { 635 KMP_DEBUG_ASSERT( th->th.th_dispatch == 636 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 637 638 my_buffer_index = th->th.th_dispatch->th_disp_index ++; 639 640 /* What happens when number of threads changes, need to resize buffer? 
*/ 641 pr = reinterpret_cast< dispatch_private_info_template< T > * > 642 ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % __kmp_dispatch_num_buffers ] ); 643 sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * > 644 ( &team -> t.t_disp_buffer[ my_buffer_index % __kmp_dispatch_num_buffers ] ); 645 } 646 647 #if ( KMP_STATIC_STEAL_ENABLED ) 648 if ( SCHEDULE_HAS_NONMONOTONIC(schedule) ) 649 // AC: we now have only one implementation of stealing, so use it 650 schedule = kmp_sch_static_steal; 651 else 652 #endif 653 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 654 655 /* Pick up the nomerge/ordered bits from the scheduling type */ 656 if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) { 657 pr->nomerge = TRUE; 658 schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); 659 } else { 660 pr->nomerge = FALSE; 661 } 662 pr->type_size = traits_t<T>::type_size; // remember the size of variables 663 if ( kmp_ord_lower & schedule ) { 664 pr->ordered = TRUE; 665 schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); 666 } else { 667 pr->ordered = FALSE; 668 } 669 670 if ( schedule == kmp_sch_static ) { 671 schedule = __kmp_static; 672 } else { 673 if ( schedule == kmp_sch_runtime ) { 674 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified) 675 schedule = team -> t.t_sched.r_sched_type; 676 // Detail the schedule if needed (global controls are differentiated appropriately) 677 if ( schedule == kmp_sch_guided_chunked ) { 678 schedule = __kmp_guided; 679 } else if ( schedule == kmp_sch_static ) { 680 schedule = __kmp_static; 681 } 682 // Use the chunk size specified by OMP_SCHEDULE (or default if not specified) 683 chunk = team -> t.t_sched.chunk; 684 #if USE_ITT_BUILD 685 cur_chunk = chunk; 686 #endif 687 #ifdef KMP_DEBUG 688 { 689 const char * buff; 690 // create format specifiers before the debug output 691 buff = __kmp_str_format( 692 "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n", 693 traits_t< ST >::spec ); 694 KD_TRACE(10, ( buff, gtid, schedule, chunk ) ); 695 __kmp_str_free( &buff ); 696 } 697 #endif 698 } else { 699 if ( schedule == kmp_sch_guided_chunked ) { 700 schedule = __kmp_guided; 701 } 702 if ( chunk <= 0 ) { 703 chunk = KMP_DEFAULT_CHUNK; 704 } 705 } 706 707 if ( schedule == kmp_sch_auto ) { 708 // mapping and differentiation: in the __kmp_do_serial_initialize() 709 schedule = __kmp_auto; 710 #ifdef KMP_DEBUG 711 { 712 const char * buff; 713 // create format specifiers before the debug output 714 buff = __kmp_str_format( 715 "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n", 716 traits_t< ST >::spec ); 717 KD_TRACE(10, ( buff, gtid, schedule, chunk ) ); 718 __kmp_str_free( &buff ); 719 } 720 #endif 721 } 722 723 /* guided analytical not safe for too many threads */ 724 if ( schedule == kmp_sch_guided_analytical_chunked && th->th.th_team_nproc > 1<<20 ) { 725 schedule = kmp_sch_guided_iterative_chunked; 726 KMP_WARNING( DispatchManyThreads ); 727 } 728 pr->u.p.parm1 = chunk; 729 } 730 KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper), 731 "unknown scheduling type" ); 732 733 pr->u.p.count = 0; 734 735 if ( __kmp_env_consistency_check ) { 736 if ( st == 0 ) { 737 __kmp_error_construct( 738 kmp_i18n_msg_CnsLoopIncrZeroProhibited, 739 ( pr->ordered ? 
ct_pdo_ordered : ct_pdo ), loc 740 ); 741 } 742 } 743 // compute trip count 744 if ( st == 1 ) { // most common case 745 if ( ub >= lb ) { 746 tc = ub - lb + 1; 747 } else { // ub < lb 748 tc = 0; // zero-trip 749 } 750 } else if ( st < 0 ) { 751 if ( lb >= ub ) { 752 // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B), 753 // where the division needs to be unsigned regardless of the result type 754 tc = (UT)(lb - ub) / (-st) + 1; 755 } else { // lb < ub 756 tc = 0; // zero-trip 757 } 758 } else { // st > 0 759 if ( ub >= lb ) { 760 // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B), 761 // where the division needs to be unsigned regardless of the result type 762 tc = (UT)(ub - lb) / st + 1; 763 } else { // ub < lb 764 tc = 0; // zero-trip 765 } 766 } 767 768 // Any half-decent optimizer will remove this test when the blocks are empty since the macros expand to nothing 769 // when statistics are disabled. 770 if (schedule == __kmp_static) 771 { 772 KMP_COUNT_BLOCK(OMP_FOR_static); 773 KMP_COUNT_VALUE(FOR_static_iterations, tc); 774 } 775 else 776 { 777 KMP_COUNT_BLOCK(OMP_FOR_dynamic); 778 KMP_COUNT_VALUE(FOR_dynamic_iterations, tc); 779 } 780 781 pr->u.p.lb = lb; 782 pr->u.p.ub = ub; 783 pr->u.p.st = st; 784 pr->u.p.tc = tc; 785 786 #if KMP_OS_WINDOWS 787 pr->u.p.last_upper = ub + st; 788 #endif /* KMP_OS_WINDOWS */ 789 790 /* NOTE: only the active parallel region(s) has active ordered sections */ 791 792 if ( active ) { 793 if ( pr->ordered == 0 ) { 794 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error; 795 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error; 796 } else { 797 pr->ordered_bumped = 0; 798 799 pr->u.p.ordered_lower = 1; 800 pr->u.p.ordered_upper = 0; 801 802 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >; 803 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >; 804 } 805 } 806 807 if ( __kmp_env_consistency_check ) { 808 enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo; 809 if ( push_ws ) { 810 __kmp_push_workshare( gtid, ws, loc ); 811 pr->pushed_ws = ws; 812 } else { 813 __kmp_check_workshare( gtid, ws, loc ); 814 pr->pushed_ws = ct_none; 815 } 816 } 817 818 switch ( schedule ) { 819 #if ( KMP_STATIC_STEAL_ENABLED ) 820 case kmp_sch_static_steal: 821 { 822 T nproc = th->th.th_team_nproc; 823 T ntc, init; 824 825 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) ); 826 827 ntc = (tc % chunk ? 1 : 0) + tc / chunk; 828 if ( nproc > 1 && ntc >= nproc ) { 829 KMP_COUNT_BLOCK(OMP_FOR_static_steal); 830 T id = __kmp_tid_from_gtid(gtid); 831 T small_chunk, extras; 832 833 small_chunk = ntc / nproc; 834 extras = ntc % nproc; 835 836 init = id * small_chunk + ( id < extras ? id : extras ); 837 pr->u.p.count = init; 838 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 ); 839 840 pr->u.p.parm2 = lb; 841 //pr->pfields.parm3 = 0; // it's not used in static_steal 842 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid 843 pr->u.p.st = st; 844 if ( traits_t<T>::type_size > 4 ) { 845 // AC: TODO: check if 16-byte CAS available and use it to 846 // improve performance (probably wait for explicit request 847 // before spending time on this). 848 // For now use dynamically allocated per-thread lock, 849 // free memory in __kmp_dispatch_next when status==0. 
850 KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL); 851 th->th.th_dispatch->th_steal_lock = 852 (kmp_lock_t*)__kmp_allocate(sizeof(kmp_lock_t)); 853 __kmp_init_lock(th->th.th_dispatch->th_steal_lock); 854 } 855 break; 856 } else { 857 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n", 858 gtid ) ); 859 schedule = kmp_sch_static_balanced; 860 /* too few iterations: fall-through to kmp_sch_static_balanced */ 861 } // if 862 /* FALL-THROUGH to static balanced */ 863 } // case 864 #endif 865 case kmp_sch_static_balanced: 866 { 867 T nproc = th->th.th_team_nproc; 868 T init, limit; 869 870 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n", 871 gtid ) ); 872 873 if ( nproc > 1 ) { 874 T id = __kmp_tid_from_gtid(gtid); 875 876 if ( tc < nproc ) { 877 if ( id < tc ) { 878 init = id; 879 limit = id; 880 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ 881 } else { 882 pr->u.p.count = 1; /* means no more chunks to execute */ 883 pr->u.p.parm1 = FALSE; 884 break; 885 } 886 } else { 887 T small_chunk = tc / nproc; 888 T extras = tc % nproc; 889 init = id * small_chunk + (id < extras ? id : extras); 890 limit = init + small_chunk - (id < extras ? 0 : 1); 891 pr->u.p.parm1 = (id == nproc - 1); 892 } 893 } else { 894 if ( tc > 0 ) { 895 init = 0; 896 limit = tc - 1; 897 pr->u.p.parm1 = TRUE; 898 } else { 899 // zero trip count 900 pr->u.p.count = 1; /* means no more chunks to execute */ 901 pr->u.p.parm1 = FALSE; 902 break; 903 } 904 } 905 #if USE_ITT_BUILD 906 // Calculate chunk for metadata report 907 if ( itt_need_metadata_reporting ) 908 cur_chunk = limit - init + 1; 909 #endif 910 if ( st == 1 ) { 911 pr->u.p.lb = lb + init; 912 pr->u.p.ub = lb + limit; 913 } else { 914 T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound 915 pr->u.p.lb = lb + init * st; 916 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly 917 if ( st > 0 ) { 918 pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp ); 919 } else { 920 pr->u.p.ub = ( ub_tmp + st < ub ? 
ub : ub_tmp ); 921 } 922 } 923 if ( pr->ordered ) { 924 pr->u.p.ordered_lower = init; 925 pr->u.p.ordered_upper = limit; 926 } 927 break; 928 } // case 929 case kmp_sch_guided_iterative_chunked : 930 { 931 T nproc = th->th.th_team_nproc; 932 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid)); 933 934 if ( nproc > 1 ) { 935 if ( (2L * chunk + 1 ) * nproc >= tc ) { 936 /* chunk size too large, switch to dynamic */ 937 schedule = kmp_sch_dynamic_chunked; 938 } else { 939 // when remaining iters become less than parm2 - switch to dynamic 940 pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 ); 941 *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4 942 } 943 } else { 944 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid)); 945 schedule = kmp_sch_static_greedy; 946 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 947 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid)); 948 pr->u.p.parm1 = tc; 949 } // if 950 } // case 951 break; 952 case kmp_sch_guided_analytical_chunked: 953 { 954 T nproc = th->th.th_team_nproc; 955 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid)); 956 957 if ( nproc > 1 ) { 958 if ( (2L * chunk + 1 ) * nproc >= tc ) { 959 /* chunk size too large, switch to dynamic */ 960 schedule = kmp_sch_dynamic_chunked; 961 } else { 962 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 963 DBL x; 964 965 #if KMP_OS_WINDOWS && KMP_ARCH_X86 966 /* Linux* OS already has 64-bit computation by default for 967 long double, and on Windows* OS on Intel(R) 64, 968 /Qlong_double doesn't work. On Windows* OS 969 on IA-32 architecture, we need to set precision to 970 64-bit instead of the default 53-bit. Even though long 971 double doesn't work on Windows* OS on Intel(R) 64, the 972 resulting lack of precision is not expected to impact 973 the correctness of the algorithm, but this has not been 974 mathematically proven. 
975 */ 976 // save original FPCW and set precision to 64-bit, as 977 // Windows* OS on IA-32 architecture defaults to 53-bit 978 unsigned int oldFpcw = _control87(0,0); 979 _control87(_PC_64,_MCW_PC); // 0,0x30000 980 #endif 981 /* value used for comparison in solver for cross-over point */ 982 long double target = ((long double)chunk * 2 + 1) * nproc / tc; 983 984 /* crossover point--chunk indexes equal to or greater than 985 this point switch to dynamic-style scheduling */ 986 UT cross; 987 988 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 989 x = (long double)1.0 - (long double)0.5 / nproc; 990 991 #ifdef KMP_DEBUG 992 { // test natural alignment 993 struct _test_a { 994 char a; 995 union { 996 char b; 997 DBL d; 998 }; 999 } t; 1000 ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; 1001 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment ); 1002 KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 ); 1003 } 1004 #endif // KMP_DEBUG 1005 1006 /* save the term in thread private dispatch structure */ 1007 *(DBL*)&pr->u.p.parm3 = x; 1008 1009 /* solve for the crossover point to the nearest integer i for which C_i <= chunk */ 1010 { 1011 UT left, right, mid; 1012 long double p; 1013 1014 /* estimate initial upper and lower bound */ 1015 1016 /* doesn't matter what value right is as long as it is positive, but 1017 it affects performance of the solver 1018 */ 1019 right = 229; 1020 p = __kmp_pow< UT >(x,right); 1021 if ( p > target ) { 1022 do{ 1023 p *= p; 1024 right <<= 1; 1025 } while(p>target && right < (1<<27)); 1026 left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */ 1027 } else { 1028 left = 0; 1029 } 1030 1031 /* bisection root-finding method */ 1032 while ( left + 1 < right ) { 1033 mid = (left + right) / 2; 1034 if ( __kmp_pow< UT >(x,mid) > target ) { 1035 left = mid; 1036 } else { 1037 right = mid; 1038 } 1039 } // while 1040 cross = right; 1041 } 1042 /* assert sanity of computed crossover point */ 1043 KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target); 1044 1045 /* save the crossover point in thread private dispatch structure */ 1046 pr->u.p.parm2 = cross; 1047 1048 // C75803 1049 #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) ) 1050 #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3) 1051 #else 1052 #define GUIDED_ANALYTICAL_WORKAROUND (x) 1053 #endif 1054 /* dynamic-style scheduling offset */ 1055 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk; 1056 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1057 // restore FPCW 1058 _control87(oldFpcw,_MCW_PC); 1059 #endif 1060 } // if 1061 } else { 1062 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n", 1063 gtid ) ); 1064 schedule = kmp_sch_static_greedy; 1065 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 1066 pr->u.p.parm1 = tc; 1067 } // if 1068 } // case 1069 break; 1070 case kmp_sch_static_greedy: 1071 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid)); 1072 pr->u.p.parm1 = ( th->th.th_team_nproc > 1 ) ? 
1073 ( tc + th->th.th_team_nproc - 1 ) / th->th.th_team_nproc : 1074 tc; 1075 break; 1076 case kmp_sch_static_chunked : 1077 case kmp_sch_dynamic_chunked : 1078 if ( pr->u.p.parm1 <= 0 ) { 1079 pr->u.p.parm1 = KMP_DEFAULT_CHUNK; 1080 } 1081 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid)); 1082 break; 1083 case kmp_sch_trapezoidal : 1084 { 1085 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ 1086 1087 T parm1, parm2, parm3, parm4; 1088 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) ); 1089 1090 parm1 = chunk; 1091 1092 /* F : size of the first cycle */ 1093 parm2 = ( tc / (2 * th->th.th_team_nproc) ); 1094 1095 if ( parm2 < 1 ) { 1096 parm2 = 1; 1097 } 1098 1099 /* L : size of the last cycle. Make sure the last cycle 1100 * is not larger than the first cycle. 1101 */ 1102 if ( parm1 < 1 ) { 1103 parm1 = 1; 1104 } else if ( parm1 > parm2 ) { 1105 parm1 = parm2; 1106 } 1107 1108 /* N : number of cycles */ 1109 parm3 = ( parm2 + parm1 ); 1110 parm3 = ( 2 * tc + parm3 - 1) / parm3; 1111 1112 if ( parm3 < 2 ) { 1113 parm3 = 2; 1114 } 1115 1116 /* sigma : decreasing incr of the trapezoid */ 1117 parm4 = ( parm3 - 1 ); 1118 parm4 = ( parm2 - parm1 ) / parm4; 1119 1120 // pointless check, because parm4 >= 0 always 1121 //if ( parm4 < 0 ) { 1122 // parm4 = 0; 1123 //} 1124 1125 pr->u.p.parm1 = parm1; 1126 pr->u.p.parm2 = parm2; 1127 pr->u.p.parm3 = parm3; 1128 pr->u.p.parm4 = parm4; 1129 } // case 1130 break; 1131 1132 default: 1133 { 1134 __kmp_msg( 1135 kmp_ms_fatal, // Severity 1136 KMP_MSG( UnknownSchedTypeDetected ), // Primary message 1137 KMP_HNT( GetNewerLibrary ), // Hint 1138 __kmp_msg_null // Variadic argument list terminator 1139 ); 1140 } 1141 break; 1142 } // switch 1143 pr->schedule = schedule; 1144 if ( active ) { 1145 /* The name of this buffer should be my_buffer_index when it's free to use it */ 1146 1147 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n", 1148 gtid, my_buffer_index, sh->buffer_index) ); 1149 __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 > 1150 USE_ITT_BUILD_ARG( NULL ) 1151 ); 1152 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are 1153 // *always* 32-bit integers. 1154 KMP_MB(); /* is this necessary? */ 1155 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n", 1156 gtid, my_buffer_index, sh->buffer_index) ); 1157 1158 th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr; 1159 th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh; 1160 #if USE_ITT_BUILD 1161 if ( pr->ordered ) { 1162 __kmp_itt_ordered_init( gtid ); 1163 }; // if 1164 // Report loop metadata 1165 if ( itt_need_metadata_reporting ) { 1166 // Only report metadata by master of active team at level 1 1167 kmp_uint64 schedtype = 0; 1168 switch ( schedule ) { 1169 case kmp_sch_static_chunked: 1170 case kmp_sch_static_balanced:// Chunk is calculated in the switch above 1171 break; 1172 case kmp_sch_static_greedy: 1173 cur_chunk = pr->u.p.parm1; 1174 break; 1175 case kmp_sch_dynamic_chunked: 1176 schedtype = 1; 1177 break; 1178 case kmp_sch_guided_iterative_chunked: 1179 case kmp_sch_guided_analytical_chunked: 1180 schedtype = 2; 1181 break; 1182 default: 1183 // Should we put this case under "static"? 
1184 // case kmp_sch_static_steal: 1185 schedtype = 3; 1186 break; 1187 } 1188 __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk); 1189 } 1190 #endif /* USE_ITT_BUILD */ 1191 }; // if 1192 1193 #ifdef KMP_DEBUG 1194 { 1195 const char * buff; 1196 // create format specifiers before the debug output 1197 buff = __kmp_str_format( 1198 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \ 1199 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \ 1200 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n", 1201 traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec, 1202 traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec, 1203 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec, 1204 traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec ); 1205 KD_TRACE(10, ( buff, 1206 gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub, 1207 pr->u.p.st, pr->u.p.tc, pr->u.p.count, 1208 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1, 1209 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) ); 1210 __kmp_str_free( &buff ); 1211 } 1212 #endif 1213 #if ( KMP_STATIC_STEAL_ENABLED ) 1214 // It cannot be guaranteed that after execution of a loop with some other schedule kind 1215 // all the parm3 variables will contain the same value. 1216 // Even if all parm3 will be the same, it still exists a bad case like using 0 and 1 1217 // rather than program life-time increment. 1218 // So the dedicated variable is required. The 'static_steal_counter' is used. 1219 if( schedule == kmp_sch_static_steal ) { 1220 // Other threads will inspect this variable when searching for a victim. 1221 // This is a flag showing that other threads may steal from this thread since then. 1222 volatile T * p = &pr->u.p.static_steal_counter; 1223 *p = *p + 1; 1224 } 1225 #endif // ( KMP_STATIC_STEAL_ENABLED ) 1226 1227 #if OMPT_SUPPORT && OMPT_TRACE 1228 if (ompt_enabled && 1229 ompt_callbacks.ompt_callback(ompt_event_loop_begin)) { 1230 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 1231 ompt_task_info_t *task_info = __ompt_get_taskinfo(0); 1232 ompt_callbacks.ompt_callback(ompt_event_loop_begin)( 1233 team_info->parallel_id, task_info->task_id, team_info->microtask); 1234 } 1235 #endif 1236 } 1237 1238 /* 1239 * For ordered loops, either __kmp_dispatch_finish() should be called after 1240 * every iteration, or __kmp_dispatch_finish_chunk() should be called after 1241 * every chunk of iterations. If the ordered section(s) were not executed 1242 * for this iteration (or every iteration in this chunk), we need to set the 1243 * ordered iteration counters so that the next thread can proceed. 1244 */ 1245 template< typename UT > 1246 static void 1247 __kmp_dispatch_finish( int gtid, ident_t *loc ) 1248 { 1249 typedef typename traits_t< UT >::signed_t ST; 1250 kmp_info_t *th = __kmp_threads[ gtid ]; 1251 1252 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) ); 1253 if ( ! 
th -> th.th_team -> t.t_serialized ) { 1254 1255 dispatch_private_info_template< UT > * pr = 1256 reinterpret_cast< dispatch_private_info_template< UT >* > 1257 ( th->th.th_dispatch->th_dispatch_pr_current ); 1258 dispatch_shared_info_template< UT > volatile * sh = 1259 reinterpret_cast< dispatch_shared_info_template< UT >volatile* > 1260 ( th->th.th_dispatch->th_dispatch_sh_current ); 1261 KMP_DEBUG_ASSERT( pr ); 1262 KMP_DEBUG_ASSERT( sh ); 1263 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1264 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1265 1266 if ( pr->ordered_bumped ) { 1267 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1268 gtid ) ); 1269 pr->ordered_bumped = 0; 1270 } else { 1271 UT lower = pr->u.p.ordered_lower; 1272 1273 #ifdef KMP_DEBUG 1274 { 1275 const char * buff; 1276 // create format specifiers before the debug output 1277 buff = __kmp_str_format( 1278 "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n", 1279 traits_t< UT >::spec, traits_t< UT >::spec ); 1280 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 1281 __kmp_str_free( &buff ); 1282 } 1283 #endif 1284 1285 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 1286 USE_ITT_BUILD_ARG(NULL) 1287 ); 1288 KMP_MB(); /* is this necessary? */ 1289 #ifdef KMP_DEBUG 1290 { 1291 const char * buff; 1292 // create format specifiers before the debug output 1293 buff = __kmp_str_format( 1294 "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n", 1295 traits_t< UT >::spec, traits_t< UT >::spec ); 1296 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 1297 __kmp_str_free( &buff ); 1298 } 1299 #endif 1300 1301 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration ); 1302 } // if 1303 } // if 1304 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) ); 1305 } 1306 1307 #ifdef KMP_GOMP_COMPAT 1308 1309 template< typename UT > 1310 static void 1311 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc ) 1312 { 1313 typedef typename traits_t< UT >::signed_t ST; 1314 kmp_info_t *th = __kmp_threads[ gtid ]; 1315 1316 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) ); 1317 if ( ! 
th -> th.th_team -> t.t_serialized ) { 1318 // int cid; 1319 dispatch_private_info_template< UT > * pr = 1320 reinterpret_cast< dispatch_private_info_template< UT >* > 1321 ( th->th.th_dispatch->th_dispatch_pr_current ); 1322 dispatch_shared_info_template< UT > volatile * sh = 1323 reinterpret_cast< dispatch_shared_info_template< UT >volatile* > 1324 ( th->th.th_dispatch->th_dispatch_sh_current ); 1325 KMP_DEBUG_ASSERT( pr ); 1326 KMP_DEBUG_ASSERT( sh ); 1327 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1328 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1329 1330 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) { 1331 UT lower = pr->u.p.ordered_lower; 1332 UT upper = pr->u.p.ordered_upper; 1333 UT inc = upper - lower + 1; 1334 1335 if ( pr->ordered_bumped == inc ) { 1336 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1337 gtid ) ); 1338 pr->ordered_bumped = 0; 1339 } else { 1340 inc -= pr->ordered_bumped; 1341 1342 #ifdef KMP_DEBUG 1343 { 1344 const char * buff; 1345 // create format specifiers before the debug output 1346 buff = __kmp_str_format( 1347 "__kmp_dispatch_finish_chunk: T#%%d before wait: " \ 1348 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n", 1349 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec ); 1350 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) ); 1351 __kmp_str_free( &buff ); 1352 } 1353 #endif 1354 1355 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 1356 USE_ITT_BUILD_ARG(NULL) 1357 ); 1358 1359 KMP_MB(); /* is this necessary? */ 1360 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n", 1361 gtid ) ); 1362 pr->ordered_bumped = 0; 1363 //!!!!! TODO check if the inc should be unsigned, or signed??? 1364 #ifdef KMP_DEBUG 1365 { 1366 const char * buff; 1367 // create format specifiers before the debug output 1368 buff = __kmp_str_format( 1369 "__kmp_dispatch_finish_chunk: T#%%d after wait: " \ 1370 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n", 1371 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec ); 1372 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) ); 1373 __kmp_str_free( &buff ); 1374 } 1375 #endif 1376 1377 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc); 1378 } 1379 // } 1380 } 1381 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) ); 1382 } 1383 1384 #endif /* KMP_GOMP_COMPAT */ 1385 1386 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 1387 * (no more work), then tell OMPT the loop is over. In some cases 1388 * kmp_dispatch_fini() is not called. 
 */
#if OMPT_SUPPORT && OMPT_TRACE
#define OMPT_LOOP_END                                                        \
    if (status == 0) {                                                       \
        if (ompt_enabled &&                                                  \
            ompt_callbacks.ompt_callback(ompt_event_loop_end)) {             \
            ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);      \
            ompt_task_info_t *task_info = __ompt_get_taskinfo(0);            \
            ompt_callbacks.ompt_callback(ompt_event_loop_end)(               \
                team_info->parallel_id, task_info->task_id);                 \
        }                                                                    \
    }
#else
#define OMPT_LOOP_END // no-op
#endif

template< typename T >
static int
__kmp_dispatch_next(
    ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
) {

    typedef typename traits_t< T >::unsigned_t  UT;
    typedef typename traits_t< T >::signed_t    ST;
    typedef typename traits_t< T >::floating_t  DBL;

    // This is potentially slightly misleading: schedule(runtime) will appear here even if the actual runtime schedule
    // is static. (Which points out a disadvantage of schedule(runtime): even when static scheduling is used it costs
    // more than a compile-time choice to use static scheduling would.)
    KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling);

    int status;
    dispatch_private_info_template< T > * pr;
    kmp_info_t * th   = __kmp_threads[ gtid ];
    kmp_team_t * team = th -> th.th_team;

    KMP_DEBUG_ASSERT( p_lb && p_ub && p_st ); // AC: these cannot be NULL
#ifdef KMP_DEBUG
    {
        const char * buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
            traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
        KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ?
*p_st : 0, p_last ) ); 1433 __kmp_str_free( &buff ); 1434 } 1435 #endif 1436 1437 if ( team -> t.t_serialized ) { 1438 /* NOTE: serialize this dispatch becase we are not at the active level */ 1439 pr = reinterpret_cast< dispatch_private_info_template< T >* > 1440 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */ 1441 KMP_DEBUG_ASSERT( pr ); 1442 1443 if ( (status = (pr->u.p.tc != 0)) == 0 ) { 1444 *p_lb = 0; 1445 *p_ub = 0; 1446 // if ( p_last != NULL ) 1447 // *p_last = 0; 1448 if ( p_st != NULL ) 1449 *p_st = 0; 1450 if ( __kmp_env_consistency_check ) { 1451 if ( pr->pushed_ws != ct_none ) { 1452 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 1453 } 1454 } 1455 } else if ( pr->nomerge ) { 1456 kmp_int32 last; 1457 T start; 1458 UT limit, trip, init; 1459 ST incr; 1460 T chunk = pr->u.p.parm1; 1461 1462 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) ); 1463 1464 init = chunk * pr->u.p.count++; 1465 trip = pr->u.p.tc - 1; 1466 1467 if ( (status = (init <= trip)) == 0 ) { 1468 *p_lb = 0; 1469 *p_ub = 0; 1470 // if ( p_last != NULL ) 1471 // *p_last = 0; 1472 if ( p_st != NULL ) 1473 *p_st = 0; 1474 if ( __kmp_env_consistency_check ) { 1475 if ( pr->pushed_ws != ct_none ) { 1476 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 1477 } 1478 } 1479 } else { 1480 start = pr->u.p.lb; 1481 limit = chunk + init - 1; 1482 incr = pr->u.p.st; 1483 1484 if ( (last = (limit >= trip)) != 0 ) { 1485 limit = trip; 1486 #if KMP_OS_WINDOWS 1487 pr->u.p.last_upper = pr->u.p.ub; 1488 #endif /* KMP_OS_WINDOWS */ 1489 } 1490 if ( p_last != NULL ) 1491 *p_last = last; 1492 if ( p_st != NULL ) 1493 *p_st = incr; 1494 if ( incr == 1 ) { 1495 *p_lb = start + init; 1496 *p_ub = start + limit; 1497 } else { 1498 *p_lb = start + init * incr; 1499 *p_ub = start + limit * incr; 1500 } 1501 1502 if ( pr->ordered ) { 1503 pr->u.p.ordered_lower = init; 1504 pr->u.p.ordered_upper = limit; 1505 #ifdef KMP_DEBUG 1506 { 1507 const char * buff; 1508 // create format specifiers before the debug output 1509 buff = __kmp_str_format( 1510 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1511 traits_t< UT >::spec, traits_t< UT >::spec ); 1512 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1513 __kmp_str_free( &buff ); 1514 } 1515 #endif 1516 } // if 1517 } // if 1518 } else { 1519 pr->u.p.tc = 0; 1520 *p_lb = pr->u.p.lb; 1521 *p_ub = pr->u.p.ub; 1522 #if KMP_OS_WINDOWS 1523 pr->u.p.last_upper = *p_ub; 1524 #endif /* KMP_OS_WINDOWS */ 1525 if ( p_last != NULL ) 1526 *p_last = TRUE; 1527 if ( p_st != NULL ) 1528 *p_st = pr->u.p.st; 1529 } // if 1530 #ifdef KMP_DEBUG 1531 { 1532 const char * buff; 1533 // create format specifiers before the debug output 1534 buff = __kmp_str_format( 1535 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \ 1536 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", 1537 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 1538 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) ); 1539 __kmp_str_free( &buff ); 1540 } 1541 #endif 1542 #if INCLUDE_SSC_MARKS 1543 SSC_MARK_DISPATCH_NEXT(); 1544 #endif 1545 OMPT_LOOP_END; 1546 return status; 1547 } else { 1548 kmp_int32 last = 0; 1549 dispatch_shared_info_template< UT > *sh; 1550 T start; 1551 ST incr; 1552 UT limit, trip, init; 1553 1554 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1555 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1556 1557 pr = reinterpret_cast< 
dispatch_private_info_template< T >* > 1558 ( th->th.th_dispatch->th_dispatch_pr_current ); 1559 KMP_DEBUG_ASSERT( pr ); 1560 sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 1561 ( th->th.th_dispatch->th_dispatch_sh_current ); 1562 KMP_DEBUG_ASSERT( sh ); 1563 1564 if ( pr->u.p.tc == 0 ) { 1565 // zero trip count 1566 status = 0; 1567 } else { 1568 switch (pr->schedule) { 1569 #if ( KMP_STATIC_STEAL_ENABLED ) 1570 case kmp_sch_static_steal: 1571 { 1572 T chunk = pr->u.p.parm1; 1573 int nproc = th->th.th_team_nproc; 1574 1575 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) ); 1576 1577 trip = pr->u.p.tc - 1; 1578 1579 if ( traits_t<T>::type_size > 4 ) { 1580 // use lock for 8-byte and CAS for 4-byte induction 1581 // variable. TODO (optional): check and use 16-byte CAS 1582 kmp_lock_t * lck = th->th.th_dispatch->th_steal_lock; 1583 KMP_DEBUG_ASSERT(lck != NULL); 1584 if( pr->u.p.count < (UT)pr->u.p.ub ) { 1585 __kmp_acquire_lock(lck, gtid); 1586 // try to get own chunk of iterations 1587 init = ( pr->u.p.count )++; 1588 status = ( init < (UT)pr->u.p.ub ); 1589 __kmp_release_lock(lck, gtid); 1590 } else { 1591 status = 0; // no own chunks 1592 } 1593 if( !status ) { // try to steal 1594 kmp_info_t **other_threads = team->t.t_threads; 1595 int while_limit = nproc; // nproc attempts to find a victim 1596 int while_index = 0; 1597 // TODO: algorithm of searching for a victim 1598 // should be cleaned up and measured 1599 while ( ( !status ) && ( while_limit != ++while_index ) ) { 1600 T remaining; 1601 T victimIdx = pr->u.p.parm4; 1602 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1; 1603 dispatch_private_info_template< T > * victim = 1604 reinterpret_cast< dispatch_private_info_template< T >* > 1605 (other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current); 1606 while( ( victim == NULL || victim == pr || 1607 ( *(volatile T*)&victim->u.p.static_steal_counter != 1608 *(volatile T*)&pr->u.p.static_steal_counter ) ) && 1609 oldVictimIdx != victimIdx ) 1610 { 1611 victimIdx = (victimIdx + 1) % nproc; 1612 victim = reinterpret_cast< dispatch_private_info_template< T >* > 1613 (other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current); 1614 }; 1615 if( !victim || 1616 ( *(volatile T *)&victim->u.p.static_steal_counter != 1617 *(volatile T *)&pr->u.p.static_steal_counter ) ) 1618 { 1619 continue; // try once more (nproc attempts in total) 1620 // no victim is ready yet to participate in stealing 1621 // because all victims are still in kmp_init_dispatch 1622 } 1623 if( victim->u.p.count + 2 > (UT)victim->u.p.ub ) { 1624 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid 1625 continue; // not enough chunks to steal, goto next victim 1626 } 1627 1628 lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock; 1629 KMP_ASSERT(lck != NULL); 1630 __kmp_acquire_lock(lck, gtid); 1631 limit = victim->u.p.ub; // keep initial ub 1632 if( victim->u.p.count >= limit || 1633 (remaining = limit - victim->u.p.count) < 2 ) 1634 { 1635 __kmp_release_lock(lck, gtid); 1636 pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim 1637 continue; // not enough chunks to steal 1638 } 1639 // stealing succeded, reduce victim's ub by 1/4 of undone chunks or by 1 1640 if( remaining > 3 ) { 1641 KMP_COUNT_VALUE(FOR_static_steal_stolen, remaining>>2); 1642 init = ( victim->u.p.ub -= (remaining>>2) ); // steal 1/4 of remaining 1643 } else { 1644 KMP_COUNT_VALUE(FOR_static_steal_stolen, 1); 1645 init = ( victim->u.p.ub -= 1 ); // steal 1 chunk of 2 or 
3 remaining 1646 } 1647 __kmp_release_lock(lck, gtid); 1648 1649 KMP_DEBUG_ASSERT(init + 1 <= limit); 1650 pr->u.p.parm4 = victimIdx; // remember victim to steal from 1651 status = 1; 1652 while_index = 0; 1653 // now update own count and ub with stolen range but init chunk 1654 __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid); 1655 pr->u.p.count = init + 1; 1656 pr->u.p.ub = limit; 1657 __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid); 1658 } // while (search for victim) 1659 } // if (try to find victim and steal) 1660 } else { 1661 // 4-byte induction variable, use 8-byte CAS for pair (count, ub) 1662 typedef union { 1663 struct { 1664 UT count; 1665 T ub; 1666 } p; 1667 kmp_int64 b; 1668 } union_i4; 1669 // All operations on 'count' or 'ub' must be combined atomically together. 1670 { 1671 union_i4 vold, vnew; 1672 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count); 1673 vnew = vold; 1674 vnew.p.count++; 1675 while( ! KMP_COMPARE_AND_STORE_ACQ64( 1676 ( volatile kmp_int64* )&pr->u.p.count, 1677 *VOLATILE_CAST(kmp_int64 *)&vold.b, 1678 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) { 1679 KMP_CPU_PAUSE(); 1680 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count); 1681 vnew = vold; 1682 vnew.p.count++; 1683 } 1684 vnew = vold; 1685 init = vnew.p.count; 1686 status = ( init < (UT)vnew.p.ub ) ; 1687 } 1688 1689 if( !status ) { 1690 kmp_info_t **other_threads = team->t.t_threads; 1691 int while_limit = nproc; // nproc attempts to find a victim 1692 int while_index = 0; 1693 1694 // TODO: algorithm of searching for a victim 1695 // should be cleaned up and measured 1696 while ( ( !status ) && ( while_limit != ++while_index ) ) { 1697 union_i4 vold, vnew; 1698 kmp_int32 remaining; 1699 T victimIdx = pr->u.p.parm4; 1700 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1; 1701 dispatch_private_info_template< T > * victim = 1702 reinterpret_cast< dispatch_private_info_template< T >* > 1703 (other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current); 1704 while( (victim == NULL || victim == pr || 1705 (*(volatile T*)&victim->u.p.static_steal_counter != 1706 *(volatile T*)&pr->u.p.static_steal_counter)) && 1707 oldVictimIdx != victimIdx ) 1708 { 1709 victimIdx = (victimIdx + 1) % nproc; 1710 victim = reinterpret_cast< dispatch_private_info_template< T >* > 1711 ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current ); 1712 }; 1713 if( !victim || 1714 ( *(volatile T *)&victim->u.p.static_steal_counter != 1715 *(volatile T *)&pr->u.p.static_steal_counter ) ) 1716 { 1717 continue; // try once more (nproc attempts in total) 1718 // no victim is ready yet to participate in stealing 1719 // because all victims are still in kmp_init_dispatch 1720 } 1721 pr->u.p.parm4 = victimIdx; // new victim found 1722 while( 1 ) { // CAS loop if victim has enough chunks to steal 1723 vold.b = *( volatile kmp_int64 * )( &victim->u.p.count ); 1724 vnew = vold; 1725 1726 KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip ); 1727 if ( vnew.p.count >= (UT)vnew.p.ub || 1728 (remaining = vnew.p.ub - vnew.p.count) < 2 ) 1729 { 1730 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id 1731 break; // not enough chunks to steal, goto next victim 1732 } 1733 if( remaining > 3 ) { 1734 vnew.p.ub -= (remaining>>2); // try to steal 1/4 of remaining 1735 } else { 1736 vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining 1737 } 1738 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1739 // TODO: Should this be acquire or release? 
1740 if ( KMP_COMPARE_AND_STORE_ACQ64(
1741 ( volatile kmp_int64 * )&victim->u.p.count,
1742 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1743 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1744 // stealing succeeded
1745 KMP_COUNT_VALUE(FOR_static_steal_stolen, vold.p.ub-vnew.p.ub);
1746 status = 1;
1747 while_index = 0;
1748 // now update own count and ub
1749 init = vnew.p.ub;
1750 vold.p.count = init + 1;
1751 #if KMP_ARCH_X86
1752 KMP_XCHG_FIXED64(( volatile kmp_int64 * )(&pr->u.p.count), vold.b);
1753 #else
1754 *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1755 #endif
1756 break;
1757 } // if (check CAS result)
1758 KMP_CPU_PAUSE(); // CAS failed, repeat attempt
1759 } // while (try to steal from particular victim)
1760 } // while (search for victim)
1761 } // if (try to find victim and steal)
1762 } // if (4-byte induction variable)
1763 if ( !status ) {
1764 *p_lb = 0;
1765 *p_ub = 0;
1766 if ( p_st != NULL ) *p_st = 0;
1767 } else {
1768 start = pr->u.p.parm2;
1769 init *= chunk;
1770 limit = chunk + init - 1;
1771 incr = pr->u.p.st;
1772 KMP_COUNT_VALUE(FOR_static_steal_chunks, 1);
1773
1774 KMP_DEBUG_ASSERT(init <= trip);
1775 if ( (last = (limit >= trip)) != 0 )
1776 limit = trip;
1777 if ( p_st != NULL ) *p_st = incr;
1778
1779 if ( incr == 1 ) {
1780 *p_lb = start + init;
1781 *p_ub = start + limit;
1782 } else {
1783 *p_lb = start + init * incr;
1784 *p_ub = start + limit * incr;
1785 }
1786
1787 if ( pr->ordered ) {
1788 pr->u.p.ordered_lower = init;
1789 pr->u.p.ordered_upper = limit;
1790 #ifdef KMP_DEBUG
1791 {
1792 const char * buff;
1793 // create format specifiers before the debug output
1794 buff = __kmp_str_format(
1795 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1796 traits_t< UT >::spec, traits_t< UT >::spec );
1797 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1798 __kmp_str_free( &buff );
1799 }
1800 #endif
1801 } // if
1802 } // if
1803 break;
1804 } // case
1805 #endif // ( KMP_STATIC_STEAL_ENABLED )
1806 case kmp_sch_static_balanced:
1807 {
1808 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1809 if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */
1810 pr->u.p.count = 1;
1811 *p_lb = pr->u.p.lb;
1812 *p_ub = pr->u.p.ub;
1813 last = pr->u.p.parm1;
1814 if ( p_st != NULL )
1815 *p_st = pr->u.p.st;
1816 } else { /* no iterations to do */
1817 pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1818 }
1819 if ( pr->ordered ) {
1820 #ifdef KMP_DEBUG
1821 {
1822 const char * buff;
1823 // create format specifiers before the debug output
1824 buff = __kmp_str_format(
1825 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1826 traits_t< UT >::spec, traits_t< UT >::spec );
1827 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1828 __kmp_str_free( &buff );
1829 }
1830 #endif
1831 } // if
1832 } // case
1833 break;
1834 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */
1835 case kmp_sch_static_chunked:
1836 {
1837 T parm1;
1838
1839 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
1840 gtid ) );
1841 parm1 = pr->u.p.parm1;
1842
1843 trip = pr->u.p.tc - 1;
1844 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1845
1846 if ( (status = (init <= trip)) != 0 ) {
1847 start = pr->u.p.lb;
1848 incr = pr->u.p.st;
1849 limit = parm1 + init - 1;
1850
1851 if ( (last = (limit >= trip)) != 0 )
1852 limit = trip;
1853
1854 if ( p_st != NULL
) *p_st = incr;
1855
1856 pr->u.p.count += th->th.th_team_nproc;
1857
1858 if ( incr == 1 ) {
1859 *p_lb = start + init;
1860 *p_ub = start + limit;
1861 }
1862 else {
1863 *p_lb = start + init * incr;
1864 *p_ub = start + limit * incr;
1865 }
1866
1867 if ( pr->ordered ) {
1868 pr->u.p.ordered_lower = init;
1869 pr->u.p.ordered_upper = limit;
1870 #ifdef KMP_DEBUG
1871 {
1872 const char * buff;
1873 // create format specifiers before the debug output
1874 buff = __kmp_str_format(
1875 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1876 traits_t< UT >::spec, traits_t< UT >::spec );
1877 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1878 __kmp_str_free( &buff );
1879 }
1880 #endif
1881 } // if
1882 } // if
1883 } // case
1884 break;
1885
1886 case kmp_sch_dynamic_chunked:
1887 {
1888 T chunk = pr->u.p.parm1;
1889
1890 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1891 gtid ) );
1892
1893 init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1894 trip = pr->u.p.tc - 1;
1895
1896 if ( (status = (init <= trip)) == 0 ) {
1897 *p_lb = 0;
1898 *p_ub = 0;
1899 if ( p_st != NULL ) *p_st = 0;
1900 } else {
1901 start = pr->u.p.lb;
1902 limit = chunk + init - 1;
1903 incr = pr->u.p.st;
1904
1905 if ( (last = (limit >= trip)) != 0 )
1906 limit = trip;
1907
1908 if ( p_st != NULL ) *p_st = incr;
1909
1910 if ( incr == 1 ) {
1911 *p_lb = start + init;
1912 *p_ub = start + limit;
1913 } else {
1914 *p_lb = start + init * incr;
1915 *p_ub = start + limit * incr;
1916 }
1917
1918 if ( pr->ordered ) {
1919 pr->u.p.ordered_lower = init;
1920 pr->u.p.ordered_upper = limit;
1921 #ifdef KMP_DEBUG
1922 {
1923 const char * buff;
1924 // create format specifiers before the debug output
1925 buff = __kmp_str_format(
1926 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1927 traits_t< UT >::spec, traits_t< UT >::spec );
1928 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1929 __kmp_str_free( &buff );
1930 }
1931 #endif
1932 } // if
1933 } // if
1934 } // case
1935 break;
1936
1937 case kmp_sch_guided_iterative_chunked:
1938 {
1939 T chunkspec = pr->u.p.parm1;
1940 KD_TRACE(100,
1941 ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1942 trip = pr->u.p.tc;
1943 // Start atomic part of calculations
1944 while(1) {
1945 ST remaining; // signed, because can be < 0
1946 init = sh->u.s.iteration; // shared value
1947 remaining = trip - init;
1948 if ( remaining <= 0 ) { // AC: need to compare with 0 first
1949 // nothing to do, don't try atomic op
1950 status = 0;
1951 break;
1952 }
1953 if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
1954 // use dynamic-style schedule
1955 // atomically increment iterations, get old value
1956 init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1957 remaining = trip - init;
1958 if (remaining <= 0) {
1959 status = 0; // all iterations were taken by other threads
1960 } else {
1961 // got some iterations to work on
1962 status = 1;
1963 if ( (T)remaining > chunkspec ) {
1964 limit = init + chunkspec - 1;
1965 } else {
1966 last = 1; // the last chunk
1967 limit = init + remaining - 1;
1968 } // if
1969 } // if
1970 break;
1971 } // if
1972 limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1973 if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1974 // CAS was successful, chunk obtained
1975 status
= 1; 1976 --limit; 1977 break; 1978 } // if 1979 } // while 1980 if ( status != 0 ) { 1981 start = pr->u.p.lb; 1982 incr = pr->u.p.st; 1983 if ( p_st != NULL ) 1984 *p_st = incr; 1985 *p_lb = start + init * incr; 1986 *p_ub = start + limit * incr; 1987 if ( pr->ordered ) { 1988 pr->u.p.ordered_lower = init; 1989 pr->u.p.ordered_upper = limit; 1990 #ifdef KMP_DEBUG 1991 { 1992 const char * buff; 1993 // create format specifiers before the debug output 1994 buff = __kmp_str_format( 1995 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1996 traits_t< UT >::spec, traits_t< UT >::spec ); 1997 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1998 __kmp_str_free( &buff ); 1999 } 2000 #endif 2001 } // if 2002 } else { 2003 *p_lb = 0; 2004 *p_ub = 0; 2005 if ( p_st != NULL ) 2006 *p_st = 0; 2007 } // if 2008 } // case 2009 break; 2010 2011 case kmp_sch_guided_analytical_chunked: 2012 { 2013 T chunkspec = pr->u.p.parm1; 2014 UT chunkIdx; 2015 #if KMP_OS_WINDOWS && KMP_ARCH_X86 2016 /* for storing original FPCW value for Windows* OS on 2017 IA-32 architecture 8-byte version */ 2018 unsigned int oldFpcw; 2019 unsigned int fpcwSet = 0; 2020 #endif 2021 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n", 2022 gtid ) ); 2023 2024 trip = pr->u.p.tc; 2025 2026 KMP_DEBUG_ASSERT(th->th.th_team_nproc > 1); 2027 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)th->th.th_team_nproc < trip); 2028 2029 while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */ 2030 chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration ); 2031 if ( chunkIdx >= (UT)pr->u.p.parm2 ) { 2032 --trip; 2033 /* use dynamic-style scheduling */ 2034 init = chunkIdx * chunkspec + pr->u.p.count; 2035 /* need to verify init > 0 in case of overflow in the above calculation */ 2036 if ( (status = (init > 0 && init <= trip)) != 0 ) { 2037 limit = init + chunkspec -1; 2038 2039 if ( (last = (limit >= trip)) != 0 ) 2040 limit = trip; 2041 } 2042 break; 2043 } else { 2044 /* use exponential-style scheduling */ 2045 /* The following check is to workaround the lack of long double precision on Windows* OS. 2046 This check works around the possible effect that init != 0 for chunkIdx == 0. 
2047 */ 2048 #if KMP_OS_WINDOWS && KMP_ARCH_X86 2049 /* If we haven't already done so, save original 2050 FPCW and set precision to 64-bit, as Windows* OS 2051 on IA-32 architecture defaults to 53-bit */ 2052 if ( !fpcwSet ) { 2053 oldFpcw = _control87(0,0); 2054 _control87(_PC_64,_MCW_PC); 2055 fpcwSet = 0x30000; 2056 } 2057 #endif 2058 if ( chunkIdx ) { 2059 init = __kmp_dispatch_guided_remaining< T >( 2060 trip, *( DBL * )&pr->u.p.parm3, chunkIdx ); 2061 KMP_DEBUG_ASSERT(init); 2062 init = trip - init; 2063 } else 2064 init = 0; 2065 limit = trip - __kmp_dispatch_guided_remaining< T >( 2066 trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 ); 2067 KMP_ASSERT(init <= limit); 2068 if ( init < limit ) { 2069 KMP_DEBUG_ASSERT(limit <= trip); 2070 --limit; 2071 status = 1; 2072 break; 2073 } // if 2074 } // if 2075 } // while (1) 2076 #if KMP_OS_WINDOWS && KMP_ARCH_X86 2077 /* restore FPCW if necessary 2078 AC: check fpcwSet flag first because oldFpcw can be uninitialized here 2079 */ 2080 if ( fpcwSet && ( oldFpcw & fpcwSet ) ) 2081 _control87(oldFpcw,_MCW_PC); 2082 #endif 2083 if ( status != 0 ) { 2084 start = pr->u.p.lb; 2085 incr = pr->u.p.st; 2086 if ( p_st != NULL ) 2087 *p_st = incr; 2088 *p_lb = start + init * incr; 2089 *p_ub = start + limit * incr; 2090 if ( pr->ordered ) { 2091 pr->u.p.ordered_lower = init; 2092 pr->u.p.ordered_upper = limit; 2093 #ifdef KMP_DEBUG 2094 { 2095 const char * buff; 2096 // create format specifiers before the debug output 2097 buff = __kmp_str_format( 2098 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 2099 traits_t< UT >::spec, traits_t< UT >::spec ); 2100 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 2101 __kmp_str_free( &buff ); 2102 } 2103 #endif 2104 } 2105 } else { 2106 *p_lb = 0; 2107 *p_ub = 0; 2108 if ( p_st != NULL ) 2109 *p_st = 0; 2110 } 2111 } // case 2112 break; 2113 2114 case kmp_sch_trapezoidal: 2115 { 2116 UT index; 2117 T parm2 = pr->u.p.parm2; 2118 T parm3 = pr->u.p.parm3; 2119 T parm4 = pr->u.p.parm4; 2120 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n", 2121 gtid ) ); 2122 2123 index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration ); 2124 2125 init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2; 2126 trip = pr->u.p.tc - 1; 2127 2128 if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) { 2129 *p_lb = 0; 2130 *p_ub = 0; 2131 if ( p_st != NULL ) *p_st = 0; 2132 } else { 2133 start = pr->u.p.lb; 2134 limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1; 2135 incr = pr->u.p.st; 2136 2137 if ( (last = (limit >= trip)) != 0 ) 2138 limit = trip; 2139 2140 if ( p_st != NULL ) *p_st = incr; 2141 2142 if ( incr == 1 ) { 2143 *p_lb = start + init; 2144 *p_ub = start + limit; 2145 } else { 2146 *p_lb = start + init * incr; 2147 *p_ub = start + limit * incr; 2148 } 2149 2150 if ( pr->ordered ) { 2151 pr->u.p.ordered_lower = init; 2152 pr->u.p.ordered_upper = limit; 2153 #ifdef KMP_DEBUG 2154 { 2155 const char * buff; 2156 // create format specifiers before the debug output 2157 buff = __kmp_str_format( 2158 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 2159 traits_t< UT >::spec, traits_t< UT >::spec ); 2160 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 2161 __kmp_str_free( &buff ); 2162 } 2163 #endif 2164 } // if 2165 } // if 2166 } // case 2167 break; 2168 default: 2169 { 2170 status = 0; // to avoid complaints on uninitialized variable use 2171 __kmp_msg( 2172 kmp_ms_fatal, // Severity 2173 
KMP_MSG( UnknownSchedTypeDetected ), // Primary message 2174 KMP_HNT( GetNewerLibrary ), // Hint 2175 __kmp_msg_null // Variadic argument list terminator 2176 ); 2177 } 2178 break; 2179 } // switch 2180 } // if tc == 0; 2181 2182 if ( status == 0 ) { 2183 UT num_done; 2184 2185 num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done ); 2186 #ifdef KMP_DEBUG 2187 { 2188 const char * buff; 2189 // create format specifiers before the debug output 2190 buff = __kmp_str_format( 2191 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", 2192 traits_t< UT >::spec ); 2193 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) ); 2194 __kmp_str_free( &buff ); 2195 } 2196 #endif 2197 2198 if ( (ST)num_done == th->th.th_team_nproc - 1 ) { 2199 #if ( KMP_STATIC_STEAL_ENABLED ) 2200 if( pr->schedule == kmp_sch_static_steal && traits_t<T>::type_size > 4 ) { 2201 int i; 2202 kmp_info_t **other_threads = team->t.t_threads; 2203 // loop complete, safe to destroy locks used for stealing 2204 for( i = 0; i < th->th.th_team_nproc; ++i ) { 2205 kmp_lock_t * lck = other_threads[i]->th.th_dispatch->th_steal_lock; 2206 KMP_ASSERT(lck != NULL); 2207 __kmp_destroy_lock( lck ); 2208 __kmp_free( lck ); 2209 other_threads[i]->th.th_dispatch->th_steal_lock = NULL; 2210 } 2211 } 2212 #endif 2213 /* NOTE: release this buffer to be reused */ 2214 2215 KMP_MB(); /* Flush all pending memory write invalidates. */ 2216 2217 sh->u.s.num_done = 0; 2218 sh->u.s.iteration = 0; 2219 2220 /* TODO replace with general release procedure? */ 2221 if ( pr->ordered ) { 2222 sh->u.s.ordered_iteration = 0; 2223 } 2224 2225 KMP_MB(); /* Flush all pending memory write invalidates. */ 2226 2227 sh -> buffer_index += __kmp_dispatch_num_buffers; 2228 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", 2229 gtid, sh->buffer_index) ); 2230 2231 KMP_MB(); /* Flush all pending memory write invalidates. */ 2232 2233 } // if 2234 if ( __kmp_env_consistency_check ) { 2235 if ( pr->pushed_ws != ct_none ) { 2236 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 2237 } 2238 } 2239 2240 th -> th.th_dispatch -> th_deo_fcn = NULL; 2241 th -> th.th_dispatch -> th_dxo_fcn = NULL; 2242 th -> th.th_dispatch -> th_dispatch_sh_current = NULL; 2243 th -> th.th_dispatch -> th_dispatch_pr_current = NULL; 2244 } // if (status == 0) 2245 #if KMP_OS_WINDOWS 2246 else if ( last ) { 2247 pr->u.p.last_upper = pr->u.p.ub; 2248 } 2249 #endif /* KMP_OS_WINDOWS */ 2250 if ( p_last != NULL && status != 0 ) 2251 *p_last = last; 2252 } // if 2253 2254 #ifdef KMP_DEBUG 2255 { 2256 const char * buff; 2257 // create format specifiers before the debug output 2258 buff = __kmp_str_format( 2259 "__kmp_dispatch_next: T#%%d normal case: " \ 2260 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n", 2261 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 2262 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? 
*p_st : 0, p_last, status ) ); 2263 __kmp_str_free( &buff ); 2264 } 2265 #endif 2266 #if INCLUDE_SSC_MARKS 2267 SSC_MARK_DISPATCH_NEXT(); 2268 #endif 2269 OMPT_LOOP_END; 2270 return status; 2271 } 2272 2273 template< typename T > 2274 static void 2275 __kmp_dist_get_bounds( 2276 ident_t *loc, 2277 kmp_int32 gtid, 2278 kmp_int32 *plastiter, 2279 T *plower, 2280 T *pupper, 2281 typename traits_t< T >::signed_t incr 2282 ) { 2283 typedef typename traits_t< T >::unsigned_t UT; 2284 typedef typename traits_t< T >::signed_t ST; 2285 register kmp_uint32 team_id; 2286 register kmp_uint32 nteams; 2287 register UT trip_count; 2288 register kmp_team_t *team; 2289 kmp_info_t * th; 2290 2291 KMP_DEBUG_ASSERT( plastiter && plower && pupper ); 2292 KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid)); 2293 #ifdef KMP_DEBUG 2294 { 2295 const char * buff; 2296 // create format specifiers before the debug output 2297 buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\ 2298 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n", 2299 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec, 2300 traits_t< T >::spec ); 2301 KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) ); 2302 __kmp_str_free( &buff ); 2303 } 2304 #endif 2305 2306 if( __kmp_env_consistency_check ) { 2307 if( incr == 0 ) { 2308 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc ); 2309 } 2310 if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) { 2311 // The loop is illegal. 2312 // Some zero-trip loops maintained by compiler, e.g.: 2313 // for(i=10;i<0;++i) // lower >= upper - run-time check 2314 // for(i=0;i>10;--i) // lower <= upper - run-time check 2315 // for(i=0;i>10;++i) // incr > 0 - compile-time check 2316 // for(i=10;i<0;--i) // incr < 0 - compile-time check 2317 // Compiler does not check the following illegal loops: 2318 // for(i=0;i<10;i+=incr) // where incr<0 2319 // for(i=10;i>0;i-=incr) // where incr<0 2320 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc ); 2321 } 2322 } 2323 th = __kmp_threads[gtid]; 2324 team = th->th.th_team; 2325 #if OMP_40_ENABLED 2326 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct 2327 nteams = th->th.th_teams_size.nteams; 2328 #endif 2329 team_id = team->t.t_master_tid; 2330 KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc); 2331 2332 // compute global trip count 2333 if( incr == 1 ) { 2334 trip_count = *pupper - *plower + 1; 2335 } else if(incr == -1) { 2336 trip_count = *plower - *pupper + 1; 2337 } else if ( incr > 0 ) { 2338 // upper-lower can exceed the limit of signed type 2339 trip_count = (UT)(*pupper - *plower) / incr + 1; 2340 } else { 2341 trip_count = (UT)(*plower - *pupper) / ( -incr ) + 1; 2342 } 2343 2344 if( trip_count <= nteams ) { 2345 KMP_DEBUG_ASSERT( 2346 __kmp_static == kmp_sch_static_greedy || \ 2347 __kmp_static == kmp_sch_static_balanced 2348 ); // Unknown static scheduling type. 2349 // only some teams get single iteration, others get nothing 2350 if( team_id < trip_count ) { 2351 *pupper = *plower = *plower + team_id * incr; 2352 } else { 2353 *plower = *pupper + incr; // zero-trip loop 2354 } 2355 if( plastiter != NULL ) 2356 *plastiter = ( team_id == trip_count - 1 ); 2357 } else { 2358 if( __kmp_static == kmp_sch_static_balanced ) { 2359 register UT chunk = trip_count / nteams; 2360 register UT extras = trip_count % nteams; 2361 *plower += incr * ( team_id * chunk + ( team_id < extras ? 
team_id : extras ) ); 2362 *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr ); 2363 if( plastiter != NULL ) 2364 *plastiter = ( team_id == nteams - 1 ); 2365 } else { 2366 register T chunk_inc_count = 2367 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr; 2368 register T upper = *pupper; 2369 KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy ); 2370 // Unknown static scheduling type. 2371 *plower += team_id * chunk_inc_count; 2372 *pupper = *plower + chunk_inc_count - incr; 2373 // Check/correct bounds if needed 2374 if( incr > 0 ) { 2375 if( *pupper < *plower ) 2376 *pupper = traits_t<T>::max_value; 2377 if( plastiter != NULL ) 2378 *plastiter = *plower <= upper && *pupper > upper - incr; 2379 if( *pupper > upper ) 2380 *pupper = upper; // tracker C73258 2381 } else { 2382 if( *pupper > *plower ) 2383 *pupper = traits_t<T>::min_value; 2384 if( plastiter != NULL ) 2385 *plastiter = *plower >= upper && *pupper < upper - incr; 2386 if( *pupper < upper ) 2387 *pupper = upper; // tracker C73258 2388 } 2389 } 2390 } 2391 } 2392 2393 //----------------------------------------------------------------------------------------- 2394 // Dispatch routines 2395 // Transfer call to template< type T > 2396 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule, 2397 // T lb, T ub, ST st, ST chunk ) 2398 extern "C" { 2399 2400 /*! 2401 @ingroup WORK_SHARING 2402 @{ 2403 @param loc Source location 2404 @param gtid Global thread id 2405 @param schedule Schedule type 2406 @param lb Lower bound 2407 @param ub Upper bound 2408 @param st Step (or increment if you prefer) 2409 @param chunk The chunk size to block with 2410 2411 This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments. 2412 These functions are all identical apart from the types of the arguments. 2413 */ 2414 2415 void 2416 __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2417 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk ) 2418 { 2419 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2420 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2421 } 2422 /*! 2423 See @ref __kmpc_dispatch_init_4 2424 */ 2425 void 2426 __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2427 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk ) 2428 { 2429 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2430 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2431 } 2432 2433 /*! 2434 See @ref __kmpc_dispatch_init_4 2435 */ 2436 void 2437 __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2438 kmp_int64 lb, kmp_int64 ub, 2439 kmp_int64 st, kmp_int64 chunk ) 2440 { 2441 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2442 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2443 } 2444 2445 /*! 2446 See @ref __kmpc_dispatch_init_4 2447 */ 2448 void 2449 __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2450 kmp_uint64 lb, kmp_uint64 ub, 2451 kmp_int64 st, kmp_int64 chunk ) 2452 { 2453 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2454 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2455 } 2456 2457 /*! 2458 See @ref __kmpc_dispatch_init_4 2459 2460 Difference from __kmpc_dispatch_init set of functions is these functions 2461 are called for composite distribute parallel for construct. 
Thus, before
2462 dispatching the regular iterations, we need to compute the per-team iteration space.
2463
2464 These functions are all identical apart from the types of the arguments.
2465 */
2466 void
2467 __kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2468 kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2469 {
2470 KMP_DEBUG_ASSERT( __kmp_init_serial );
2471 __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
2472 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2473 }
2474
2475 void
2476 __kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2477 kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2478 {
2479 KMP_DEBUG_ASSERT( __kmp_init_serial );
2480 __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
2481 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2482 }
2483
2484 void
2485 __kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2486 kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
2487 {
2488 KMP_DEBUG_ASSERT( __kmp_init_serial );
2489 __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
2490 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2491 }
2492
2493 void
2494 __kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2495 kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
2496 {
2497 KMP_DEBUG_ASSERT( __kmp_init_serial );
2498 __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
2499 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2500 }
2501
2502 /*!
2503 @param loc Source code location
2504 @param gtid Global thread id
2505 @param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
2506 @param p_lb Pointer to the lower bound for the next chunk of work
2507 @param p_ub Pointer to the upper bound for the next chunk of work
2508 @param p_st Pointer to the stride for the next chunk of work
2509 @return one if there is work to be done, zero otherwise
2510
2511 Get the next dynamically allocated chunk of work for this thread.
2512 If there is no more work, then lb, ub and stride need not be modified.
2513 */
2514 int
2515 __kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2516 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2517 {
2518 return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2519 }
2520
2521 /*!
2522 See @ref __kmpc_dispatch_next_4
2523 */
2524 int
2525 __kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2526 kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2527 {
2528 return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2529 }
2530
2531 /*!
2532 See @ref __kmpc_dispatch_next_4
2533 */
2534 int
2535 __kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2536 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2537 {
2538 return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2539 }
2540
2541 /*!
2542 See @ref __kmpc_dispatch_next_4 2543 */ 2544 int 2545 __kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2546 kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st ) 2547 { 2548 return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st ); 2549 } 2550 2551 /*! 2552 @param loc Source code location 2553 @param gtid Global thread id 2554 2555 Mark the end of a dynamic loop. 2556 */ 2557 void 2558 __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid ) 2559 { 2560 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc ); 2561 } 2562 2563 /*! 2564 See @ref __kmpc_dispatch_fini_4 2565 */ 2566 void 2567 __kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid ) 2568 { 2569 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc ); 2570 } 2571 2572 /*! 2573 See @ref __kmpc_dispatch_fini_4 2574 */ 2575 void 2576 __kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid ) 2577 { 2578 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc ); 2579 } 2580 2581 /*! 2582 See @ref __kmpc_dispatch_fini_4 2583 */ 2584 void 2585 __kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid ) 2586 { 2587 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc ); 2588 } 2589 /*! @} */ 2590 2591 //----------------------------------------------------------------------------------------- 2592 //Non-template routines from kmp_dispatch.cpp used in other sources 2593 2594 kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) { 2595 return value == checker; 2596 } 2597 2598 kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) { 2599 return value != checker; 2600 } 2601 2602 kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) { 2603 return value < checker; 2604 } 2605 2606 kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) { 2607 return value >= checker; 2608 } 2609 2610 kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) { 2611 return value <= checker; 2612 } 2613 2614 kmp_uint32 2615 __kmp_wait_yield_4(volatile kmp_uint32 * spinner, 2616 kmp_uint32 checker, 2617 kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 ) 2618 , void * obj // Higher-level synchronization object, or NULL. 2619 ) 2620 { 2621 // note: we may not belong to a team at this point 2622 register volatile kmp_uint32 * spin = spinner; 2623 register kmp_uint32 check = checker; 2624 register kmp_uint32 spins; 2625 register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred; 2626 register kmp_uint32 r; 2627 2628 KMP_FSYNC_SPIN_INIT( obj, (void*) spin ); 2629 KMP_INIT_YIELD( spins ); 2630 // main wait spin loop 2631 while(!f(r = TCR_4(*spin), check)) { 2632 KMP_FSYNC_SPIN_PREPARE( obj ); 2633 /* GEH - remove this since it was accidentally introduced when kmp_wait was split. 2634 It causes problems with infinite recursion because of exit lock */ 2635 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 2636 __kmp_abort_thread(); */ 2637 2638 /* if we have waited a bit, or are oversubscribed, yield */ 2639 /* pause is in the following code */ 2640 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc ); 2641 KMP_YIELD_SPIN( spins ); 2642 } 2643 KMP_FSYNC_SPIN_ACQUIRED( obj ); 2644 return r; 2645 } 2646 2647 void 2648 __kmp_wait_yield_4_ptr(void *spinner, 2649 kmp_uint32 checker, 2650 kmp_uint32 (*pred)( void *, kmp_uint32 ), 2651 void *obj // Higher-level synchronization object, or NULL. 
2652 ) 2653 { 2654 // note: we may not belong to a team at this point 2655 register void *spin = spinner; 2656 register kmp_uint32 check = checker; 2657 register kmp_uint32 spins; 2658 register kmp_uint32 (*f) ( void *, kmp_uint32 ) = pred; 2659 2660 KMP_FSYNC_SPIN_INIT( obj, spin ); 2661 KMP_INIT_YIELD( spins ); 2662 // main wait spin loop 2663 while ( !f( spin, check ) ) { 2664 KMP_FSYNC_SPIN_PREPARE( obj ); 2665 /* if we have waited a bit, or are oversubscribed, yield */ 2666 /* pause is in the following code */ 2667 KMP_YIELD( TCR_4( __kmp_nth ) > __kmp_avail_proc ); 2668 KMP_YIELD_SPIN( spins ); 2669 } 2670 KMP_FSYNC_SPIN_ACQUIRED( obj ); 2671 } 2672 2673 } // extern "C" 2674 2675 #ifdef KMP_GOMP_COMPAT 2676 2677 void 2678 __kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2679 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, 2680 kmp_int32 chunk, int push_ws ) 2681 { 2682 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, 2683 push_ws ); 2684 } 2685 2686 void 2687 __kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2688 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, 2689 kmp_int32 chunk, int push_ws ) 2690 { 2691 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, 2692 push_ws ); 2693 } 2694 2695 void 2696 __kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2697 kmp_int64 lb, kmp_int64 ub, kmp_int64 st, 2698 kmp_int64 chunk, int push_ws ) 2699 { 2700 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, 2701 push_ws ); 2702 } 2703 2704 void 2705 __kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2706 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, 2707 kmp_int64 chunk, int push_ws ) 2708 { 2709 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, 2710 push_ws ); 2711 } 2712 2713 void 2714 __kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid ) 2715 { 2716 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc ); 2717 } 2718 2719 void 2720 __kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid ) 2721 { 2722 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc ); 2723 } 2724 2725 void 2726 __kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid ) 2727 { 2728 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc ); 2729 } 2730 2731 void 2732 __kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid ) 2733 { 2734 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc ); 2735 } 2736 2737 #endif /* KMP_GOMP_COMPAT */ 2738 2739 /* ------------------------------------------------------------------------ */ 2740 /* ------------------------------------------------------------------------ */ 2741 2742
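/*
 * Illustrative sketches (not part of the runtime itself): roughly how the entry
 * points defined above may be driven by compiler-generated code. The ident_t
 * object "loc", the bound N, the body() routine and the "flag" variable are
 * hypothetical placeholders; the __kmpc_* / __kmp_* calls and the
 * kmp_sch_dynamic_chunked schedule value match the definitions in this file
 * and in kmp.h.
 *
 * 1) A lowering of "#pragma omp for schedule(dynamic, 4)" over i = 0 .. N-1:
 *
 *      kmp_int32 gtid = __kmpc_global_thread_num( &loc );
 *      kmp_int32 lb = 0, ub = N - 1, st = 1, last = 0;
 *      __kmpc_dispatch_init_4( &loc, gtid, kmp_sch_dynamic_chunked, lb, ub, st, 4 );
 *      while ( __kmpc_dispatch_next_4( &loc, gtid, &last, &lb, &ub, &st ) ) {
 *          for ( kmp_int32 i = lb; i <= ub; i += st )
 *              body( i );                      // user loop body
 *      }
 *      // For an ordered loop the compiler may additionally emit
 *      // __kmpc_dispatch_fini_4( &loc, gtid ) when each chunk completes.
 *
 * 2) Spinning on a flag with the non-template helpers above:
 *
 *      volatile kmp_uint32 flag = 0;           // set to 1 by another thread
 *      __kmp_wait_yield_4( &flag, 0, __kmp_neq_4, NULL ); // wait until flag != 0
 */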