/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 */


//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


/*
 * Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
 *       it may change values between parallel regions.  __kmp_max_nth
 *       is the largest value __kmp_nth may take, 1 is the smallest.
 *
 */

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_str.h"
#include "kmp_error.h"
#include "kmp_stats.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
#include <float.h>
#endif

#if OMPT_SUPPORT
#include "ompt-internal.h"
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

// template for type limits
template< typename T >
struct i_maxmin {
    static const T mx;
    static const T mn;
};
template<>
struct i_maxmin< int > {
    static const int mx = 0x7fffffff;
    static const int mn = 0x80000000;
};
template<>
struct i_maxmin< unsigned int > {
    static const unsigned int mx = 0xffffffff;
    static const unsigned int mn = 0x00000000;
};
template<>
struct i_maxmin< long long > {
    static const long long mx = 0x7fffffffffffffffLL;
    static const long long mn = 0x8000000000000000LL;
};
template<>
struct i_maxmin< unsigned long long > {
    static const unsigned long long mx = 0xffffffffffffffffLL;
    static const unsigned long long mn = 0x0000000000000000LL;
};
//-------------------------------------------------------------------------

#ifdef KMP_STATIC_STEAL_ENABLED

// replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
template< typename T >
struct dispatch_private_infoXX_template {
    typedef typename traits_t< T >::unsigned_t  UT;
    typedef typename traits_t< T >::signed_t    ST;
    UT count;                // unsigned
    T  ub;
    /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
    T  lb;
    ST st;                   // signed
    UT tc;                   // unsigned
    T  static_steal_counter; // for static_steal only; maybe better to put after ub

    /* parm[1-4] are used in different ways by different scheduling algorithms */

    // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
    //    a) parm3 is properly aligned and
    //    b) all parm1-4 are in the same cache line.
    // Because parm1-4 are used together, performance seems to be better
    // if they are in the same line (not measured though).
96 97 struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4 98 T parm1; 99 T parm2; 100 T parm3; 101 T parm4; 102 }; 103 104 UT ordered_lower; // unsigned 105 UT ordered_upper; // unsigned 106 #if KMP_OS_WINDOWS 107 T last_upper; 108 #endif /* KMP_OS_WINDOWS */ 109 }; 110 111 #else /* KMP_STATIC_STEAL_ENABLED */ 112 113 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types 114 template< typename T > 115 struct dispatch_private_infoXX_template { 116 typedef typename traits_t< T >::unsigned_t UT; 117 typedef typename traits_t< T >::signed_t ST; 118 T lb; 119 T ub; 120 ST st; // signed 121 UT tc; // unsigned 122 123 T parm1; 124 T parm2; 125 T parm3; 126 T parm4; 127 128 UT count; // unsigned 129 130 UT ordered_lower; // unsigned 131 UT ordered_upper; // unsigned 132 #if KMP_OS_WINDOWS 133 T last_upper; 134 #endif /* KMP_OS_WINDOWS */ 135 }; 136 137 #endif /* KMP_STATIC_STEAL_ENABLED */ 138 139 // replaces dispatch_private_info structure and dispatch_private_info_t type 140 template< typename T > 141 struct KMP_ALIGN_CACHE dispatch_private_info_template { 142 // duplicate alignment here, otherwise size of structure is not correct in our compiler 143 union KMP_ALIGN_CACHE private_info_tmpl { 144 dispatch_private_infoXX_template< T > p; 145 dispatch_private_info64_t p64; 146 } u; 147 enum sched_type schedule; /* scheduling algorithm */ 148 kmp_uint32 ordered; /* ordered clause specified */ 149 kmp_uint32 ordered_bumped; 150 kmp_int32 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making order 151 dispatch_private_info * next; /* stack of buffers for nest of serial regions */ 152 kmp_uint32 nomerge; /* don't merge iters if serialized */ 153 kmp_uint32 type_size; 154 enum cons_type pushed_ws; 155 }; 156 157 158 // replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types 159 template< typename UT > 160 struct dispatch_shared_infoXX_template { 161 /* chunk index under dynamic, number of idle threads under static-steal; 162 iteration index otherwise */ 163 volatile UT iteration; 164 volatile UT num_done; 165 volatile UT ordered_iteration; 166 UT ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size making ordered_iteration scalar 167 }; 168 169 // replaces dispatch_shared_info structure and dispatch_shared_info_t type 170 template< typename UT > 171 struct dispatch_shared_info_template { 172 // we need union here to keep the structure size 173 union shared_info_tmpl { 174 dispatch_shared_infoXX_template< UT > s; 175 dispatch_shared_info64_t s64; 176 } u; 177 volatile kmp_uint32 buffer_index; 178 #if OMP_41_ENABLED 179 volatile kmp_int32 doacross_buf_idx; // teamwise index 180 kmp_uint32 *doacross_flags; // array of iteration flags (0/1) 181 kmp_int32 doacross_num_done; // count finished threads 182 #endif 183 }; 184 185 /* ------------------------------------------------------------------------ */ 186 /* ------------------------------------------------------------------------ */ 187 188 #undef USE_TEST_LOCKS 189 190 // test_then_add template (general template should NOT be used) 191 template< typename T > 192 static __forceinline T 193 test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); }; 194 195 template<> 196 __forceinline kmp_int32 197 test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d ) 198 { 199 kmp_int32 r; 200 r = KMP_TEST_THEN_ADD32( p, d ); 201 return r; 202 } 203 204 template<> 205 __forceinline kmp_int64 206 test_then_add< kmp_int64 >( volatile kmp_int64 *p, 
                              kmp_int64 d )
{
    kmp_int64 r;
    r = KMP_TEST_THEN_ADD64( p, d );
    return r;
}

// test_then_inc_acq template (general template should NOT be used)
template< typename T >
static __forceinline T
test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };

template<>
__forceinline kmp_int32
test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
{
    kmp_int32 r;
    r = KMP_TEST_THEN_INC_ACQ32( p );
    return r;
}

template<>
__forceinline kmp_int64
test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
{
    kmp_int64 r;
    r = KMP_TEST_THEN_INC_ACQ64( p );
    return r;
}

// test_then_inc template (general template should NOT be used)
template< typename T >
static __forceinline T
test_then_inc( volatile T *p ) { KMP_ASSERT(0); };

template<>
__forceinline kmp_int32
test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
{
    kmp_int32 r;
    r = KMP_TEST_THEN_INC32( p );
    return r;
}

template<>
__forceinline kmp_int64
test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
{
    kmp_int64 r;
    r = KMP_TEST_THEN_INC64( p );
    return r;
}

// compare_and_swap template (general template should NOT be used)
template< typename T >
static __forceinline kmp_int32
compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };

template<>
__forceinline kmp_int32
compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
{
    return KMP_COMPARE_AND_STORE_REL32( p, c, s );
}

template<>
__forceinline kmp_int32
compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
{
    return KMP_COMPARE_AND_STORE_REL64( p, c, s );
}

/*
    Spin wait loop that first does pause, then yield.
    Waits until function returns non-zero when called with *spinner and check.
    Does NOT put threads to sleep.
#if USE_ITT_BUILD
    Arguments:
        obj -- is higher-level synchronization object to report to ittnotify. It is used to report
            locks consistently. For example, if lock is acquired immediately, its address is
            reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
            immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the same
            address, not an address of the low-level spinner.
#endif // USE_ITT_BUILD
*/
template< typename UT >
// ToDo: make inline function (move to header file for icl)
static UT  // unsigned 4- or 8-byte type
__kmp_wait_yield( volatile UT * spinner,
                  UT            checker,
                  kmp_uint32 (* pred)( UT, UT )
                  USE_ITT_BUILD_ARG(void * obj)    // Higher-level synchronization object, or NULL.
                  )
{
    // note: we may not belong to a team at this point
    register volatile UT         * spin  = spinner;
    register          UT           check = checker;
    register          kmp_uint32   spins;
    register          kmp_uint32 (*f) ( UT, UT ) = pred;
    register          UT           r;

    KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
    KMP_INIT_YIELD( spins );
    // main wait spin loop
    while(!f(r = *spin, check))
    {
        KMP_FSYNC_SPIN_PREPARE( obj );
        /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
314 It causes problems with infinite recursion because of exit lock */ 315 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 316 __kmp_abort_thread(); */ 317 318 // if we are oversubscribed, 319 // or have waited a bit (and KMP_LIBRARY=throughput, then yield 320 // pause is in the following code 321 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc ); 322 KMP_YIELD_SPIN( spins ); 323 } 324 KMP_FSYNC_SPIN_ACQUIRED( obj ); 325 return r; 326 } 327 328 template< typename UT > 329 static kmp_uint32 __kmp_eq( UT value, UT checker) { 330 return value == checker; 331 } 332 333 template< typename UT > 334 static kmp_uint32 __kmp_neq( UT value, UT checker) { 335 return value != checker; 336 } 337 338 template< typename UT > 339 static kmp_uint32 __kmp_lt( UT value, UT checker) { 340 return value < checker; 341 } 342 343 template< typename UT > 344 static kmp_uint32 __kmp_ge( UT value, UT checker) { 345 return value >= checker; 346 } 347 348 template< typename UT > 349 static kmp_uint32 __kmp_le( UT value, UT checker) { 350 return value <= checker; 351 } 352 353 354 /* ------------------------------------------------------------------------ */ 355 /* ------------------------------------------------------------------------ */ 356 357 static void 358 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 359 { 360 kmp_info_t *th; 361 362 KMP_DEBUG_ASSERT( gtid_ref ); 363 364 if ( __kmp_env_consistency_check ) { 365 th = __kmp_threads[*gtid_ref]; 366 if ( th -> th.th_root -> r.r_active 367 && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) { 368 #if KMP_USE_DYNAMIC_LOCK 369 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 ); 370 #else 371 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL ); 372 #endif 373 } 374 } 375 } 376 377 template< typename UT > 378 static void 379 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 380 { 381 typedef typename traits_t< UT >::signed_t ST; 382 dispatch_private_info_template< UT > * pr; 383 384 int gtid = *gtid_ref; 385 // int cid = *cid_ref; 386 kmp_info_t *th = __kmp_threads[ gtid ]; 387 KMP_DEBUG_ASSERT( th -> th.th_dispatch ); 388 389 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) ); 390 if ( __kmp_env_consistency_check ) { 391 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 392 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 393 if ( pr -> pushed_ws != ct_none ) { 394 #if KMP_USE_DYNAMIC_LOCK 395 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 ); 396 #else 397 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL ); 398 #endif 399 } 400 } 401 402 if ( ! th -> th.th_team -> t.t_serialized ) { 403 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 404 ( th -> th.th_dispatch -> th_dispatch_sh_current ); 405 UT lower; 406 407 if ( ! __kmp_env_consistency_check ) { 408 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 409 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 410 } 411 lower = pr->u.p.ordered_lower; 412 413 #if ! 
defined( KMP_GOMP_COMPAT ) 414 if ( __kmp_env_consistency_check ) { 415 if ( pr->ordered_bumped ) { 416 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons; 417 __kmp_error_construct2( 418 kmp_i18n_msg_CnsMultipleNesting, 419 ct_ordered_in_pdo, loc_ref, 420 & p->stack_data[ p->w_top ] 421 ); 422 } 423 } 424 #endif /* !defined(KMP_GOMP_COMPAT) */ 425 426 KMP_MB(); 427 #ifdef KMP_DEBUG 428 { 429 const char * buff; 430 // create format specifiers before the debug output 431 buff = __kmp_str_format( 432 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n", 433 traits_t< UT >::spec, traits_t< UT >::spec ); 434 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 435 __kmp_str_free( &buff ); 436 } 437 #endif 438 439 __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 440 USE_ITT_BUILD_ARG( NULL ) 441 ); 442 KMP_MB(); /* is this necessary? */ 443 #ifdef KMP_DEBUG 444 { 445 const char * buff; 446 // create format specifiers before the debug output 447 buff = __kmp_str_format( 448 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n", 449 traits_t< UT >::spec, traits_t< UT >::spec ); 450 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 451 __kmp_str_free( &buff ); 452 } 453 #endif 454 } 455 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) ); 456 } 457 458 static void 459 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 460 { 461 kmp_info_t *th; 462 463 if ( __kmp_env_consistency_check ) { 464 th = __kmp_threads[*gtid_ref]; 465 if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) { 466 __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref ); 467 } 468 } 469 } 470 471 template< typename UT > 472 static void 473 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 474 { 475 typedef typename traits_t< UT >::signed_t ST; 476 dispatch_private_info_template< UT > * pr; 477 478 int gtid = *gtid_ref; 479 // int cid = *cid_ref; 480 kmp_info_t *th = __kmp_threads[ gtid ]; 481 KMP_DEBUG_ASSERT( th -> th.th_dispatch ); 482 483 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) ); 484 if ( __kmp_env_consistency_check ) { 485 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 486 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 487 if ( pr -> pushed_ws != ct_none ) { 488 __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref ); 489 } 490 } 491 492 if ( ! th -> th.th_team -> t.t_serialized ) { 493 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 494 ( th -> th.th_dispatch -> th_dispatch_sh_current ); 495 496 if ( ! __kmp_env_consistency_check ) { 497 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 498 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 499 } 500 501 KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration ); 502 #if ! defined( KMP_GOMP_COMPAT ) 503 if ( __kmp_env_consistency_check ) { 504 if ( pr->ordered_bumped != 0 ) { 505 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons; 506 /* How to test it? - OM */ 507 __kmp_error_construct2( 508 kmp_i18n_msg_CnsMultipleNesting, 509 ct_ordered_in_pdo, loc_ref, 510 & p->stack_data[ p->w_top ] 511 ); 512 } 513 } 514 #endif /* !defined(KMP_GOMP_COMPAT) */ 515 516 KMP_MB(); /* Flush all pending memory write invalidates. 
                                                                   */

        pr->ordered_bumped += 1;

        KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
                        gtid, pr->ordered_bumped ) );

        KMP_MB();  /* Flush all pending memory write invalidates.  */

        /* TODO use general release procedure? */
        test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );

        KMP_MB();  /* Flush all pending memory write invalidates.  */
    }
    KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
}

/* Computes and returns x to the power of y, where y must be a non-negative integer */
template< typename UT >
static __forceinline long double
__kmp_pow(long double x, UT y) {
    long double s=1.0L;

    KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
    //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
    while(y) {
        if ( y & 1 )
            s *= x;
        x *= x;
        y >>= 1;
    }
    return s;
}

/* Computes and returns the number of unassigned iterations after idx chunks have been assigned
   (the total number of unassigned iterations in chunks with index greater than or equal to idx).
   __forceinline seems to be broken so that if we __forceinline this function, the behavior is wrong
   (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
*/
template< typename T >
static __inline typename traits_t< T >::unsigned_t
__kmp_dispatch_guided_remaining(
    T                                  tc,
    typename traits_t< T >::floating_t base,
    typename traits_t< T >::unsigned_t idx
) {
    /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
       least for ICL 8.1, long double arithmetic may not really have
       long double precision, even with /Qlong_double.  Currently, we
       workaround that in the caller code, by manipulating the FPCW for
       Windows* OS on IA-32 architecture.  The lack of precision is not
       expected to be a correctness issue, though.
    */
    typedef typename traits_t< T >::unsigned_t  UT;

    long double x = tc * __kmp_pow< UT >(base, idx);
    UT r = (UT) x;
    if ( x == r )
        return r;
    return r + 1;
}

// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
//   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// by default n = 2. For example with n = 3 the chunks distribution will be more flat.
// With n = 1 first chunk is the same as for static schedule, e.g. trip / nproc.
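/*
 * Illustrative sketch (not part of the library build): a sequential model of the
 * guided-iterative chunk progression that guided_int_param / guided_flt_param
 * (defined just below) control.  The values trip, nproc and chunk are example
 * inputs chosen here, not runtime fields; in the real code each grab is performed
 * atomically on sh->u.s.iteration by competing threads, so this only shows how
 * chunk sizes shrink and where the switch to dynamic-style chunks happens.
 */
#if 0
#include <stdio.h>

int main() {
    const unsigned long long trip  = 10000;  // example trip count
    const unsigned long long nproc = 8;      // example team size
    const unsigned long long chunk = 4;      // example chunk size
    const int                n     = 2;      // plays the role of guided_int_param
    const double p3 = 1.0 / ( n * (double)nproc );            // remaining-iterations multiplier
    const unsigned long long p2 = n * nproc * ( chunk + 1 );  // switch-over point to dynamic

    unsigned long long remaining = trip;
    while ( remaining >= p2 ) {
        // guided phase: each grab takes remaining / ( n * nproc ) iterations
        unsigned long long grab = (unsigned long long)( remaining * p3 );
        printf( "guided chunk: %llu iterations\n", grab );
        remaining -= grab;
    }
    // once fewer than p2 iterations remain, the runtime falls back to
    // dynamic-style chunks of size 'chunk'
    printf( "switch to dynamic with %llu iterations left\n", remaining );
    return 0;
}
#endif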
static int    guided_int_param = 2;
static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;

// UT - unsigned flavor of T, ST - signed flavor of T,
// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
template< typename T >
static void
__kmp_dispatch_init(
    ident_t                        * loc,
    int                              gtid,
    enum sched_type                  schedule,
    T                                lb,
    T                                ub,
    typename traits_t< T >::signed_t st,
    typename traits_t< T >::signed_t chunk,
    int                              push_ws
) {
    typedef typename traits_t< T >::unsigned_t  UT;
    typedef typename traits_t< T >::signed_t    ST;
    typedef typename traits_t< T >::floating_t  DBL;
    static const int ___kmp_size_type = sizeof( UT );

    int                                            active;
    T                                              tc;
    kmp_info_t *                                   th;
    kmp_team_t *                                   team;
    kmp_uint32                                     my_buffer_index;
    dispatch_private_info_template< T >          * pr;
    dispatch_shared_info_template< UT > volatile * sh;

    KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
    KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );

    if ( ! TCR_4( __kmp_init_parallel ) )
        __kmp_parallel_initialize();

#if INCLUDE_SSC_MARKS
    SSC_MARK_DISPATCH_INIT();
#endif
#ifdef KMP_DEBUG
    {
        const char * buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
            traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
        KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
        __kmp_str_free( &buff );
    }
#endif
    /* setup data */
    th     = __kmp_threads[ gtid ];
    team   = th -> th.th_team;
    active = ! team -> t.t_serialized;
    th->th.th_ident = loc;

#if USE_ITT_BUILD
    kmp_uint64 cur_chunk = chunk;
    int itt_need_metadata_reporting = __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
        KMP_MASTER_GTID(gtid) &&
#if OMP_40_ENABLED
        th->th.th_teams_microtask == NULL &&
#endif
        team->t.t_active_level == 1;
#endif
    if ( ! active ) {
        pr = reinterpret_cast< dispatch_private_info_template< T >* >
            ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
    } else {
        KMP_DEBUG_ASSERT( th->th.th_dispatch ==
                &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );

        my_buffer_index = th->th.th_dispatch->th_disp_index ++;

        /* What happens when number of threads changes, need to resize buffer? */
        pr = reinterpret_cast< dispatch_private_info_template< T >  * >
            ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % __kmp_dispatch_num_buffers ] );
        sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
            ( &team -> t.t_disp_buffer[ my_buffer_index % __kmp_dispatch_num_buffers ] );
    }

    /* Currently just ignore the monotonic and non-monotonic modifiers (the compiler isn't producing them
     * yet anyway).
     * When it is we'll want to look at them somewhere here and use that information to add to our
     * schedule choice. We shouldn't need to pass them on, they merely affect which schedule we can
     * legally choose for various dynamic cases. (In particular, whether or not a stealing scheme is legal).
669 */ 670 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 671 672 /* Pick up the nomerge/ordered bits from the scheduling type */ 673 if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) { 674 pr->nomerge = TRUE; 675 schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); 676 } else { 677 pr->nomerge = FALSE; 678 } 679 pr->type_size = ___kmp_size_type; // remember the size of variables 680 if ( kmp_ord_lower & schedule ) { 681 pr->ordered = TRUE; 682 schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); 683 } else { 684 pr->ordered = FALSE; 685 } 686 687 if ( schedule == kmp_sch_static ) { 688 schedule = __kmp_static; 689 } else { 690 if ( schedule == kmp_sch_runtime ) { 691 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified) 692 schedule = team -> t.t_sched.r_sched_type; 693 // Detail the schedule if needed (global controls are differentiated appropriately) 694 if ( schedule == kmp_sch_guided_chunked ) { 695 schedule = __kmp_guided; 696 } else if ( schedule == kmp_sch_static ) { 697 schedule = __kmp_static; 698 } 699 // Use the chunk size specified by OMP_SCHEDULE (or default if not specified) 700 chunk = team -> t.t_sched.chunk; 701 #if USE_ITT_BUILD 702 cur_chunk = chunk; 703 #endif 704 #ifdef KMP_DEBUG 705 { 706 const char * buff; 707 // create format specifiers before the debug output 708 buff = __kmp_str_format( 709 "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n", 710 traits_t< ST >::spec ); 711 KD_TRACE(10, ( buff, gtid, schedule, chunk ) ); 712 __kmp_str_free( &buff ); 713 } 714 #endif 715 } else { 716 if ( schedule == kmp_sch_guided_chunked ) { 717 schedule = __kmp_guided; 718 } 719 if ( chunk <= 0 ) { 720 chunk = KMP_DEFAULT_CHUNK; 721 } 722 } 723 724 if ( schedule == kmp_sch_auto ) { 725 // mapping and differentiation: in the __kmp_do_serial_initialize() 726 schedule = __kmp_auto; 727 #ifdef KMP_DEBUG 728 { 729 const char * buff; 730 // create format specifiers before the debug output 731 buff = __kmp_str_format( 732 "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n", 733 traits_t< ST >::spec ); 734 KD_TRACE(10, ( buff, gtid, schedule, chunk ) ); 735 __kmp_str_free( &buff ); 736 } 737 #endif 738 } 739 740 /* guided analytical not safe for too many threads */ 741 if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) { 742 schedule = kmp_sch_guided_iterative_chunked; 743 KMP_WARNING( DispatchManyThreads ); 744 } 745 pr->u.p.parm1 = chunk; 746 } 747 KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper), 748 "unknown scheduling type" ); 749 750 pr->u.p.count = 0; 751 752 if ( __kmp_env_consistency_check ) { 753 if ( st == 0 ) { 754 __kmp_error_construct( 755 kmp_i18n_msg_CnsLoopIncrZeroProhibited, 756 ( pr->ordered ? 
ct_pdo_ordered : ct_pdo ), loc 757 ); 758 } 759 } 760 // compute trip count 761 if ( st == 1 ) { // most common case 762 if ( ub >= lb ) { 763 tc = ub - lb + 1; 764 } else { // ub < lb 765 tc = 0; // zero-trip 766 } 767 } else if ( st < 0 ) { 768 if ( lb >= ub ) { 769 // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B), 770 // where the division needs to be unsigned regardless of the result type 771 tc = (UT)(lb - ub) / (-st) + 1; 772 } else { // lb < ub 773 tc = 0; // zero-trip 774 } 775 } else { // st > 0 776 if ( ub >= lb ) { 777 // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B), 778 // where the division needs to be unsigned regardless of the result type 779 tc = (UT)(ub - lb) / st + 1; 780 } else { // ub < lb 781 tc = 0; // zero-trip 782 } 783 } 784 785 // Any half-decent optimizer will remove this test when the blocks are empty since the macros expand to nothing 786 // when statistics are disabled. 787 if (schedule == __kmp_static) 788 { 789 KMP_COUNT_BLOCK(OMP_FOR_static); 790 KMP_COUNT_VALUE(FOR_static_iterations, tc); 791 } 792 else 793 { 794 KMP_COUNT_BLOCK(OMP_FOR_dynamic); 795 KMP_COUNT_VALUE(FOR_dynamic_iterations, tc); 796 } 797 798 pr->u.p.lb = lb; 799 pr->u.p.ub = ub; 800 pr->u.p.st = st; 801 pr->u.p.tc = tc; 802 803 #if KMP_OS_WINDOWS 804 pr->u.p.last_upper = ub + st; 805 #endif /* KMP_OS_WINDOWS */ 806 807 /* NOTE: only the active parallel region(s) has active ordered sections */ 808 809 if ( active ) { 810 if ( pr->ordered == 0 ) { 811 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error; 812 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error; 813 } else { 814 pr->ordered_bumped = 0; 815 816 pr->u.p.ordered_lower = 1; 817 pr->u.p.ordered_upper = 0; 818 819 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >; 820 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >; 821 } 822 } 823 824 if ( __kmp_env_consistency_check ) { 825 enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo; 826 if ( push_ws ) { 827 __kmp_push_workshare( gtid, ws, loc ); 828 pr->pushed_ws = ws; 829 } else { 830 __kmp_check_workshare( gtid, ws, loc ); 831 pr->pushed_ws = ct_none; 832 } 833 } 834 835 switch ( schedule ) { 836 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 837 case kmp_sch_static_steal: 838 { 839 T nproc = team->t.t_nproc; 840 T ntc, init; 841 842 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) ); 843 844 ntc = (tc % chunk ? 1 : 0) + tc / chunk; 845 if ( nproc > 1 && ntc >= nproc ) { 846 T id = __kmp_tid_from_gtid(gtid); 847 T small_chunk, extras; 848 849 small_chunk = ntc / nproc; 850 extras = ntc % nproc; 851 852 init = id * small_chunk + ( id < extras ? id : extras ); 853 pr->u.p.count = init; 854 pr->u.p.ub = init + small_chunk + ( id < extras ? 
1 : 0 ); 855 856 pr->u.p.parm2 = lb; 857 //pr->pfields.parm3 = 0; // it's not used in static_steal 858 pr->u.p.parm4 = id; 859 pr->u.p.st = st; 860 break; 861 } else { 862 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n", 863 gtid ) ); 864 schedule = kmp_sch_static_balanced; 865 /* too few iterations: fall-through to kmp_sch_static_balanced */ 866 } // if 867 /* FALL-THROUGH to static balanced */ 868 } // case 869 #endif 870 case kmp_sch_static_balanced: 871 { 872 T nproc = team->t.t_nproc; 873 T init, limit; 874 875 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n", 876 gtid ) ); 877 878 if ( nproc > 1 ) { 879 T id = __kmp_tid_from_gtid(gtid); 880 881 if ( tc < nproc ) { 882 if ( id < tc ) { 883 init = id; 884 limit = id; 885 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ 886 } else { 887 pr->u.p.count = 1; /* means no more chunks to execute */ 888 pr->u.p.parm1 = FALSE; 889 break; 890 } 891 } else { 892 T small_chunk = tc / nproc; 893 T extras = tc % nproc; 894 init = id * small_chunk + (id < extras ? id : extras); 895 limit = init + small_chunk - (id < extras ? 0 : 1); 896 pr->u.p.parm1 = (id == nproc - 1); 897 } 898 } else { 899 if ( tc > 0 ) { 900 init = 0; 901 limit = tc - 1; 902 pr->u.p.parm1 = TRUE; 903 } else { 904 // zero trip count 905 pr->u.p.count = 1; /* means no more chunks to execute */ 906 pr->u.p.parm1 = FALSE; 907 break; 908 } 909 } 910 #if USE_ITT_BUILD 911 // Calculate chunk for metadata report 912 if ( itt_need_metadata_reporting ) 913 cur_chunk = limit - init + 1; 914 #endif 915 if ( st == 1 ) { 916 pr->u.p.lb = lb + init; 917 pr->u.p.ub = lb + limit; 918 } else { 919 T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound 920 pr->u.p.lb = lb + init * st; 921 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly 922 if ( st > 0 ) { 923 pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp ); 924 } else { 925 pr->u.p.ub = ( ub_tmp + st < ub ? 
ub : ub_tmp ); 926 } 927 } 928 if ( pr->ordered ) { 929 pr->u.p.ordered_lower = init; 930 pr->u.p.ordered_upper = limit; 931 } 932 break; 933 } // case 934 case kmp_sch_guided_iterative_chunked : 935 { 936 T nproc = team->t.t_nproc; 937 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid)); 938 939 if ( nproc > 1 ) { 940 if ( (2L * chunk + 1 ) * nproc >= tc ) { 941 /* chunk size too large, switch to dynamic */ 942 schedule = kmp_sch_dynamic_chunked; 943 } else { 944 // when remaining iters become less than parm2 - switch to dynamic 945 pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 ); 946 *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4 947 } 948 } else { 949 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid)); 950 schedule = kmp_sch_static_greedy; 951 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 952 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid)); 953 pr->u.p.parm1 = tc; 954 } // if 955 } // case 956 break; 957 case kmp_sch_guided_analytical_chunked: 958 { 959 T nproc = team->t.t_nproc; 960 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid)); 961 962 if ( nproc > 1 ) { 963 if ( (2L * chunk + 1 ) * nproc >= tc ) { 964 /* chunk size too large, switch to dynamic */ 965 schedule = kmp_sch_dynamic_chunked; 966 } else { 967 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 968 DBL x; 969 970 #if KMP_OS_WINDOWS && KMP_ARCH_X86 971 /* Linux* OS already has 64-bit computation by default for 972 long double, and on Windows* OS on Intel(R) 64, 973 /Qlong_double doesn't work. On Windows* OS 974 on IA-32 architecture, we need to set precision to 975 64-bit instead of the default 53-bit. Even though long 976 double doesn't work on Windows* OS on Intel(R) 64, the 977 resulting lack of precision is not expected to impact 978 the correctness of the algorithm, but this has not been 979 mathematically proven. 
980 */ 981 // save original FPCW and set precision to 64-bit, as 982 // Windows* OS on IA-32 architecture defaults to 53-bit 983 unsigned int oldFpcw = _control87(0,0); 984 _control87(_PC_64,_MCW_PC); // 0,0x30000 985 #endif 986 /* value used for comparison in solver for cross-over point */ 987 long double target = ((long double)chunk * 2 + 1) * nproc / tc; 988 989 /* crossover point--chunk indexes equal to or greater than 990 this point switch to dynamic-style scheduling */ 991 UT cross; 992 993 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 994 x = (long double)1.0 - (long double)0.5 / nproc; 995 996 #ifdef KMP_DEBUG 997 { // test natural alignment 998 struct _test_a { 999 char a; 1000 union { 1001 char b; 1002 DBL d; 1003 }; 1004 } t; 1005 ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; 1006 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment ); 1007 KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 ); 1008 } 1009 #endif // KMP_DEBUG 1010 1011 /* save the term in thread private dispatch structure */ 1012 *(DBL*)&pr->u.p.parm3 = x; 1013 1014 /* solve for the crossover point to the nearest integer i for which C_i <= chunk */ 1015 { 1016 UT left, right, mid; 1017 long double p; 1018 1019 /* estimate initial upper and lower bound */ 1020 1021 /* doesn't matter what value right is as long as it is positive, but 1022 it affects performance of the solver 1023 */ 1024 right = 229; 1025 p = __kmp_pow< UT >(x,right); 1026 if ( p > target ) { 1027 do{ 1028 p *= p; 1029 right <<= 1; 1030 } while(p>target && right < (1<<27)); 1031 left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */ 1032 } else { 1033 left = 0; 1034 } 1035 1036 /* bisection root-finding method */ 1037 while ( left + 1 < right ) { 1038 mid = (left + right) / 2; 1039 if ( __kmp_pow< UT >(x,mid) > target ) { 1040 left = mid; 1041 } else { 1042 right = mid; 1043 } 1044 } // while 1045 cross = right; 1046 } 1047 /* assert sanity of computed crossover point */ 1048 KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target); 1049 1050 /* save the crossover point in thread private dispatch structure */ 1051 pr->u.p.parm2 = cross; 1052 1053 // C75803 1054 #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) ) 1055 #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3) 1056 #else 1057 #define GUIDED_ANALYTICAL_WORKAROUND (x) 1058 #endif 1059 /* dynamic-style scheduling offset */ 1060 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk; 1061 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1062 // restore FPCW 1063 _control87(oldFpcw,_MCW_PC); 1064 #endif 1065 } // if 1066 } else { 1067 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n", 1068 gtid ) ); 1069 schedule = kmp_sch_static_greedy; 1070 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 1071 pr->u.p.parm1 = tc; 1072 } // if 1073 } // case 1074 break; 1075 case kmp_sch_static_greedy: 1076 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid)); 1077 pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ? 
1078 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc : 1079 tc; 1080 break; 1081 case kmp_sch_static_chunked : 1082 case kmp_sch_dynamic_chunked : 1083 if ( pr->u.p.parm1 <= 0 ) { 1084 pr->u.p.parm1 = KMP_DEFAULT_CHUNK; 1085 } 1086 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid)); 1087 break; 1088 case kmp_sch_trapezoidal : 1089 { 1090 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ 1091 1092 T parm1, parm2, parm3, parm4; 1093 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) ); 1094 1095 parm1 = chunk; 1096 1097 /* F : size of the first cycle */ 1098 parm2 = ( tc / (2 * team->t.t_nproc) ); 1099 1100 if ( parm2 < 1 ) { 1101 parm2 = 1; 1102 } 1103 1104 /* L : size of the last cycle. Make sure the last cycle 1105 * is not larger than the first cycle. 1106 */ 1107 if ( parm1 < 1 ) { 1108 parm1 = 1; 1109 } else if ( parm1 > parm2 ) { 1110 parm1 = parm2; 1111 } 1112 1113 /* N : number of cycles */ 1114 parm3 = ( parm2 + parm1 ); 1115 parm3 = ( 2 * tc + parm3 - 1) / parm3; 1116 1117 if ( parm3 < 2 ) { 1118 parm3 = 2; 1119 } 1120 1121 /* sigma : decreasing incr of the trapezoid */ 1122 parm4 = ( parm3 - 1 ); 1123 parm4 = ( parm2 - parm1 ) / parm4; 1124 1125 // pointless check, because parm4 >= 0 always 1126 //if ( parm4 < 0 ) { 1127 // parm4 = 0; 1128 //} 1129 1130 pr->u.p.parm1 = parm1; 1131 pr->u.p.parm2 = parm2; 1132 pr->u.p.parm3 = parm3; 1133 pr->u.p.parm4 = parm4; 1134 } // case 1135 break; 1136 1137 default: 1138 { 1139 __kmp_msg( 1140 kmp_ms_fatal, // Severity 1141 KMP_MSG( UnknownSchedTypeDetected ), // Primary message 1142 KMP_HNT( GetNewerLibrary ), // Hint 1143 __kmp_msg_null // Variadic argument list terminator 1144 ); 1145 } 1146 break; 1147 } // switch 1148 pr->schedule = schedule; 1149 if ( active ) { 1150 /* The name of this buffer should be my_buffer_index when it's free to use it */ 1151 1152 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n", 1153 gtid, my_buffer_index, sh->buffer_index) ); 1154 __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 > 1155 USE_ITT_BUILD_ARG( NULL ) 1156 ); 1157 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are 1158 // *always* 32-bit integers. 1159 KMP_MB(); /* is this necessary? */ 1160 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n", 1161 gtid, my_buffer_index, sh->buffer_index) ); 1162 1163 th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr; 1164 th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh; 1165 #if USE_ITT_BUILD 1166 if ( pr->ordered ) { 1167 __kmp_itt_ordered_init( gtid ); 1168 }; // if 1169 // Report loop metadata 1170 if ( itt_need_metadata_reporting ) { 1171 // Only report metadata by master of active team at level 1 1172 kmp_uint64 schedtype = 0; 1173 switch ( schedule ) { 1174 case kmp_sch_static_chunked: 1175 case kmp_sch_static_balanced:// Chunk is calculated in the switch above 1176 break; 1177 case kmp_sch_static_greedy: 1178 cur_chunk = pr->u.p.parm1; 1179 break; 1180 case kmp_sch_dynamic_chunked: 1181 schedtype = 1; 1182 break; 1183 case kmp_sch_guided_iterative_chunked: 1184 case kmp_sch_guided_analytical_chunked: 1185 schedtype = 2; 1186 break; 1187 default: 1188 // Should we put this case under "static"? 
1189 // case kmp_sch_static_steal: 1190 schedtype = 3; 1191 break; 1192 } 1193 __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk); 1194 } 1195 #endif /* USE_ITT_BUILD */ 1196 }; // if 1197 1198 #ifdef KMP_DEBUG 1199 { 1200 const char * buff; 1201 // create format specifiers before the debug output 1202 buff = __kmp_str_format( 1203 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \ 1204 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \ 1205 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n", 1206 traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec, 1207 traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec, 1208 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec, 1209 traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec ); 1210 KD_TRACE(10, ( buff, 1211 gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub, 1212 pr->u.p.st, pr->u.p.tc, pr->u.p.count, 1213 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1, 1214 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) ); 1215 __kmp_str_free( &buff ); 1216 } 1217 #endif 1218 #if ( KMP_STATIC_STEAL_ENABLED ) 1219 if ( ___kmp_size_type < 8 ) { 1220 // It cannot be guaranteed that after execution of a loop with some other schedule kind 1221 // all the parm3 variables will contain the same value. 1222 // Even if all parm3 will be the same, it still exists a bad case like using 0 and 1 1223 // rather than program life-time increment. 1224 // So the dedicated variable is required. The 'static_steal_counter' is used. 1225 if( schedule == kmp_sch_static_steal ) { 1226 // Other threads will inspect this variable when searching for a victim. 1227 // This is a flag showing that other threads may steal from this thread since then. 1228 volatile T * p = &pr->u.p.static_steal_counter; 1229 *p = *p + 1; 1230 } 1231 } 1232 #endif // ( KMP_STATIC_STEAL_ENABLED && USE_STEALING ) 1233 1234 #if OMPT_SUPPORT && OMPT_TRACE 1235 if (ompt_enabled && 1236 ompt_callbacks.ompt_callback(ompt_event_loop_begin)) { 1237 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 1238 ompt_task_info_t *task_info = __ompt_get_taskinfo(0); 1239 ompt_callbacks.ompt_callback(ompt_event_loop_begin)( 1240 team_info->parallel_id, task_info->task_id, team_info->microtask); 1241 } 1242 #endif 1243 } 1244 1245 /* 1246 * For ordered loops, either __kmp_dispatch_finish() should be called after 1247 * every iteration, or __kmp_dispatch_finish_chunk() should be called after 1248 * every chunk of iterations. If the ordered section(s) were not executed 1249 * for this iteration (or every iteration in this chunk), we need to set the 1250 * ordered iteration counters so that the next thread can proceed. 1251 */ 1252 template< typename UT > 1253 static void 1254 __kmp_dispatch_finish( int gtid, ident_t *loc ) 1255 { 1256 typedef typename traits_t< UT >::signed_t ST; 1257 kmp_info_t *th = __kmp_threads[ gtid ]; 1258 1259 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) ); 1260 if ( ! 
th -> th.th_team -> t.t_serialized ) { 1261 1262 dispatch_private_info_template< UT > * pr = 1263 reinterpret_cast< dispatch_private_info_template< UT >* > 1264 ( th->th.th_dispatch->th_dispatch_pr_current ); 1265 dispatch_shared_info_template< UT > volatile * sh = 1266 reinterpret_cast< dispatch_shared_info_template< UT >volatile* > 1267 ( th->th.th_dispatch->th_dispatch_sh_current ); 1268 KMP_DEBUG_ASSERT( pr ); 1269 KMP_DEBUG_ASSERT( sh ); 1270 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1271 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1272 1273 if ( pr->ordered_bumped ) { 1274 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1275 gtid ) ); 1276 pr->ordered_bumped = 0; 1277 } else { 1278 UT lower = pr->u.p.ordered_lower; 1279 1280 #ifdef KMP_DEBUG 1281 { 1282 const char * buff; 1283 // create format specifiers before the debug output 1284 buff = __kmp_str_format( 1285 "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n", 1286 traits_t< UT >::spec, traits_t< UT >::spec ); 1287 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 1288 __kmp_str_free( &buff ); 1289 } 1290 #endif 1291 1292 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 1293 USE_ITT_BUILD_ARG(NULL) 1294 ); 1295 KMP_MB(); /* is this necessary? */ 1296 #ifdef KMP_DEBUG 1297 { 1298 const char * buff; 1299 // create format specifiers before the debug output 1300 buff = __kmp_str_format( 1301 "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n", 1302 traits_t< UT >::spec, traits_t< UT >::spec ); 1303 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 1304 __kmp_str_free( &buff ); 1305 } 1306 #endif 1307 1308 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration ); 1309 } // if 1310 } // if 1311 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) ); 1312 } 1313 1314 #ifdef KMP_GOMP_COMPAT 1315 1316 template< typename UT > 1317 static void 1318 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc ) 1319 { 1320 typedef typename traits_t< UT >::signed_t ST; 1321 kmp_info_t *th = __kmp_threads[ gtid ]; 1322 1323 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) ); 1324 if ( ! 
th -> th.th_team -> t.t_serialized ) { 1325 // int cid; 1326 dispatch_private_info_template< UT > * pr = 1327 reinterpret_cast< dispatch_private_info_template< UT >* > 1328 ( th->th.th_dispatch->th_dispatch_pr_current ); 1329 dispatch_shared_info_template< UT > volatile * sh = 1330 reinterpret_cast< dispatch_shared_info_template< UT >volatile* > 1331 ( th->th.th_dispatch->th_dispatch_sh_current ); 1332 KMP_DEBUG_ASSERT( pr ); 1333 KMP_DEBUG_ASSERT( sh ); 1334 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1335 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1336 1337 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) { 1338 UT lower = pr->u.p.ordered_lower; 1339 UT upper = pr->u.p.ordered_upper; 1340 UT inc = upper - lower + 1; 1341 1342 if ( pr->ordered_bumped == inc ) { 1343 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1344 gtid ) ); 1345 pr->ordered_bumped = 0; 1346 } else { 1347 inc -= pr->ordered_bumped; 1348 1349 #ifdef KMP_DEBUG 1350 { 1351 const char * buff; 1352 // create format specifiers before the debug output 1353 buff = __kmp_str_format( 1354 "__kmp_dispatch_finish_chunk: T#%%d before wait: " \ 1355 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n", 1356 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec ); 1357 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) ); 1358 __kmp_str_free( &buff ); 1359 } 1360 #endif 1361 1362 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 1363 USE_ITT_BUILD_ARG(NULL) 1364 ); 1365 1366 KMP_MB(); /* is this necessary? */ 1367 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n", 1368 gtid ) ); 1369 pr->ordered_bumped = 0; 1370 //!!!!! TODO check if the inc should be unsigned, or signed??? 1371 #ifdef KMP_DEBUG 1372 { 1373 const char * buff; 1374 // create format specifiers before the debug output 1375 buff = __kmp_str_format( 1376 "__kmp_dispatch_finish_chunk: T#%%d after wait: " \ 1377 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n", 1378 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec ); 1379 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) ); 1380 __kmp_str_free( &buff ); 1381 } 1382 #endif 1383 1384 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc); 1385 } 1386 // } 1387 } 1388 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) ); 1389 } 1390 1391 #endif /* KMP_GOMP_COMPAT */ 1392 1393 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 1394 * (no more work), then tell OMPT the loop is over. In some cases 1395 * kmp_dispatch_fini() is not called. 
 */
#if OMPT_SUPPORT && OMPT_TRACE
#define OMPT_LOOP_END                                                          \
    if (status == 0) {                                                         \
        if (ompt_enabled &&                                                    \
            ompt_callbacks.ompt_callback(ompt_event_loop_end)) {               \
            ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);        \
            ompt_task_info_t *task_info = __ompt_get_taskinfo(0);              \
            ompt_callbacks.ompt_callback(ompt_event_loop_end)(                 \
                team_info->parallel_id, task_info->task_id);                   \
        }                                                                      \
    }
#else
#define OMPT_LOOP_END // no-op
#endif

template< typename T >
static int
__kmp_dispatch_next(
    ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
) {

    typedef typename traits_t< T >::unsigned_t  UT;
    typedef typename traits_t< T >::signed_t    ST;
    typedef typename traits_t< T >::floating_t  DBL;
#if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
    static const int ___kmp_size_type = sizeof( UT );
#endif

    // This is potentially slightly misleading, schedule(runtime) will appear here even if the actual runtime schedule
    // is static. (Which points out a disadvantage of schedule(runtime): even when static scheduling is used it costs
    // more than a compile time choice to use static scheduling would.)
    KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling);

    int status;
    dispatch_private_info_template< T > * pr;
    kmp_info_t * th   = __kmp_threads[ gtid ];
    kmp_team_t * team = th -> th.th_team;

    KMP_DEBUG_ASSERT( p_lb && p_ub && p_st ); // AC: these cannot be NULL
#ifdef KMP_DEBUG
    {
        const char * buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
            traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
        KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ?
*p_st : 0, p_last ) ); 1443 __kmp_str_free( &buff ); 1444 } 1445 #endif 1446 1447 if ( team -> t.t_serialized ) { 1448 /* NOTE: serialize this dispatch becase we are not at the active level */ 1449 pr = reinterpret_cast< dispatch_private_info_template< T >* > 1450 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */ 1451 KMP_DEBUG_ASSERT( pr ); 1452 1453 if ( (status = (pr->u.p.tc != 0)) == 0 ) { 1454 *p_lb = 0; 1455 *p_ub = 0; 1456 // if ( p_last != NULL ) 1457 // *p_last = 0; 1458 if ( p_st != NULL ) 1459 *p_st = 0; 1460 if ( __kmp_env_consistency_check ) { 1461 if ( pr->pushed_ws != ct_none ) { 1462 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 1463 } 1464 } 1465 } else if ( pr->nomerge ) { 1466 kmp_int32 last; 1467 T start; 1468 UT limit, trip, init; 1469 ST incr; 1470 T chunk = pr->u.p.parm1; 1471 1472 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) ); 1473 1474 init = chunk * pr->u.p.count++; 1475 trip = pr->u.p.tc - 1; 1476 1477 if ( (status = (init <= trip)) == 0 ) { 1478 *p_lb = 0; 1479 *p_ub = 0; 1480 // if ( p_last != NULL ) 1481 // *p_last = 0; 1482 if ( p_st != NULL ) 1483 *p_st = 0; 1484 if ( __kmp_env_consistency_check ) { 1485 if ( pr->pushed_ws != ct_none ) { 1486 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 1487 } 1488 } 1489 } else { 1490 start = pr->u.p.lb; 1491 limit = chunk + init - 1; 1492 incr = pr->u.p.st; 1493 1494 if ( (last = (limit >= trip)) != 0 ) { 1495 limit = trip; 1496 #if KMP_OS_WINDOWS 1497 pr->u.p.last_upper = pr->u.p.ub; 1498 #endif /* KMP_OS_WINDOWS */ 1499 } 1500 if ( p_last != NULL ) 1501 *p_last = last; 1502 if ( p_st != NULL ) 1503 *p_st = incr; 1504 if ( incr == 1 ) { 1505 *p_lb = start + init; 1506 *p_ub = start + limit; 1507 } else { 1508 *p_lb = start + init * incr; 1509 *p_ub = start + limit * incr; 1510 } 1511 1512 if ( pr->ordered ) { 1513 pr->u.p.ordered_lower = init; 1514 pr->u.p.ordered_upper = limit; 1515 #ifdef KMP_DEBUG 1516 { 1517 const char * buff; 1518 // create format specifiers before the debug output 1519 buff = __kmp_str_format( 1520 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1521 traits_t< UT >::spec, traits_t< UT >::spec ); 1522 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1523 __kmp_str_free( &buff ); 1524 } 1525 #endif 1526 } // if 1527 } // if 1528 } else { 1529 pr->u.p.tc = 0; 1530 *p_lb = pr->u.p.lb; 1531 *p_ub = pr->u.p.ub; 1532 #if KMP_OS_WINDOWS 1533 pr->u.p.last_upper = *p_ub; 1534 #endif /* KMP_OS_WINDOWS */ 1535 if ( p_last != NULL ) 1536 *p_last = TRUE; 1537 if ( p_st != NULL ) 1538 *p_st = pr->u.p.st; 1539 } // if 1540 #ifdef KMP_DEBUG 1541 { 1542 const char * buff; 1543 // create format specifiers before the debug output 1544 buff = __kmp_str_format( 1545 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \ 1546 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", 1547 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 1548 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) ); 1549 __kmp_str_free( &buff ); 1550 } 1551 #endif 1552 #if INCLUDE_SSC_MARKS 1553 SSC_MARK_DISPATCH_NEXT(); 1554 #endif 1555 OMPT_LOOP_END; 1556 return status; 1557 } else { 1558 kmp_int32 last = 0; 1559 dispatch_shared_info_template< UT > *sh; 1560 T start; 1561 ST incr; 1562 UT limit, trip, init; 1563 1564 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1565 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1566 1567 pr = reinterpret_cast< 
dispatch_private_info_template< T >* > 1568 ( th->th.th_dispatch->th_dispatch_pr_current ); 1569 KMP_DEBUG_ASSERT( pr ); 1570 sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 1571 ( th->th.th_dispatch->th_dispatch_sh_current ); 1572 KMP_DEBUG_ASSERT( sh ); 1573 1574 if ( pr->u.p.tc == 0 ) { 1575 // zero trip count 1576 status = 0; 1577 } else { 1578 switch (pr->schedule) { 1579 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 1580 case kmp_sch_static_steal: 1581 { 1582 T chunk = pr->u.p.parm1; 1583 1584 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) ); 1585 1586 trip = pr->u.p.tc - 1; 1587 1588 if ( ___kmp_size_type > 4 ) { 1589 // Other threads do not look into the data of this thread, 1590 // so it's not necessary to make volatile casting. 1591 init = ( pr->u.p.count )++; 1592 status = ( init < (UT)pr->u.p.ub ); 1593 } else { 1594 typedef union { 1595 struct { 1596 UT count; 1597 T ub; 1598 } p; 1599 kmp_int64 b; 1600 } union_i4; 1601 // All operations on 'count' or 'ub' must be combined atomically together. 1602 // stealing implemented only for 4-byte indexes 1603 { 1604 union_i4 vold, vnew; 1605 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count); 1606 vnew = vold; 1607 vnew.p.count++; 1608 while( ! KMP_COMPARE_AND_STORE_ACQ64( 1609 ( volatile kmp_int64* )&pr->u.p.count, 1610 *VOLATILE_CAST(kmp_int64 *)&vold.b, 1611 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) { 1612 KMP_CPU_PAUSE(); 1613 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count); 1614 vnew = vold; 1615 vnew.p.count++; 1616 } 1617 vnew = vold; 1618 init = vnew.p.count; 1619 status = ( init < (UT)vnew.p.ub ) ; 1620 } 1621 1622 if( !status ) { 1623 kmp_info_t **other_threads = team->t.t_threads; 1624 int while_limit = 10; 1625 int while_index = 0; 1626 1627 // TODO: algorithm of searching for a victim 1628 // should be cleaned up and measured 1629 while ( ( !status ) && ( while_limit != ++while_index ) ) { 1630 union_i4 vold, vnew; 1631 kmp_int32 remaining; // kmp_int32 because KMP_I4 only 1632 T victimIdx = pr->u.p.parm4; 1633 T oldVictimIdx = victimIdx; 1634 dispatch_private_info_template< T > * victim; 1635 1636 do { 1637 if( !victimIdx ) { 1638 victimIdx = team->t.t_nproc - 1; 1639 } else { 1640 --victimIdx; 1641 } 1642 victim = reinterpret_cast< dispatch_private_info_template< T >* > 1643 ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current ); 1644 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx ); 1645 // TODO: think about a proper place of this test 1646 if ( ( !victim ) || 1647 ( (*( volatile T * )&victim->u.p.static_steal_counter) != 1648 (*( volatile T * )&pr->u.p.static_steal_counter) ) ) { 1649 // TODO: delay would be nice 1650 continue; 1651 // the victim is not ready yet to participate in stealing 1652 // because the victim is still in kmp_init_dispatch 1653 } 1654 if ( oldVictimIdx == victimIdx ) { 1655 break; 1656 } 1657 pr->u.p.parm4 = victimIdx; 1658 1659 while( 1 ) { 1660 vold.b = *( volatile kmp_int64 * )( &victim->u.p.count ); 1661 vnew = vold; 1662 1663 KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip ); 1664 if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) { 1665 break; 1666 } 1667 vnew.p.ub -= (remaining >> 2); 1668 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1669 #pragma warning( push ) 1670 // disable warning on pointless comparison of unsigned with 0 1671 #pragma warning( disable: 186 ) 1672 KMP_DEBUG_ASSERT(vnew.p.ub >= 0); 1673 #pragma warning( pop ) 1674 // TODO: Should 
this be acquire or release? 1675 if ( KMP_COMPARE_AND_STORE_ACQ64( 1676 ( volatile kmp_int64 * )&victim->u.p.count, 1677 *VOLATILE_CAST(kmp_int64 *)&vold.b, 1678 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) { 1679 status = 1; 1680 while_index = 0; 1681 // now update own count and ub 1682 #if KMP_ARCH_X86 1683 // stealing executed on non-KMP_ARCH_X86 only 1684 // Atomic 64-bit write on ia32 is 1685 // unavailable, so we do this in steps. 1686 // This code is not tested. 1687 init = vold.p.count; 1688 pr->u.p.ub = 0; 1689 pr->u.p.count = init + 1; 1690 pr->u.p.ub = vnew.p.count; 1691 #else 1692 init = vnew.p.ub; 1693 vold.p.count = init + 1; 1694 // TODO: is it safe and enough? 1695 *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b; 1696 #endif // KMP_ARCH_X86 1697 break; 1698 } // if 1699 KMP_CPU_PAUSE(); 1700 } // while (1) 1701 } // while 1702 } // if 1703 } // if 1704 if ( !status ) { 1705 *p_lb = 0; 1706 *p_ub = 0; 1707 if ( p_st != NULL ) *p_st = 0; 1708 } else { 1709 start = pr->u.p.parm2; 1710 init *= chunk; 1711 limit = chunk + init - 1; 1712 incr = pr->u.p.st; 1713 1714 KMP_DEBUG_ASSERT(init <= trip); 1715 if ( (last = (limit >= trip)) != 0 ) 1716 limit = trip; 1717 if ( p_st != NULL ) *p_st = incr; 1718 1719 if ( incr == 1 ) { 1720 *p_lb = start + init; 1721 *p_ub = start + limit; 1722 } else { 1723 *p_lb = start + init * incr; 1724 *p_ub = start + limit * incr; 1725 } 1726 1727 if ( pr->ordered ) { 1728 pr->u.p.ordered_lower = init; 1729 pr->u.p.ordered_upper = limit; 1730 #ifdef KMP_DEBUG 1731 { 1732 const char * buff; 1733 // create format specifiers before the debug output 1734 buff = __kmp_str_format( 1735 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1736 traits_t< UT >::spec, traits_t< UT >::spec ); 1737 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1738 __kmp_str_free( &buff ); 1739 } 1740 #endif 1741 } // if 1742 } // if 1743 break; 1744 } // case 1745 #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 1746 case kmp_sch_static_balanced: 1747 { 1748 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) ); 1749 if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */ 1750 pr->u.p.count = 1; 1751 *p_lb = pr->u.p.lb; 1752 *p_ub = pr->u.p.ub; 1753 last = pr->u.p.parm1; 1754 if ( p_st != NULL ) 1755 *p_st = pr->u.p.st; 1756 } else { /* no iterations to do */ 1757 pr->u.p.lb = pr->u.p.ub + pr->u.p.st; 1758 } 1759 if ( pr->ordered ) { 1760 #ifdef KMP_DEBUG 1761 { 1762 const char * buff; 1763 // create format specifiers before the debug output 1764 buff = __kmp_str_format( 1765 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1766 traits_t< UT >::spec, traits_t< UT >::spec ); 1767 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1768 __kmp_str_free( &buff ); 1769 } 1770 #endif 1771 } // if 1772 } // case 1773 break; 1774 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */ 1775 case kmp_sch_static_chunked: 1776 { 1777 T parm1; 1778 1779 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n", 1780 gtid ) ); 1781 parm1 = pr->u.p.parm1; 1782 1783 trip = pr->u.p.tc - 1; 1784 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid)); 1785 1786 if ( (status = (init <= trip)) != 0 ) { 1787 start = pr->u.p.lb; 1788 incr = pr->u.p.st; 1789 limit = parm1 + init - 1; 1790 1791 if ( (last = (limit >= trip)) != 0 ) 1792 limit = trip; 1793 1794 if ( p_st != NULL ) 
            case kmp_sch_static_balanced:
                {
                    KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
                    if ( (status = !pr->u.p.count) != 0 ) {  /* check if thread has any iteration to do */
                        pr->u.p.count = 1;
                        *p_lb = pr->u.p.lb;
                        *p_ub = pr->u.p.ub;
                        last  = pr->u.p.parm1;
                        if ( p_st != NULL )
                            *p_st = pr->u.p.st;
                    } else {  /* no iterations to do */
                        pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
                    }
                    if ( pr->ordered ) {
                        #ifdef KMP_DEBUG
                        {
                            const char * buff;
                            // create format specifiers before the debug output
                            buff = __kmp_str_format(
                                "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                                traits_t< UT >::spec, traits_t< UT >::spec );
                            KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                            __kmp_str_free( &buff );
                        }
                        #endif
                    } // if
                } // case
                break;
            case kmp_sch_static_greedy:  /* original code for kmp_sch_static_greedy was merged here */
            case kmp_sch_static_chunked:
                {
                    T parm1;

                    KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
                                   gtid ) );
                    parm1 = pr->u.p.parm1;

                    trip  = pr->u.p.tc - 1;
                    init  = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));

                    if ( (status = (init <= trip)) != 0 ) {
                        start = pr->u.p.lb;
                        incr  = pr->u.p.st;
                        limit = parm1 + init - 1;

                        if ( (last = (limit >= trip)) != 0 )
                            limit = trip;

                        if ( p_st != NULL ) *p_st = incr;

                        pr->u.p.count += team->t.t_nproc;

                        if ( incr == 1 ) {
                            *p_lb = start + init;
                            *p_ub = start + limit;
                        }
                        else {
                            *p_lb = start + init * incr;
                            *p_ub = start + limit * incr;
                        }

                        if ( pr->ordered ) {
                            pr->u.p.ordered_lower = init;
                            pr->u.p.ordered_upper = limit;
                            #ifdef KMP_DEBUG
                            {
                                const char * buff;
                                // create format specifiers before the debug output
                                buff = __kmp_str_format(
                                    "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                                    traits_t< UT >::spec, traits_t< UT >::spec );
                                KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                                __kmp_str_free( &buff );
                            }
                            #endif
                        } // if
                    } // if
                } // case
                break;

            case kmp_sch_dynamic_chunked:
                {
                    T chunk = pr->u.p.parm1;

                    KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
                                   gtid ) );

                    init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
                    trip = pr->u.p.tc - 1;

                    if ( (status = (init <= trip)) == 0 ) {
                        *p_lb = 0;
                        *p_ub = 0;
                        if ( p_st != NULL ) *p_st = 0;
                    } else {
                        start = pr->u.p.lb;
                        limit = chunk + init - 1;
                        incr  = pr->u.p.st;

                        if ( (last = (limit >= trip)) != 0 )
                            limit = trip;

                        if ( p_st != NULL ) *p_st = incr;

                        if ( incr == 1 ) {
                            *p_lb = start + init;
                            *p_ub = start + limit;
                        } else {
                            *p_lb = start + init * incr;
                            *p_ub = start + limit * incr;
                        }

                        if ( pr->ordered ) {
                            pr->u.p.ordered_lower = init;
                            pr->u.p.ordered_upper = limit;
                            #ifdef KMP_DEBUG
                            {
                                const char * buff;
                                // create format specifiers before the debug output
                                buff = __kmp_str_format(
                                    "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                                    traits_t< UT >::spec, traits_t< UT >::spec );
                                KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                                __kmp_str_free( &buff );
                            }
                            #endif
                        } // if
                    } // if
                } // case
                break;
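            // Guided (iterative form): while plenty of work remains, each call tries to
            // advance the shared iteration counter by remaining/(K*nproc) iterations via
            // compare_and_swap (parm3 appears to cache 1/(K*nproc) as a double, K=2 by
            // default); once fewer than parm2 = K*nproc*(chunk+1) iterations remain it
            // falls back to plain dynamic-style chunks of size chunkspec.
            // Illustrative numbers (assumed, not measured): trip=1000, nproc=4, K=2
            // => the first grab takes ~1000/8 = 125 iterations, the next ~875/8 = 109,
            // and so on until the dynamic fallback kicks in.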
            case kmp_sch_guided_iterative_chunked:
                {
                    T chunkspec = pr->u.p.parm1;
                    KD_TRACE(100,
                        ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
                    trip = pr->u.p.tc;
                    // Start atomic part of calculations
                    while(1) {
                        ST remaining;              // signed, because can be < 0
                        init = sh->u.s.iteration;  // shared value
                        remaining = trip - init;
                        if ( remaining <= 0 ) {    // AC: need to compare with 0 first
                            // nothing to do, don't try atomic op
                            status = 0;
                            break;
                        }
                        if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
                            // use dynamic-style schedule
                            // atomically increment iterations, get old value
                            init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
                            remaining = trip - init;
                            if (remaining <= 0) {
                                status = 0;    // all iterations got by other threads
                            } else {
                                // got some iterations to work on
                                status = 1;
                                if ( (T)remaining > chunkspec ) {
                                    limit = init + chunkspec - 1;
                                } else {
                                    last = 1;   // the last chunk
                                    limit = init + remaining - 1;
                                } // if
                            } // if
                            break;
                        } // if
                        limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
                        if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
                            // CAS was successful, chunk obtained
                            status = 1;
                            --limit;
                            break;
                        } // if
                    } // while
                    if ( status != 0 ) {
                        start = pr->u.p.lb;
                        incr  = pr->u.p.st;
                        if ( p_st != NULL )
                            *p_st = incr;
                        *p_lb = start + init * incr;
                        *p_ub = start + limit * incr;
                        if ( pr->ordered ) {
                            pr->u.p.ordered_lower = init;
                            pr->u.p.ordered_upper = limit;
                            #ifdef KMP_DEBUG
                            {
                                const char * buff;
                                // create format specifiers before the debug output
                                buff = __kmp_str_format(
                                    "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                                    traits_t< UT >::spec, traits_t< UT >::spec );
                                KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                                __kmp_str_free( &buff );
                            }
                            #endif
                        } // if
                    } else {
                        *p_lb = 0;
                        *p_ub = 0;
                        if ( p_st != NULL )
                            *p_st = 0;
                    } // if
                } // case
                break;
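            // Guided (analytical form): chunk sizes decrease roughly geometrically.
            // parm3 holds a pre-computed long-double base (read below via *(DBL*)&parm3)
            // that __kmp_dispatch_guided_remaining() uses to estimate how many iterations
            // are still left after a given chunk index, so chunk k covers
            // remaining(k) - remaining(k+1) iterations; parm2 is the chunk index past
            // which the sequence is abandoned in favour of plain dynamic-style chunks of
            // size chunkspec.  (The base is assumed to sit just below 1, roughly
            // 1 - 1/(2*nproc), which is why the extra FP precision below matters.)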
            case kmp_sch_guided_analytical_chunked:
                {
                    T  chunkspec = pr->u.p.parm1;
                    UT chunkIdx;
                    #if KMP_OS_WINDOWS && KMP_ARCH_X86
                    /* for storing original FPCW value for Windows* OS on
                       IA-32 architecture 8-byte version */
                    unsigned int oldFpcw;
                    unsigned int fpcwSet = 0;
                    #endif
                    KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
                                   gtid ) );

                    trip = pr->u.p.tc;

                    KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
                    KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);

                    while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
                        chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
                        if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
                            --trip;
                            /* use dynamic-style scheduling */
                            init = chunkIdx * chunkspec + pr->u.p.count;
                            /* need to verify init > 0 in case of overflow in the above calculation */
                            if ( (status = (init > 0 && init <= trip)) != 0 ) {
                                limit = init + chunkspec - 1;

                                if ( (last = (limit >= trip)) != 0 )
                                    limit = trip;
                            }
                            break;
                        } else {
                            /* use exponential-style scheduling */
                            /* The following check is to work around the lack of long double precision on Windows* OS.
                               This check works around the possible effect that init != 0 for chunkIdx == 0.
                             */
                            #if KMP_OS_WINDOWS && KMP_ARCH_X86
                            /* If we haven't already done so, save original
                               FPCW and set precision to 64-bit, as Windows* OS
                               on IA-32 architecture defaults to 53-bit */
                            if ( !fpcwSet ) {
                                oldFpcw = _control87(0,0);
                                _control87(_PC_64,_MCW_PC);
                                fpcwSet = 0x30000;
                            }
                            #endif
                            if ( chunkIdx ) {
                                init = __kmp_dispatch_guided_remaining< T >(
                                           trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
                                KMP_DEBUG_ASSERT(init);
                                init = trip - init;
                            } else
                                init = 0;
                            limit = trip - __kmp_dispatch_guided_remaining< T >(
                                               trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
                            KMP_ASSERT(init <= limit);
                            if ( init < limit ) {
                                KMP_DEBUG_ASSERT(limit <= trip);
                                --limit;
                                status = 1;
                                break;
                            } // if
                        } // if
                    } // while (1)
                    #if KMP_OS_WINDOWS && KMP_ARCH_X86
                    /* restore FPCW if necessary
                       AC: check fpcwSet flag first because oldFpcw can be uninitialized here
                    */
                    if ( fpcwSet && ( oldFpcw & fpcwSet ) )
                        _control87(oldFpcw,_MCW_PC);
                    #endif
                    if ( status != 0 ) {
                        start = pr->u.p.lb;
                        incr  = pr->u.p.st;
                        if ( p_st != NULL )
                            *p_st = incr;
                        *p_lb = start + init * incr;
                        *p_ub = start + limit * incr;
                        if ( pr->ordered ) {
                            pr->u.p.ordered_lower = init;
                            pr->u.p.ordered_upper = limit;
                            #ifdef KMP_DEBUG
                            {
                                const char * buff;
                                // create format specifiers before the debug output
                                buff = __kmp_str_format(
                                    "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                                    traits_t< UT >::spec, traits_t< UT >::spec );
                                KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                                __kmp_str_free( &buff );
                            }
                            #endif
                        }
                    } else {
                        *p_lb = 0;
                        *p_ub = 0;
                        if ( p_st != NULL )
                            *p_st = 0;
                    }
                } // case
                break;
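            // Trapezoidal: chunk sizes shrink linearly from the first chunk size (parm2)
            // by a fixed delta (parm4) over parm3 chunks, so the first iteration of chunk
            // 'index' is just the prefix sum of an arithmetic series:
            //
            //     init(index) = sum_{i=0..index-1} (parm2 - i*parm4)
            //                 = index * (2*parm2 - (index-1)*parm4) / 2
            //
            // which is exactly the closed form evaluated in the case below.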
            case kmp_sch_trapezoidal:
                {
                    UT index;
                    T  parm2 = pr->u.p.parm2;
                    T  parm3 = pr->u.p.parm3;
                    T  parm4 = pr->u.p.parm4;
                    KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
                                   gtid ) );

                    index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );

                    init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
                    trip = pr->u.p.tc - 1;

                    if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
                        *p_lb = 0;
                        *p_ub = 0;
                        if ( p_st != NULL ) *p_st = 0;
                    } else {
                        start = pr->u.p.lb;
                        limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
                        incr  = pr->u.p.st;

                        if ( (last = (limit >= trip)) != 0 )
                            limit = trip;

                        if ( p_st != NULL ) *p_st = incr;

                        if ( incr == 1 ) {
                            *p_lb = start + init;
                            *p_ub = start + limit;
                        } else {
                            *p_lb = start + init * incr;
                            *p_ub = start + limit * incr;
                        }

                        if ( pr->ordered ) {
                            pr->u.p.ordered_lower = init;
                            pr->u.p.ordered_upper = limit;
                            #ifdef KMP_DEBUG
                            {
                                const char * buff;
                                // create format specifiers before the debug output
                                buff = __kmp_str_format(
                                    "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                                    traits_t< UT >::spec, traits_t< UT >::spec );
                                KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                                __kmp_str_free( &buff );
                            }
                            #endif
                        } // if
                    } // if
                } // case
                break;
            default:
                {
                    status = 0; // to avoid complaints on uninitialized variable use
                    __kmp_msg(
                        kmp_ms_fatal,                        // Severity
                        KMP_MSG( UnknownSchedTypeDetected ), // Primary message
                        KMP_HNT( GetNewerLibrary ),          // Hint
                        __kmp_msg_null                       // Variadic argument list terminator
                    );
                }
                break;
            } // switch
        } // if (tc == 0)
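        // status == 0 means this thread is done with the loop.  The last thread to get
        // here (num_done reaches nproc-1 after the increment below) resets the shared
        // counters and advances buffer_index so the dispatch buffer can be recycled by
        // the next loop; every finishing thread also detaches its
        // th_dispatch_{pr,sh}_current pointers and, under consistency checking, pops
        // the workshare record.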
        if ( status == 0 ) {
            UT num_done;

            num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
            #ifdef KMP_DEBUG
            {
                const char * buff;
                // create format specifiers before the debug output
                buff = __kmp_str_format(
                    "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
                    traits_t< UT >::spec );
                KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
                __kmp_str_free( &buff );
            }
            #endif

            if ( (ST)num_done == team->t.t_nproc-1 ) {
                /* NOTE: release this buffer to be reused */

                KMP_MB();  /* Flush all pending memory write invalidates.  */

                sh->u.s.num_done = 0;
                sh->u.s.iteration = 0;

                /* TODO replace with general release procedure? */
                if ( pr->ordered ) {
                    sh->u.s.ordered_iteration = 0;
                }

                KMP_MB();  /* Flush all pending memory write invalidates.  */

                sh->buffer_index += __kmp_dispatch_num_buffers;
                KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
                               gtid, sh->buffer_index) );

                KMP_MB();  /* Flush all pending memory write invalidates.  */

            } // if
            if ( __kmp_env_consistency_check ) {
                if ( pr->pushed_ws != ct_none ) {
                    pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
                }
            }

            th->th.th_dispatch->th_deo_fcn = NULL;
            th->th.th_dispatch->th_dxo_fcn = NULL;
            th->th.th_dispatch->th_dispatch_sh_current = NULL;
            th->th.th_dispatch->th_dispatch_pr_current = NULL;
        } // if (status == 0)
        #if KMP_OS_WINDOWS
        else if ( last ) {
            pr->u.p.last_upper = pr->u.p.ub;
        }
        #endif /* KMP_OS_WINDOWS */
        if ( p_last != NULL && status != 0 )
            *p_last = last;
    } // if

    #ifdef KMP_DEBUG
    {
        const char * buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_next: T#%%d normal case: " \
            "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
            traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
        KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
        __kmp_str_free( &buff );
    }
    #endif
    #if INCLUDE_SSC_MARKS
    SSC_MARK_DISPATCH_NEXT();
    #endif
    OMPT_LOOP_END;
    return status;
}
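/*
 * For reference, compiler-generated code typically drives this machinery through
 * the __kmpc_* wrappers defined further below.  A rough sketch for
 * "#pragma omp for schedule(dynamic, 4)" over 0..N-1 (illustrative only; locals
 * and loc handling are simplified):
 *
 *     kmp_int32 lb, ub, st, last;
 *     __kmpc_dispatch_init_4( &loc, gtid, kmp_sch_dynamic_chunked, 0, N - 1, 1, 4 );
 *     while ( __kmpc_dispatch_next_4( &loc, gtid, &last, &lb, &ub, &st ) ) {
 *         for ( kmp_int32 i = lb; i <= ub; i += st )
 *             body( i );
 *     }
 *
 * The __kmpc_dispatch_fini_* entry points are typically only involved when the
 * loop carries an ordered clause.
 */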
template< typename T >
static void
__kmp_dist_get_bounds(
    ident_t                          *loc,
    kmp_int32                         gtid,
    kmp_int32                        *plastiter,
    T                                *plower,
    T                                *pupper,
    typename traits_t< T >::signed_t  incr
) {
    typedef typename traits_t< T >::unsigned_t UT;
    typedef typename traits_t< T >::signed_t   ST;
    register kmp_uint32  team_id;
    register kmp_uint32  nteams;
    register UT          trip_count;
    register kmp_team_t *team;
    kmp_info_t * th;

    KMP_DEBUG_ASSERT( plastiter && plower && pupper );
    KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
    #ifdef KMP_DEBUG
    {
        const char * buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
            "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
            traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
            traits_t< T >::spec );
        KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
        __kmp_str_free( &buff );
    }
    #endif

    if( __kmp_env_consistency_check ) {
        if( incr == 0 ) {
            __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
        }
        if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
            // The loop is illegal.
            // Some zero-trip loops are maintained by the compiler, e.g.:
            //   for(i=10;i<0;++i) // lower >= upper - run-time check
            //   for(i=0;i>10;--i) // lower <= upper - run-time check
            //   for(i=0;i>10;++i) // incr > 0       - compile-time check
            //   for(i=10;i<0;--i) // incr < 0       - compile-time check
            // The compiler does not check the following illegal loops:
            //   for(i=0;i<10;i+=incr)  // where incr<0
            //   for(i=10;i>0;i-=incr)  // where incr<0
            __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
        }
    }
    th   = __kmp_threads[gtid];
    team = th->th.th_team;
    #if OMP_40_ENABLED
    KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
    nteams = th->th.th_teams_size.nteams;
    #endif
    team_id = team->t.t_master_tid;
    KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);

    // compute global trip count
    if( incr == 1 ) {
        trip_count = *pupper - *plower + 1;
    } else if(incr == -1) {
        trip_count = *plower - *pupper + 1;
    } else if ( incr > 0 ) {
        // upper-lower can exceed the limit of signed type
        trip_count = (UT)(*pupper - *plower) / incr + 1;
    } else {
        trip_count = (UT)(*plower - *pupper) / ( -incr ) + 1;
    }

    if( trip_count <= nteams ) {
        KMP_DEBUG_ASSERT(
            __kmp_static == kmp_sch_static_greedy || \
            __kmp_static == kmp_sch_static_balanced
        ); // Unknown static scheduling type.
        // only some teams get a single iteration, the others get nothing
        if( team_id < trip_count ) {
            *pupper = *plower = *plower + team_id * incr;
        } else {
            *plower = *pupper + incr; // zero-trip loop
        }
        if( plastiter != NULL )
            *plastiter = ( team_id == trip_count - 1 );
    } else {
        if( __kmp_static == kmp_sch_static_balanced ) {
            register UT chunk  = trip_count / nteams;
            register UT extras = trip_count % nteams;
            *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
            *pupper  = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
            if( plastiter != NULL )
                *plastiter = ( team_id == nteams - 1 );
        } else {
            register T chunk_inc_count =
                ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
            register T upper = *pupper;
            KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
            // Unknown static scheduling type.
            *plower += team_id * chunk_inc_count;
            *pupper  = *plower + chunk_inc_count - incr;
            // Check/correct bounds if needed
            if( incr > 0 ) {
                if( *pupper < *plower )
                    *pupper = i_maxmin< T >::mx;
                if( plastiter != NULL )
                    *plastiter = *plower <= upper && *pupper > upper - incr;
                if( *pupper > upper )
                    *pupper = upper; // tracker C73258
            } else {
                if( *pupper > *plower )
                    *pupper = i_maxmin< T >::mn;
                if( plastiter != NULL )
                    *plastiter = *plower >= upper && *pupper < upper - incr;
                if( *pupper < upper )
                    *pupper = upper; // tracker C73258
            }
        }
    }
}
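// Worked example for the static_balanced branch above (numbers for illustration
// only): trip_count = 10, nteams = 4  =>  chunk = 2, extras = 2, so the per-team
// ranges come out as 3, 3, 2 and 2 iterations (teams 0 and 1 absorb the extras),
// and *plastiter is set only for team_id == nteams - 1.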
//-----------------------------------------------------------------------------------------
// Dispatch routines
//     Transfer call to template< type T >
//     __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
//                          T lb, T ub, ST st, ST chunk )
extern "C" {

/*!
@ingroup WORK_SHARING
@{
@param loc Source location
@param gtid Global thread id
@param schedule Schedule type
@param lb  Lower bound
@param ub  Upper bound
@param st  Step (or increment if you prefer)
@param chunk The chunk size to block with

This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
These functions are all identical apart from the types of the arguments.
*/

void
__kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                        kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}
/*!
See @ref __kmpc_dispatch_init_4
*/
void
__kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                         kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void
__kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                        kmp_int64 lb, kmp_int64 ub,
                        kmp_int64 st, kmp_int64 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void
__kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                         kmp_uint64 lb, kmp_uint64 ub,
                         kmp_int64 st, kmp_int64 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

/*!
See @ref __kmpc_dispatch_init_4

These functions differ from the __kmpc_dispatch_init set in that they are called
for the composite distribute parallel for construct, so the per-team iteration
space has to be computed before regular iteration dispatching starts.

These functions are all identical apart from the types of the arguments.
*/
void
__kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
    kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
    __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

void
__kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
    kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
    __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

void
__kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
    kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
    __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

void
__kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
    kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
    __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}
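/*
 * For the composite "distribute parallel for" case the only difference from the
 * dispatch sketch given after __kmp_dispatch_next is the init call, e.g.
 * (illustrative only):
 *
 *     kmp_int32 last = 0;
 *     __kmpc_dist_dispatch_init_4( &loc, gtid, kmp_sch_dynamic_chunked,
 *                                  &last, 0, N - 1, 1, 4 );
 *
 * which first trims [lb, ub] to the current team's slice via __kmp_dist_get_bounds
 * and then falls through to the regular __kmp_dispatch_init path.
 */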
/*!
@param loc Source code location
@param gtid Global thread id
@param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
@param p_lb   Pointer to the lower bound for the next chunk of work
@param p_ub   Pointer to the upper bound for the next chunk of work
@param p_st   Pointer to the stride for the next chunk of work
@return one if there is work to be done, zero otherwise

Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then the lb, ub and stride need not be modified.
*/
int
__kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                        kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
{
    return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int
__kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                         kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
{
    return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int
__kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                        kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
{
    return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int
__kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                         kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
{
    return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

/*!
@param loc Source code location
@param gtid Global thread id

Mark the end of a dynamic loop.
*/
void
__kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void
__kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void
__kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void
__kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
}
/*! @} */

//-----------------------------------------------------------------------------------------
// Non-template routines from kmp_dispatch.c used in other sources

kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
    return value == checker;
}

kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
    return value != checker;
}

kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
    return value < checker;
}

kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
    return value >= checker;
}

kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
    return value <= checker;
}

kmp_uint32
__kmp_wait_yield_4(volatile kmp_uint32 * spinner,
                   kmp_uint32            checker,
                   kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
                   , void        * obj    // Higher-level synchronization object, or NULL.
                   )
{
    // note: we may not belong to a team at this point
    register volatile kmp_uint32 * spin  = spinner;
    register kmp_uint32            check = checker;
    register kmp_uint32            spins;
    register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
    register kmp_uint32            r;

    KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
    KMP_INIT_YIELD( spins );
    // main wait spin loop
    while(!f(r = TCR_4(*spin), check)) {
        KMP_FSYNC_SPIN_PREPARE( obj );
        /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
           It causes problems with infinite recursion because of exit lock */
        /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
            __kmp_abort_thread(); */

        /* if we have waited a bit, or are oversubscribed, yield */
        /* pause is in the following code */
        KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
        KMP_YIELD_SPIN( spins );
    }
    KMP_FSYNC_SPIN_ACQUIRED( obj );
    return r;
}
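/*
 * __kmp_wait_yield_4 is used elsewhere in the runtime to spin (with pausing and
 * yielding) until a 4-byte flag satisfies a predicate, e.g. roughly:
 *
 *     // wait until *flag becomes equal to 'value'
 *     __kmp_wait_yield_4( flag, value, __kmp_eq_4, NULL );
 *
 * The predicate receives (current value, checker) and the final observed value
 * is returned to the caller.
 */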
void
__kmp_wait_yield_4_ptr(void *spinner,
                       kmp_uint32 checker,
                       kmp_uint32 (*pred)( void *, kmp_uint32 ),
                       void *obj    // Higher-level synchronization object, or NULL.
                       )
{
    // note: we may not belong to a team at this point
    register void       *spin  = spinner;
    register kmp_uint32  check = checker;
    register kmp_uint32  spins;
    register kmp_uint32 (*f) ( void *, kmp_uint32 ) = pred;

    KMP_FSYNC_SPIN_INIT( obj, spin );
    KMP_INIT_YIELD( spins );
    // main wait spin loop
    while ( !f( spin, check ) ) {
        KMP_FSYNC_SPIN_PREPARE( obj );
        /* if we have waited a bit, or are oversubscribed, yield */
        /* pause is in the following code */
        KMP_YIELD( TCR_4( __kmp_nth ) > __kmp_avail_proc );
        KMP_YIELD_SPIN( spins );
    }
    KMP_FSYNC_SPIN_ACQUIRED( obj );
}

} // extern "C"

#ifdef KMP_GOMP_COMPAT

void
__kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                           kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
                           kmp_int32 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
                                      push_ws );
}

void
__kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                            kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
                            kmp_int32 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
                                       push_ws );
}

void
__kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                           kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
                           kmp_int64 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
                                      push_ws );
}

void
__kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                            kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
                            kmp_int64 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
                                       push_ws );
}

void
__kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
}

void
__kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
}

void
__kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
}

void
__kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
}

#endif /* KMP_GOMP_COMPAT */

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */