1 /* 2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch. 3 */ 4 5 6 //===----------------------------------------------------------------------===// 7 // 8 // The LLVM Compiler Infrastructure 9 // 10 // This file is dual licensed under the MIT and the University of Illinois Open 11 // Source Licenses. See LICENSE.txt for details. 12 // 13 //===----------------------------------------------------------------------===// 14 15 16 /* 17 * Dynamic scheduling initialization and dispatch. 18 * 19 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however 20 * it may change values between parallel regions. __kmp_max_nth 21 * is the largest value __kmp_nth may take, 1 is the smallest. 22 * 23 */ 24 25 /* ------------------------------------------------------------------------ */ 26 /* ------------------------------------------------------------------------ */ 27 28 #include "kmp.h" 29 #include "kmp_i18n.h" 30 #include "kmp_itt.h" 31 #include "kmp_str.h" 32 #include "kmp_error.h" 33 #include "kmp_stats.h" 34 #if KMP_OS_WINDOWS && KMP_ARCH_X86 35 #include <float.h> 36 #endif 37 38 #if OMPT_SUPPORT 39 #include "ompt-internal.h" 40 #include "ompt-specific.h" 41 #endif 42 43 /* ------------------------------------------------------------------------ */ 44 /* ------------------------------------------------------------------------ */ 45 46 // template for type limits 47 template< typename T > 48 struct i_maxmin { 49 static const T mx; 50 static const T mn; 51 }; 52 template<> 53 struct i_maxmin< int > { 54 static const int mx = 0x7fffffff; 55 static const int mn = 0x80000000; 56 }; 57 template<> 58 struct i_maxmin< unsigned int > { 59 static const unsigned int mx = 0xffffffff; 60 static const unsigned int mn = 0x00000000; 61 }; 62 template<> 63 struct i_maxmin< long long > { 64 static const long long mx = 0x7fffffffffffffffLL; 65 static const long long mn = 0x8000000000000000LL; 66 }; 67 template<> 68 struct i_maxmin< unsigned long long > { 69 static const unsigned long long mx = 0xffffffffffffffffLL; 70 static const unsigned long long mn = 0x0000000000000000LL; 71 }; 72 //------------------------------------------------------------------------- 73 74 #ifdef KMP_STATIC_STEAL_ENABLED 75 76 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types 77 template< typename T > 78 struct dispatch_private_infoXX_template { 79 typedef typename traits_t< T >::unsigned_t UT; 80 typedef typename traits_t< T >::signed_t ST; 81 UT count; // unsigned 82 T ub; 83 /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */ 84 T lb; 85 ST st; // signed 86 UT tc; // unsigned 87 T static_steal_counter; // for static_steal only; maybe better to put after ub 88 89 /* parm[1-4] are used in different ways by different scheduling algorithms */ 90 91 // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on ) 92 // a) parm3 is properly aligned and 93 // b) all parm1-4 are in the same cache line. 94 // Because of parm1-4 are used together, performance seems to be better 95 // if they are in the same line (not measured though). 
96 97 struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4 98 T parm1; 99 T parm2; 100 T parm3; 101 T parm4; 102 }; 103 104 UT ordered_lower; // unsigned 105 UT ordered_upper; // unsigned 106 #if KMP_OS_WINDOWS 107 T last_upper; 108 #endif /* KMP_OS_WINDOWS */ 109 }; 110 111 #else /* KMP_STATIC_STEAL_ENABLED */ 112 113 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types 114 template< typename T > 115 struct dispatch_private_infoXX_template { 116 typedef typename traits_t< T >::unsigned_t UT; 117 typedef typename traits_t< T >::signed_t ST; 118 T lb; 119 T ub; 120 ST st; // signed 121 UT tc; // unsigned 122 123 T parm1; 124 T parm2; 125 T parm3; 126 T parm4; 127 128 UT count; // unsigned 129 130 UT ordered_lower; // unsigned 131 UT ordered_upper; // unsigned 132 #if KMP_OS_WINDOWS 133 T last_upper; 134 #endif /* KMP_OS_WINDOWS */ 135 }; 136 137 #endif /* KMP_STATIC_STEAL_ENABLED */ 138 139 // replaces dispatch_private_info structure and dispatch_private_info_t type 140 template< typename T > 141 struct KMP_ALIGN_CACHE dispatch_private_info_template { 142 // duplicate alignment here, otherwise size of structure is not correct in our compiler 143 union KMP_ALIGN_CACHE private_info_tmpl { 144 dispatch_private_infoXX_template< T > p; 145 dispatch_private_info64_t p64; 146 } u; 147 enum sched_type schedule; /* scheduling algorithm */ 148 kmp_uint32 ordered; /* ordered clause specified */ 149 kmp_uint32 ordered_bumped; 150 kmp_int32 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making order 151 dispatch_private_info * next; /* stack of buffers for nest of serial regions */ 152 kmp_uint32 nomerge; /* don't merge iters if serialized */ 153 kmp_uint32 type_size; 154 enum cons_type pushed_ws; 155 }; 156 157 158 // replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types 159 template< typename UT > 160 struct dispatch_shared_infoXX_template { 161 /* chunk index under dynamic, number of idle threads under static-steal; 162 iteration index otherwise */ 163 volatile UT iteration; 164 volatile UT num_done; 165 volatile UT ordered_iteration; 166 UT ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar 167 }; 168 169 // replaces dispatch_shared_info structure and dispatch_shared_info_t type 170 template< typename UT > 171 struct dispatch_shared_info_template { 172 // we need union here to keep the structure size 173 union shared_info_tmpl { 174 dispatch_shared_infoXX_template< UT > s; 175 dispatch_shared_info64_t s64; 176 } u; 177 volatile kmp_uint32 buffer_index; 178 }; 179 180 /* ------------------------------------------------------------------------ */ 181 /* ------------------------------------------------------------------------ */ 182 183 #undef USE_TEST_LOCKS 184 185 // test_then_add template (general template should NOT be used) 186 template< typename T > 187 static __forceinline T 188 test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); }; 189 190 template<> 191 __forceinline kmp_int32 192 test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d ) 193 { 194 kmp_int32 r; 195 r = KMP_TEST_THEN_ADD32( p, d ); 196 return r; 197 } 198 199 template<> 200 __forceinline kmp_int64 201 test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d ) 202 { 203 kmp_int64 r; 204 r = KMP_TEST_THEN_ADD64( p, d ); 205 return r; 206 } 207 208 // test_then_inc_acq template (general template should NOT be used) 209 template< typename T > 210 static 
__forceinline T 211 test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); }; 212 213 template<> 214 __forceinline kmp_int32 215 test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p ) 216 { 217 kmp_int32 r; 218 r = KMP_TEST_THEN_INC_ACQ32( p ); 219 return r; 220 } 221 222 template<> 223 __forceinline kmp_int64 224 test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p ) 225 { 226 kmp_int64 r; 227 r = KMP_TEST_THEN_INC_ACQ64( p ); 228 return r; 229 } 230 231 // test_then_inc template (general template should NOT be used) 232 template< typename T > 233 static __forceinline T 234 test_then_inc( volatile T *p ) { KMP_ASSERT(0); }; 235 236 template<> 237 __forceinline kmp_int32 238 test_then_inc< kmp_int32 >( volatile kmp_int32 *p ) 239 { 240 kmp_int32 r; 241 r = KMP_TEST_THEN_INC32( p ); 242 return r; 243 } 244 245 template<> 246 __forceinline kmp_int64 247 test_then_inc< kmp_int64 >( volatile kmp_int64 *p ) 248 { 249 kmp_int64 r; 250 r = KMP_TEST_THEN_INC64( p ); 251 return r; 252 } 253 254 // compare_and_swap template (general template should NOT be used) 255 template< typename T > 256 static __forceinline kmp_int32 257 compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); }; 258 259 template<> 260 __forceinline kmp_int32 261 compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s ) 262 { 263 return KMP_COMPARE_AND_STORE_REL32( p, c, s ); 264 } 265 266 template<> 267 __forceinline kmp_int32 268 compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s ) 269 { 270 return KMP_COMPARE_AND_STORE_REL64( p, c, s ); 271 } 272 273 /* 274 Spin wait loop that first does pause, then yield. 275 Waits until function returns non-zero when called with *spinner and check. 276 Does NOT put threads to sleep. 277 #if USE_ITT_BUILD 278 Arguments: 279 obj -- is higher-level synchronization object to report to ittnotify. It is used to report 280 locks consistently. For example, if lock is acquired immediately, its address is 281 reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, it lock cannot be acquired 282 immediately and lock routine calls to KMP_WAIT_YIELD(), the later should report the same 283 address, not an address of low-level spinner. 284 #endif // USE_ITT_BUILD 285 */ 286 template< typename UT > 287 // ToDo: make inline function (move to header file for icl) 288 static UT // unsigned 4- or 8-byte type 289 __kmp_wait_yield( volatile UT * spinner, 290 UT checker, 291 kmp_uint32 (* pred)( UT, UT ) 292 USE_ITT_BUILD_ARG(void * obj) // Higher-level synchronization object, or NULL. 293 ) 294 { 295 // note: we may not belong to a team at this point 296 register volatile UT * spin = spinner; 297 register UT check = checker; 298 register kmp_uint32 spins; 299 register kmp_uint32 (*f) ( UT, UT ) = pred; 300 register UT r; 301 302 KMP_FSYNC_SPIN_INIT( obj, (void*) spin ); 303 KMP_INIT_YIELD( spins ); 304 // main wait spin loop 305 while(!f(r = *spin, check)) 306 { 307 KMP_FSYNC_SPIN_PREPARE( obj ); 308 /* GEH - remove this since it was accidentally introduced when kmp_wait was split. 
309 It causes problems with infinite recursion because of exit lock */ 310 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 311 __kmp_abort_thread(); */ 312 313 // if we are oversubscribed, 314 // or have waited a bit (and KMP_LIBRARY=throughput, then yield 315 // pause is in the following code 316 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc ); 317 KMP_YIELD_SPIN( spins ); 318 } 319 KMP_FSYNC_SPIN_ACQUIRED( obj ); 320 return r; 321 } 322 323 template< typename UT > 324 static kmp_uint32 __kmp_eq( UT value, UT checker) { 325 return value == checker; 326 } 327 328 template< typename UT > 329 static kmp_uint32 __kmp_neq( UT value, UT checker) { 330 return value != checker; 331 } 332 333 template< typename UT > 334 static kmp_uint32 __kmp_lt( UT value, UT checker) { 335 return value < checker; 336 } 337 338 template< typename UT > 339 static kmp_uint32 __kmp_ge( UT value, UT checker) { 340 return value >= checker; 341 } 342 343 template< typename UT > 344 static kmp_uint32 __kmp_le( UT value, UT checker) { 345 return value <= checker; 346 } 347 348 349 /* ------------------------------------------------------------------------ */ 350 /* ------------------------------------------------------------------------ */ 351 352 static void 353 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 354 { 355 kmp_info_t *th; 356 357 KMP_DEBUG_ASSERT( gtid_ref ); 358 359 if ( __kmp_env_consistency_check ) { 360 th = __kmp_threads[*gtid_ref]; 361 if ( th -> th.th_root -> r.r_active 362 && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) { 363 #if KMP_USE_DYNAMIC_LOCK 364 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 ); 365 #else 366 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL ); 367 #endif 368 } 369 } 370 } 371 372 template< typename UT > 373 static void 374 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 375 { 376 typedef typename traits_t< UT >::signed_t ST; 377 dispatch_private_info_template< UT > * pr; 378 379 int gtid = *gtid_ref; 380 // int cid = *cid_ref; 381 kmp_info_t *th = __kmp_threads[ gtid ]; 382 KMP_DEBUG_ASSERT( th -> th.th_dispatch ); 383 384 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) ); 385 if ( __kmp_env_consistency_check ) { 386 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 387 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 388 if ( pr -> pushed_ws != ct_none ) { 389 #if KMP_USE_DYNAMIC_LOCK 390 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 ); 391 #else 392 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL ); 393 #endif 394 } 395 } 396 397 if ( ! th -> th.th_team -> t.t_serialized ) { 398 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 399 ( th -> th.th_dispatch -> th_dispatch_sh_current ); 400 UT lower; 401 402 if ( ! __kmp_env_consistency_check ) { 403 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 404 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 405 } 406 lower = pr->u.p.ordered_lower; 407 408 #if ! 
defined( KMP_GOMP_COMPAT ) 409 if ( __kmp_env_consistency_check ) { 410 if ( pr->ordered_bumped ) { 411 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons; 412 __kmp_error_construct2( 413 kmp_i18n_msg_CnsMultipleNesting, 414 ct_ordered_in_pdo, loc_ref, 415 & p->stack_data[ p->w_top ] 416 ); 417 } 418 } 419 #endif /* !defined(KMP_GOMP_COMPAT) */ 420 421 KMP_MB(); 422 #ifdef KMP_DEBUG 423 { 424 const char * buff; 425 // create format specifiers before the debug output 426 buff = __kmp_str_format( 427 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n", 428 traits_t< UT >::spec, traits_t< UT >::spec ); 429 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 430 __kmp_str_free( &buff ); 431 } 432 #endif 433 434 __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 435 USE_ITT_BUILD_ARG( NULL ) 436 ); 437 KMP_MB(); /* is this necessary? */ 438 #ifdef KMP_DEBUG 439 { 440 const char * buff; 441 // create format specifiers before the debug output 442 buff = __kmp_str_format( 443 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n", 444 traits_t< UT >::spec, traits_t< UT >::spec ); 445 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 446 __kmp_str_free( &buff ); 447 } 448 #endif 449 } 450 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) ); 451 } 452 453 static void 454 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 455 { 456 kmp_info_t *th; 457 458 if ( __kmp_env_consistency_check ) { 459 th = __kmp_threads[*gtid_ref]; 460 if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) { 461 __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref ); 462 } 463 } 464 } 465 466 template< typename UT > 467 static void 468 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 469 { 470 typedef typename traits_t< UT >::signed_t ST; 471 dispatch_private_info_template< UT > * pr; 472 473 int gtid = *gtid_ref; 474 // int cid = *cid_ref; 475 kmp_info_t *th = __kmp_threads[ gtid ]; 476 KMP_DEBUG_ASSERT( th -> th.th_dispatch ); 477 478 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) ); 479 if ( __kmp_env_consistency_check ) { 480 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 481 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 482 if ( pr -> pushed_ws != ct_none ) { 483 __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref ); 484 } 485 } 486 487 if ( ! th -> th.th_team -> t.t_serialized ) { 488 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 489 ( th -> th.th_dispatch -> th_dispatch_sh_current ); 490 491 if ( ! __kmp_env_consistency_check ) { 492 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 493 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 494 } 495 496 KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration ); 497 #if ! defined( KMP_GOMP_COMPAT ) 498 if ( __kmp_env_consistency_check ) { 499 if ( pr->ordered_bumped != 0 ) { 500 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons; 501 /* How to test it? - OM */ 502 __kmp_error_construct2( 503 kmp_i18n_msg_CnsMultipleNesting, 504 ct_ordered_in_pdo, loc_ref, 505 & p->stack_data[ p->w_top ] 506 ); 507 } 508 } 509 #endif /* !defined(KMP_GOMP_COMPAT) */ 510 511 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 512 513 pr->ordered_bumped += 1; 514 515 KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n", 516 gtid, pr->ordered_bumped ) ); 517 518 KMP_MB(); /* Flush all pending memory write invalidates. */ 519 520 /* TODO use general release procedure? */ 521 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration ); 522 523 KMP_MB(); /* Flush all pending memory write invalidates. */ 524 } 525 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) ); 526 } 527 528 /* Computes and returns x to the power of y, where y must a non-negative integer */ 529 template< typename UT > 530 static __forceinline long double 531 __kmp_pow(long double x, UT y) { 532 long double s=1.0L; 533 534 KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0); 535 //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned 536 while(y) { 537 if ( y & 1 ) 538 s *= x; 539 x *= x; 540 y >>= 1; 541 } 542 return s; 543 } 544 545 /* Computes and returns the number of unassigned iterations after idx chunks have been assigned 546 (the total number of unassigned iterations in chunks with index greater than or equal to idx). 547 __forceinline seems to be broken so that if we __forceinline this function, the behavior is wrong 548 (one of the unit tests, sch_guided_analytical_basic.cpp, fails) 549 */ 550 template< typename T > 551 static __inline typename traits_t< T >::unsigned_t 552 __kmp_dispatch_guided_remaining( 553 T tc, 554 typename traits_t< T >::floating_t base, 555 typename traits_t< T >::unsigned_t idx 556 ) { 557 /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at 558 least for ICL 8.1, long double arithmetic may not really have 559 long double precision, even with /Qlong_double. Currently, we 560 workaround that in the caller code, by manipulating the FPCW for 561 Windows* OS on IA-32 architecture. The lack of precision is not 562 expected to be a correctness issue, though. 563 */ 564 typedef typename traits_t< T >::unsigned_t UT; 565 566 long double x = tc * __kmp_pow< UT >(base, idx); 567 UT r = (UT) x; 568 if ( x == r ) 569 return r; 570 return r + 1; 571 } 572 573 // Parameters of the guided-iterative algorithm: 574 // p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic 575 // p3 = 1 / ( n * nproc ) // remaining iterations multiplier 576 // by default n = 2. For example with n = 3 the chunks distribution will be more flat. 577 // With n = 1 first chunk is the same as for static schedule, e.g. trip / nproc. 
578 static int guided_int_param = 2; 579 static double guided_flt_param = 0.5;// = 1.0 / guided_int_param; 580 581 // UT - unsigned flavor of T, ST - signed flavor of T, 582 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8 583 template< typename T > 584 static void 585 __kmp_dispatch_init( 586 ident_t * loc, 587 int gtid, 588 enum sched_type schedule, 589 T lb, 590 T ub, 591 typename traits_t< T >::signed_t st, 592 typename traits_t< T >::signed_t chunk, 593 int push_ws 594 ) { 595 typedef typename traits_t< T >::unsigned_t UT; 596 typedef typename traits_t< T >::signed_t ST; 597 typedef typename traits_t< T >::floating_t DBL; 598 static const int ___kmp_size_type = sizeof( UT ); 599 600 int active; 601 T tc; 602 kmp_info_t * th; 603 kmp_team_t * team; 604 kmp_uint32 my_buffer_index; 605 dispatch_private_info_template< T > * pr; 606 dispatch_shared_info_template< UT > volatile * sh; 607 608 KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) ); 609 KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) ); 610 611 if ( ! TCR_4( __kmp_init_parallel ) ) 612 __kmp_parallel_initialize(); 613 614 #if INCLUDE_SSC_MARKS 615 SSC_MARK_DISPATCH_INIT(); 616 #endif 617 #ifdef KMP_DEBUG 618 { 619 const char * buff; 620 // create format specifiers before the debug output 621 buff = __kmp_str_format( 622 "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n", 623 traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 624 KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) ); 625 __kmp_str_free( &buff ); 626 } 627 #endif 628 /* setup data */ 629 th = __kmp_threads[ gtid ]; 630 team = th -> th.th_team; 631 active = ! team -> t.t_serialized; 632 th->th.th_ident = loc; 633 634 #if USE_ITT_BUILD 635 kmp_uint64 cur_chunk = chunk; 636 int itt_need_metadata_reporting = __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 637 KMP_MASTER_GTID(gtid) && 638 #if OMP_40_ENABLED 639 th->th.th_teams_microtask == NULL && 640 #endif 641 team->t.t_active_level == 1; 642 #endif 643 if ( ! active ) { 644 pr = reinterpret_cast< dispatch_private_info_template< T >* > 645 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */ 646 } else { 647 KMP_DEBUG_ASSERT( th->th.th_dispatch == 648 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 649 650 my_buffer_index = th->th.th_dispatch->th_disp_index ++; 651 652 /* What happens when number of threads changes, need to resize buffer? 
*/ 653 pr = reinterpret_cast< dispatch_private_info_template< T > * > 654 ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] ); 655 sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * > 656 ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] ); 657 } 658 659 /* Pick up the nomerge/ordered bits from the scheduling type */ 660 if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) { 661 pr->nomerge = TRUE; 662 schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); 663 } else { 664 pr->nomerge = FALSE; 665 } 666 pr->type_size = ___kmp_size_type; // remember the size of variables 667 if ( kmp_ord_lower & schedule ) { 668 pr->ordered = TRUE; 669 schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); 670 } else { 671 pr->ordered = FALSE; 672 } 673 674 if ( schedule == kmp_sch_static ) { 675 schedule = __kmp_static; 676 } else { 677 if ( schedule == kmp_sch_runtime ) { 678 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified) 679 schedule = team -> t.t_sched.r_sched_type; 680 // Detail the schedule if needed (global controls are differentiated appropriately) 681 if ( schedule == kmp_sch_guided_chunked ) { 682 schedule = __kmp_guided; 683 } else if ( schedule == kmp_sch_static ) { 684 schedule = __kmp_static; 685 } 686 // Use the chunk size specified by OMP_SCHEDULE (or default if not specified) 687 chunk = team -> t.t_sched.chunk; 688 689 #ifdef KMP_DEBUG 690 { 691 const char * buff; 692 // create format specifiers before the debug output 693 buff = __kmp_str_format( 694 "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n", 695 traits_t< ST >::spec ); 696 KD_TRACE(10, ( buff, gtid, schedule, chunk ) ); 697 __kmp_str_free( &buff ); 698 } 699 #endif 700 } else { 701 if ( schedule == kmp_sch_guided_chunked ) { 702 schedule = __kmp_guided; 703 } 704 if ( chunk <= 0 ) { 705 chunk = KMP_DEFAULT_CHUNK; 706 } 707 } 708 709 if ( schedule == kmp_sch_auto ) { 710 // mapping and differentiation: in the __kmp_do_serial_initialize() 711 schedule = __kmp_auto; 712 #ifdef KMP_DEBUG 713 { 714 const char * buff; 715 // create format specifiers before the debug output 716 buff = __kmp_str_format( 717 "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n", 718 traits_t< ST >::spec ); 719 KD_TRACE(10, ( buff, gtid, schedule, chunk ) ); 720 __kmp_str_free( &buff ); 721 } 722 #endif 723 } 724 725 /* guided analytical not safe for too many threads */ 726 if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) { 727 schedule = kmp_sch_guided_iterative_chunked; 728 KMP_WARNING( DispatchManyThreads ); 729 } 730 pr->u.p.parm1 = chunk; 731 } 732 KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper), 733 "unknown scheduling type" ); 734 735 pr->u.p.count = 0; 736 737 if ( __kmp_env_consistency_check ) { 738 if ( st == 0 ) { 739 __kmp_error_construct( 740 kmp_i18n_msg_CnsLoopIncrZeroProhibited, 741 ( pr->ordered ? 
ct_pdo_ordered : ct_pdo ), loc 742 ); 743 } 744 } 745 746 tc = ( ub - lb + st ); 747 if ( st != 1 ) { 748 if ( st < 0 ) { 749 if ( lb < ub ) { 750 tc = 0; // zero-trip 751 } else { // lb >= ub 752 tc = (ST)tc / st; // convert to signed division 753 } 754 } else { // st > 0 755 if ( ub < lb ) { 756 tc = 0; // zero-trip 757 } else { // lb >= ub 758 tc /= st; 759 } 760 } 761 } else if ( ub < lb ) { // st == 1 762 tc = 0; // zero-trip 763 } 764 765 // Any half-decent optimizer will remove this test when the blocks are empty since the macros expand to nothing 766 // when statistics are disabled. 767 if (schedule == __kmp_static) 768 { 769 KMP_COUNT_BLOCK(OMP_FOR_static); 770 KMP_COUNT_VALUE(FOR_static_iterations, tc); 771 } 772 else 773 { 774 KMP_COUNT_BLOCK(OMP_FOR_dynamic); 775 KMP_COUNT_VALUE(FOR_dynamic_iterations, tc); 776 } 777 778 pr->u.p.lb = lb; 779 pr->u.p.ub = ub; 780 pr->u.p.st = st; 781 pr->u.p.tc = tc; 782 783 #if KMP_OS_WINDOWS 784 pr->u.p.last_upper = ub + st; 785 #endif /* KMP_OS_WINDOWS */ 786 787 /* NOTE: only the active parallel region(s) has active ordered sections */ 788 789 if ( active ) { 790 if ( pr->ordered == 0 ) { 791 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error; 792 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error; 793 } else { 794 pr->ordered_bumped = 0; 795 796 pr->u.p.ordered_lower = 1; 797 pr->u.p.ordered_upper = 0; 798 799 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >; 800 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >; 801 } 802 } 803 804 if ( __kmp_env_consistency_check ) { 805 enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo; 806 if ( push_ws ) { 807 __kmp_push_workshare( gtid, ws, loc ); 808 pr->pushed_ws = ws; 809 } else { 810 __kmp_check_workshare( gtid, ws, loc ); 811 pr->pushed_ws = ct_none; 812 } 813 } 814 815 switch ( schedule ) { 816 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 817 case kmp_sch_static_steal: 818 { 819 T nproc = team->t.t_nproc; 820 T ntc, init; 821 822 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) ); 823 824 ntc = (tc % chunk ? 1 : 0) + tc / chunk; 825 if ( nproc > 1 && ntc >= nproc ) { 826 T id = __kmp_tid_from_gtid(gtid); 827 T small_chunk, extras; 828 829 small_chunk = ntc / nproc; 830 extras = ntc % nproc; 831 832 init = id * small_chunk + ( id < extras ? id : extras ); 833 pr->u.p.count = init; 834 pr->u.p.ub = init + small_chunk + ( id < extras ? 
1 : 0 ); 835 836 pr->u.p.parm2 = lb; 837 //pr->pfields.parm3 = 0; // it's not used in static_steal 838 pr->u.p.parm4 = id; 839 pr->u.p.st = st; 840 break; 841 } else { 842 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n", 843 gtid ) ); 844 schedule = kmp_sch_static_balanced; 845 /* too few iterations: fall-through to kmp_sch_static_balanced */ 846 } // if 847 /* FALL-THROUGH to static balanced */ 848 } // case 849 #endif 850 case kmp_sch_static_balanced: 851 { 852 T nproc = team->t.t_nproc; 853 T init, limit; 854 855 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n", 856 gtid ) ); 857 858 if ( nproc > 1 ) { 859 T id = __kmp_tid_from_gtid(gtid); 860 861 if ( tc < nproc ) { 862 if ( id < tc ) { 863 init = id; 864 limit = id; 865 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ 866 } else { 867 pr->u.p.count = 1; /* means no more chunks to execute */ 868 pr->u.p.parm1 = FALSE; 869 break; 870 } 871 } else { 872 T small_chunk = tc / nproc; 873 T extras = tc % nproc; 874 init = id * small_chunk + (id < extras ? id : extras); 875 limit = init + small_chunk - (id < extras ? 0 : 1); 876 pr->u.p.parm1 = (id == nproc - 1); 877 } 878 } else { 879 if ( tc > 0 ) { 880 init = 0; 881 limit = tc - 1; 882 pr->u.p.parm1 = TRUE; 883 } else { 884 // zero trip count 885 pr->u.p.count = 1; /* means no more chunks to execute */ 886 pr->u.p.parm1 = FALSE; 887 break; 888 } 889 } 890 #if USE_ITT_BUILD 891 // Calculate chunk for metadata report 892 if ( itt_need_metadata_reporting ) 893 cur_chunk = limit - init + 1; 894 #endif 895 if ( st == 1 ) { 896 pr->u.p.lb = lb + init; 897 pr->u.p.ub = lb + limit; 898 } else { 899 T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound 900 pr->u.p.lb = lb + init * st; 901 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly 902 if ( st > 0 ) { 903 pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp ); 904 } else { 905 pr->u.p.ub = ( ub_tmp + st < ub ? 
ub : ub_tmp ); 906 } 907 } 908 if ( pr->ordered ) { 909 pr->u.p.ordered_lower = init; 910 pr->u.p.ordered_upper = limit; 911 } 912 break; 913 } // case 914 case kmp_sch_guided_iterative_chunked : 915 { 916 T nproc = team->t.t_nproc; 917 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid)); 918 919 if ( nproc > 1 ) { 920 if ( (2L * chunk + 1 ) * nproc >= tc ) { 921 /* chunk size too large, switch to dynamic */ 922 schedule = kmp_sch_dynamic_chunked; 923 } else { 924 // when remaining iters become less than parm2 - switch to dynamic 925 pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 ); 926 *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4 927 } 928 } else { 929 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid)); 930 schedule = kmp_sch_static_greedy; 931 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 932 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid)); 933 pr->u.p.parm1 = tc; 934 } // if 935 } // case 936 break; 937 case kmp_sch_guided_analytical_chunked: 938 { 939 T nproc = team->t.t_nproc; 940 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid)); 941 942 if ( nproc > 1 ) { 943 if ( (2L * chunk + 1 ) * nproc >= tc ) { 944 /* chunk size too large, switch to dynamic */ 945 schedule = kmp_sch_dynamic_chunked; 946 } else { 947 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 948 DBL x; 949 950 #if KMP_OS_WINDOWS && KMP_ARCH_X86 951 /* Linux* OS already has 64-bit computation by default for 952 long double, and on Windows* OS on Intel(R) 64, 953 /Qlong_double doesn't work. On Windows* OS 954 on IA-32 architecture, we need to set precision to 955 64-bit instead of the default 53-bit. Even though long 956 double doesn't work on Windows* OS on Intel(R) 64, the 957 resulting lack of precision is not expected to impact 958 the correctness of the algorithm, but this has not been 959 mathematically proven. 
960 */ 961 // save original FPCW and set precision to 64-bit, as 962 // Windows* OS on IA-32 architecture defaults to 53-bit 963 unsigned int oldFpcw = _control87(0,0); 964 _control87(_PC_64,_MCW_PC); // 0,0x30000 965 #endif 966 /* value used for comparison in solver for cross-over point */ 967 long double target = ((long double)chunk * 2 + 1) * nproc / tc; 968 969 /* crossover point--chunk indexes equal to or greater than 970 this point switch to dynamic-style scheduling */ 971 UT cross; 972 973 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 974 x = (long double)1.0 - (long double)0.5 / nproc; 975 976 #ifdef KMP_DEBUG 977 { // test natural alignment 978 struct _test_a { 979 char a; 980 union { 981 char b; 982 DBL d; 983 }; 984 } t; 985 ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; 986 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment ); 987 KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 ); 988 } 989 #endif // KMP_DEBUG 990 991 /* save the term in thread private dispatch structure */ 992 *(DBL*)&pr->u.p.parm3 = x; 993 994 /* solve for the crossover point to the nearest integer i for which C_i <= chunk */ 995 { 996 UT left, right, mid; 997 long double p; 998 999 /* estimate initial upper and lower bound */ 1000 1001 /* doesn't matter what value right is as long as it is positive, but 1002 it affects performance of the solver 1003 */ 1004 right = 229; 1005 p = __kmp_pow< UT >(x,right); 1006 if ( p > target ) { 1007 do{ 1008 p *= p; 1009 right <<= 1; 1010 } while(p>target && right < (1<<27)); 1011 left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */ 1012 } else { 1013 left = 0; 1014 } 1015 1016 /* bisection root-finding method */ 1017 while ( left + 1 < right ) { 1018 mid = (left + right) / 2; 1019 if ( __kmp_pow< UT >(x,mid) > target ) { 1020 left = mid; 1021 } else { 1022 right = mid; 1023 } 1024 } // while 1025 cross = right; 1026 } 1027 /* assert sanity of computed crossover point */ 1028 KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target); 1029 1030 /* save the crossover point in thread private dispatch structure */ 1031 pr->u.p.parm2 = cross; 1032 1033 // C75803 1034 #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) ) 1035 #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3) 1036 #else 1037 #define GUIDED_ANALYTICAL_WORKAROUND (x) 1038 #endif 1039 /* dynamic-style scheduling offset */ 1040 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk; 1041 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1042 // restore FPCW 1043 _control87(oldFpcw,_MCW_PC); 1044 #endif 1045 } // if 1046 } else { 1047 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n", 1048 gtid ) ); 1049 schedule = kmp_sch_static_greedy; 1050 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 1051 pr->u.p.parm1 = tc; 1052 } // if 1053 } // case 1054 break; 1055 case kmp_sch_static_greedy: 1056 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid)); 1057 pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ? 
1058 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc : 1059 tc; 1060 break; 1061 case kmp_sch_static_chunked : 1062 case kmp_sch_dynamic_chunked : 1063 if ( pr->u.p.parm1 <= 0 ) { 1064 pr->u.p.parm1 = KMP_DEFAULT_CHUNK; 1065 } 1066 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid)); 1067 break; 1068 case kmp_sch_trapezoidal : 1069 { 1070 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ 1071 1072 T parm1, parm2, parm3, parm4; 1073 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) ); 1074 1075 parm1 = chunk; 1076 1077 /* F : size of the first cycle */ 1078 parm2 = ( tc / (2 * team->t.t_nproc) ); 1079 1080 if ( parm2 < 1 ) { 1081 parm2 = 1; 1082 } 1083 1084 /* L : size of the last cycle. Make sure the last cycle 1085 * is not larger than the first cycle. 1086 */ 1087 if ( parm1 < 1 ) { 1088 parm1 = 1; 1089 } else if ( parm1 > parm2 ) { 1090 parm1 = parm2; 1091 } 1092 1093 /* N : number of cycles */ 1094 parm3 = ( parm2 + parm1 ); 1095 parm3 = ( 2 * tc + parm3 - 1) / parm3; 1096 1097 if ( parm3 < 2 ) { 1098 parm3 = 2; 1099 } 1100 1101 /* sigma : decreasing incr of the trapezoid */ 1102 parm4 = ( parm3 - 1 ); 1103 parm4 = ( parm2 - parm1 ) / parm4; 1104 1105 // pointless check, because parm4 >= 0 always 1106 //if ( parm4 < 0 ) { 1107 // parm4 = 0; 1108 //} 1109 1110 pr->u.p.parm1 = parm1; 1111 pr->u.p.parm2 = parm2; 1112 pr->u.p.parm3 = parm3; 1113 pr->u.p.parm4 = parm4; 1114 } // case 1115 break; 1116 1117 default: 1118 { 1119 __kmp_msg( 1120 kmp_ms_fatal, // Severity 1121 KMP_MSG( UnknownSchedTypeDetected ), // Primary message 1122 KMP_HNT( GetNewerLibrary ), // Hint 1123 __kmp_msg_null // Variadic argument list terminator 1124 ); 1125 } 1126 break; 1127 } // switch 1128 pr->schedule = schedule; 1129 if ( active ) { 1130 /* The name of this buffer should be my_buffer_index when it's free to use it */ 1131 1132 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n", 1133 gtid, my_buffer_index, sh->buffer_index) ); 1134 __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 > 1135 USE_ITT_BUILD_ARG( NULL ) 1136 ); 1137 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are 1138 // *always* 32-bit integers. 1139 KMP_MB(); /* is this necessary? */ 1140 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n", 1141 gtid, my_buffer_index, sh->buffer_index) ); 1142 1143 th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr; 1144 th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh; 1145 #if USE_ITT_BUILD 1146 if ( pr->ordered ) { 1147 __kmp_itt_ordered_init( gtid ); 1148 }; // if 1149 // Report loop metadata 1150 if ( itt_need_metadata_reporting ) { 1151 // Only report metadata by master of active team at level 1 1152 kmp_uint64 schedtype = 0; 1153 switch ( schedule ) { 1154 case kmp_sch_static_chunked: 1155 case kmp_sch_static_balanced:// Chunk is calculated in the switch above 1156 break; 1157 case kmp_sch_static_greedy: 1158 cur_chunk = pr->u.p.parm1; 1159 break; 1160 case kmp_sch_dynamic_chunked: 1161 schedtype = 1; 1162 break; 1163 case kmp_sch_guided_iterative_chunked: 1164 case kmp_sch_guided_analytical_chunked: 1165 schedtype = 2; 1166 break; 1167 default: 1168 // Should we put this case under "static"? 
1169 // case kmp_sch_static_steal: 1170 schedtype = 3; 1171 break; 1172 } 1173 __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk); 1174 } 1175 #endif /* USE_ITT_BUILD */ 1176 }; // if 1177 1178 #ifdef KMP_DEBUG 1179 { 1180 const char * buff; 1181 // create format specifiers before the debug output 1182 buff = __kmp_str_format( 1183 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \ 1184 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \ 1185 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n", 1186 traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec, 1187 traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec, 1188 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec, 1189 traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec ); 1190 KD_TRACE(10, ( buff, 1191 gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub, 1192 pr->u.p.st, pr->u.p.tc, pr->u.p.count, 1193 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1, 1194 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) ); 1195 __kmp_str_free( &buff ); 1196 } 1197 #endif 1198 #if ( KMP_STATIC_STEAL_ENABLED ) 1199 if ( ___kmp_size_type < 8 ) { 1200 // It cannot be guaranteed that after execution of a loop with some other schedule kind 1201 // all the parm3 variables will contain the same value. 1202 // Even if all parm3 will be the same, it still exists a bad case like using 0 and 1 1203 // rather than program life-time increment. 1204 // So the dedicated variable is required. The 'static_steal_counter' is used. 1205 if( schedule == kmp_sch_static_steal ) { 1206 // Other threads will inspect this variable when searching for a victim. 1207 // This is a flag showing that other threads may steal from this thread since then. 1208 volatile T * p = &pr->u.p.static_steal_counter; 1209 *p = *p + 1; 1210 } 1211 } 1212 #endif // ( KMP_STATIC_STEAL_ENABLED && USE_STEALING ) 1213 1214 #if OMPT_SUPPORT && OMPT_TRACE 1215 if (ompt_enabled && 1216 ompt_callbacks.ompt_callback(ompt_event_loop_begin)) { 1217 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 1218 ompt_task_info_t *task_info = __ompt_get_taskinfo(0); 1219 ompt_callbacks.ompt_callback(ompt_event_loop_begin)( 1220 team_info->parallel_id, task_info->task_id, team_info->microtask); 1221 } 1222 #endif 1223 } 1224 1225 /* 1226 * For ordered loops, either __kmp_dispatch_finish() should be called after 1227 * every iteration, or __kmp_dispatch_finish_chunk() should be called after 1228 * every chunk of iterations. If the ordered section(s) were not executed 1229 * for this iteration (or every iteration in this chunk), we need to set the 1230 * ordered iteration counters so that the next thread can proceed. 1231 */ 1232 template< typename UT > 1233 static void 1234 __kmp_dispatch_finish( int gtid, ident_t *loc ) 1235 { 1236 typedef typename traits_t< UT >::signed_t ST; 1237 kmp_info_t *th = __kmp_threads[ gtid ]; 1238 1239 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) ); 1240 if ( ! 
th -> th.th_team -> t.t_serialized ) { 1241 1242 dispatch_private_info_template< UT > * pr = 1243 reinterpret_cast< dispatch_private_info_template< UT >* > 1244 ( th->th.th_dispatch->th_dispatch_pr_current ); 1245 dispatch_shared_info_template< UT > volatile * sh = 1246 reinterpret_cast< dispatch_shared_info_template< UT >volatile* > 1247 ( th->th.th_dispatch->th_dispatch_sh_current ); 1248 KMP_DEBUG_ASSERT( pr ); 1249 KMP_DEBUG_ASSERT( sh ); 1250 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1251 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1252 1253 if ( pr->ordered_bumped ) { 1254 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1255 gtid ) ); 1256 pr->ordered_bumped = 0; 1257 } else { 1258 UT lower = pr->u.p.ordered_lower; 1259 1260 #ifdef KMP_DEBUG 1261 { 1262 const char * buff; 1263 // create format specifiers before the debug output 1264 buff = __kmp_str_format( 1265 "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n", 1266 traits_t< UT >::spec, traits_t< UT >::spec ); 1267 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 1268 __kmp_str_free( &buff ); 1269 } 1270 #endif 1271 1272 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 1273 USE_ITT_BUILD_ARG(NULL) 1274 ); 1275 KMP_MB(); /* is this necessary? */ 1276 #ifdef KMP_DEBUG 1277 { 1278 const char * buff; 1279 // create format specifiers before the debug output 1280 buff = __kmp_str_format( 1281 "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n", 1282 traits_t< UT >::spec, traits_t< UT >::spec ); 1283 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 1284 __kmp_str_free( &buff ); 1285 } 1286 #endif 1287 1288 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration ); 1289 } // if 1290 } // if 1291 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) ); 1292 } 1293 1294 #ifdef KMP_GOMP_COMPAT 1295 1296 template< typename UT > 1297 static void 1298 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc ) 1299 { 1300 typedef typename traits_t< UT >::signed_t ST; 1301 kmp_info_t *th = __kmp_threads[ gtid ]; 1302 1303 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) ); 1304 if ( ! 
th -> th.th_team -> t.t_serialized ) { 1305 // int cid; 1306 dispatch_private_info_template< UT > * pr = 1307 reinterpret_cast< dispatch_private_info_template< UT >* > 1308 ( th->th.th_dispatch->th_dispatch_pr_current ); 1309 dispatch_shared_info_template< UT > volatile * sh = 1310 reinterpret_cast< dispatch_shared_info_template< UT >volatile* > 1311 ( th->th.th_dispatch->th_dispatch_sh_current ); 1312 KMP_DEBUG_ASSERT( pr ); 1313 KMP_DEBUG_ASSERT( sh ); 1314 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1315 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1316 1317 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) { 1318 UT lower = pr->u.p.ordered_lower; 1319 UT upper = pr->u.p.ordered_upper; 1320 UT inc = upper - lower + 1; 1321 1322 if ( pr->ordered_bumped == inc ) { 1323 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1324 gtid ) ); 1325 pr->ordered_bumped = 0; 1326 } else { 1327 inc -= pr->ordered_bumped; 1328 1329 #ifdef KMP_DEBUG 1330 { 1331 const char * buff; 1332 // create format specifiers before the debug output 1333 buff = __kmp_str_format( 1334 "__kmp_dispatch_finish_chunk: T#%%d before wait: " \ 1335 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n", 1336 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec ); 1337 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) ); 1338 __kmp_str_free( &buff ); 1339 } 1340 #endif 1341 1342 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 1343 USE_ITT_BUILD_ARG(NULL) 1344 ); 1345 1346 KMP_MB(); /* is this necessary? */ 1347 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n", 1348 gtid ) ); 1349 pr->ordered_bumped = 0; 1350 //!!!!! TODO check if the inc should be unsigned, or signed??? 1351 #ifdef KMP_DEBUG 1352 { 1353 const char * buff; 1354 // create format specifiers before the debug output 1355 buff = __kmp_str_format( 1356 "__kmp_dispatch_finish_chunk: T#%%d after wait: " \ 1357 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n", 1358 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec ); 1359 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) ); 1360 __kmp_str_free( &buff ); 1361 } 1362 #endif 1363 1364 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc); 1365 } 1366 // } 1367 } 1368 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) ); 1369 } 1370 1371 #endif /* KMP_GOMP_COMPAT */ 1372 1373 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 1374 * (no more work), then tell OMPT the loop is over. In some cases 1375 * kmp_dispatch_fini() is not called. 
*/ 1376 #if OMPT_SUPPORT && OMPT_TRACE 1377 #define OMPT_LOOP_END \ 1378 if (status == 0) { \ 1379 if (ompt_enabled && \ 1380 ompt_callbacks.ompt_callback(ompt_event_loop_end)) { \ 1381 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \ 1382 ompt_task_info_t *task_info = __ompt_get_taskinfo(0); \ 1383 ompt_callbacks.ompt_callback(ompt_event_loop_end)( \ 1384 team_info->parallel_id, task_info->task_id); \ 1385 } \ 1386 } 1387 #else 1388 #define OMPT_LOOP_END // no-op 1389 #endif 1390 1391 template< typename T > 1392 static int 1393 __kmp_dispatch_next( 1394 ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st 1395 ) { 1396 1397 typedef typename traits_t< T >::unsigned_t UT; 1398 typedef typename traits_t< T >::signed_t ST; 1399 typedef typename traits_t< T >::floating_t DBL; 1400 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 1401 static const int ___kmp_size_type = sizeof( UT ); 1402 #endif 1403 1404 // This is potentially slightly misleading, schedule(runtime) will appear here even if the actual runtme schedule 1405 // is static. (Which points out a disadavantage of schedule(runtime): even when static scheduling is used it costs 1406 // more than a compile time choice to use static scheduling would.) 1407 KMP_TIME_BLOCK(FOR_dynamic_scheduling); 1408 1409 int status; 1410 dispatch_private_info_template< T > * pr; 1411 kmp_info_t * th = __kmp_threads[ gtid ]; 1412 kmp_team_t * team = th -> th.th_team; 1413 1414 KMP_DEBUG_ASSERT( p_lb && p_ub && p_st ); // AC: these cannot be NULL 1415 #ifdef KMP_DEBUG 1416 { 1417 const char * buff; 1418 // create format specifiers before the debug output 1419 buff = __kmp_str_format( 1420 "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n", 1421 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 1422 KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? 
*p_st : 0, p_last ) ); 1423 __kmp_str_free( &buff ); 1424 } 1425 #endif 1426 1427 if ( team -> t.t_serialized ) { 1428 /* NOTE: serialize this dispatch becase we are not at the active level */ 1429 pr = reinterpret_cast< dispatch_private_info_template< T >* > 1430 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */ 1431 KMP_DEBUG_ASSERT( pr ); 1432 1433 if ( (status = (pr->u.p.tc != 0)) == 0 ) { 1434 *p_lb = 0; 1435 *p_ub = 0; 1436 // if ( p_last != NULL ) 1437 // *p_last = 0; 1438 if ( p_st != NULL ) 1439 *p_st = 0; 1440 if ( __kmp_env_consistency_check ) { 1441 if ( pr->pushed_ws != ct_none ) { 1442 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 1443 } 1444 } 1445 } else if ( pr->nomerge ) { 1446 kmp_int32 last; 1447 T start; 1448 UT limit, trip, init; 1449 ST incr; 1450 T chunk = pr->u.p.parm1; 1451 1452 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) ); 1453 1454 init = chunk * pr->u.p.count++; 1455 trip = pr->u.p.tc - 1; 1456 1457 if ( (status = (init <= trip)) == 0 ) { 1458 *p_lb = 0; 1459 *p_ub = 0; 1460 // if ( p_last != NULL ) 1461 // *p_last = 0; 1462 if ( p_st != NULL ) 1463 *p_st = 0; 1464 if ( __kmp_env_consistency_check ) { 1465 if ( pr->pushed_ws != ct_none ) { 1466 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 1467 } 1468 } 1469 } else { 1470 start = pr->u.p.lb; 1471 limit = chunk + init - 1; 1472 incr = pr->u.p.st; 1473 1474 if ( (last = (limit >= trip)) != 0 ) { 1475 limit = trip; 1476 #if KMP_OS_WINDOWS 1477 pr->u.p.last_upper = pr->u.p.ub; 1478 #endif /* KMP_OS_WINDOWS */ 1479 } 1480 if ( p_last != NULL ) 1481 *p_last = last; 1482 if ( p_st != NULL ) 1483 *p_st = incr; 1484 if ( incr == 1 ) { 1485 *p_lb = start + init; 1486 *p_ub = start + limit; 1487 } else { 1488 *p_lb = start + init * incr; 1489 *p_ub = start + limit * incr; 1490 } 1491 1492 if ( pr->ordered ) { 1493 pr->u.p.ordered_lower = init; 1494 pr->u.p.ordered_upper = limit; 1495 #ifdef KMP_DEBUG 1496 { 1497 const char * buff; 1498 // create format specifiers before the debug output 1499 buff = __kmp_str_format( 1500 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1501 traits_t< UT >::spec, traits_t< UT >::spec ); 1502 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1503 __kmp_str_free( &buff ); 1504 } 1505 #endif 1506 } // if 1507 } // if 1508 } else { 1509 pr->u.p.tc = 0; 1510 *p_lb = pr->u.p.lb; 1511 *p_ub = pr->u.p.ub; 1512 #if KMP_OS_WINDOWS 1513 pr->u.p.last_upper = *p_ub; 1514 #endif /* KMP_OS_WINDOWS */ 1515 if ( p_last != NULL ) 1516 *p_last = TRUE; 1517 if ( p_st != NULL ) 1518 *p_st = pr->u.p.st; 1519 } // if 1520 #ifdef KMP_DEBUG 1521 { 1522 const char * buff; 1523 // create format specifiers before the debug output 1524 buff = __kmp_str_format( 1525 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \ 1526 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", 1527 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 1528 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) ); 1529 __kmp_str_free( &buff ); 1530 } 1531 #endif 1532 #if INCLUDE_SSC_MARKS 1533 SSC_MARK_DISPATCH_NEXT(); 1534 #endif 1535 OMPT_LOOP_END; 1536 return status; 1537 } else { 1538 kmp_int32 last = 0; 1539 dispatch_shared_info_template< UT > *sh; 1540 T start; 1541 ST incr; 1542 UT limit, trip, init; 1543 1544 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1545 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1546 1547 pr = reinterpret_cast< 
dispatch_private_info_template< T >* > 1548 ( th->th.th_dispatch->th_dispatch_pr_current ); 1549 KMP_DEBUG_ASSERT( pr ); 1550 sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 1551 ( th->th.th_dispatch->th_dispatch_sh_current ); 1552 KMP_DEBUG_ASSERT( sh ); 1553 1554 if ( pr->u.p.tc == 0 ) { 1555 // zero trip count 1556 status = 0; 1557 } else { 1558 switch (pr->schedule) { 1559 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 1560 case kmp_sch_static_steal: 1561 { 1562 T chunk = pr->u.p.parm1; 1563 1564 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) ); 1565 1566 trip = pr->u.p.tc - 1; 1567 1568 if ( ___kmp_size_type > 4 ) { 1569 // Other threads do not look into the data of this thread, 1570 // so it's not necessary to make volatile casting. 1571 init = ( pr->u.p.count )++; 1572 status = ( init < (UT)pr->u.p.ub ); 1573 } else { 1574 typedef union { 1575 struct { 1576 UT count; 1577 T ub; 1578 } p; 1579 kmp_int64 b; 1580 } union_i4; 1581 // All operations on 'count' or 'ub' must be combined atomically together. 1582 // stealing implemented only for 4-byte indexes 1583 { 1584 union_i4 vold, vnew; 1585 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count); 1586 vnew = vold; 1587 vnew.p.count++; 1588 while( ! KMP_COMPARE_AND_STORE_ACQ64( 1589 ( volatile kmp_int64* )&pr->u.p.count, 1590 *VOLATILE_CAST(kmp_int64 *)&vold.b, 1591 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) { 1592 KMP_CPU_PAUSE(); 1593 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count); 1594 vnew = vold; 1595 vnew.p.count++; 1596 } 1597 vnew = vold; 1598 init = vnew.p.count; 1599 status = ( init < (UT)vnew.p.ub ) ; 1600 } 1601 1602 if( !status ) { 1603 kmp_info_t **other_threads = team->t.t_threads; 1604 int while_limit = 10; 1605 int while_index = 0; 1606 1607 // TODO: algorithm of searching for a victim 1608 // should be cleaned up and measured 1609 while ( ( !status ) && ( while_limit != ++while_index ) ) { 1610 union_i4 vold, vnew; 1611 kmp_int32 remaining; // kmp_int32 because KMP_I4 only 1612 T victimIdx = pr->u.p.parm4; 1613 T oldVictimIdx = victimIdx; 1614 dispatch_private_info_template< T > * victim; 1615 1616 do { 1617 if( !victimIdx ) { 1618 victimIdx = team->t.t_nproc - 1; 1619 } else { 1620 --victimIdx; 1621 } 1622 victim = reinterpret_cast< dispatch_private_info_template< T >* > 1623 ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current ); 1624 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx ); 1625 // TODO: think about a proper place of this test 1626 if ( ( !victim ) || 1627 ( (*( volatile T * )&victim->u.p.static_steal_counter) != 1628 (*( volatile T * )&pr->u.p.static_steal_counter) ) ) { 1629 // TODO: delay would be nice 1630 continue; 1631 // the victim is not ready yet to participate in stealing 1632 // because the victim is still in kmp_init_dispatch 1633 } 1634 if ( oldVictimIdx == victimIdx ) { 1635 break; 1636 } 1637 pr->u.p.parm4 = victimIdx; 1638 1639 while( 1 ) { 1640 vold.b = *( volatile kmp_int64 * )( &victim->u.p.count ); 1641 vnew = vold; 1642 1643 KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip ); 1644 if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) { 1645 break; 1646 } 1647 vnew.p.ub -= (remaining >> 2); 1648 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1649 #pragma warning( push ) 1650 // disable warning on pointless comparison of unsigned with 0 1651 #pragma warning( disable: 186 ) 1652 KMP_DEBUG_ASSERT(vnew.p.ub >= 0); 1653 #pragma warning( pop ) 1654 // TODO: Should 
this be acquire or release? 1655 if ( KMP_COMPARE_AND_STORE_ACQ64( 1656 ( volatile kmp_int64 * )&victim->u.p.count, 1657 *VOLATILE_CAST(kmp_int64 *)&vold.b, 1658 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) { 1659 status = 1; 1660 while_index = 0; 1661 // now update own count and ub 1662 #if KMP_ARCH_X86 1663 // stealing executed on non-KMP_ARCH_X86 only 1664 // Atomic 64-bit write on ia32 is 1665 // unavailable, so we do this in steps. 1666 // This code is not tested. 1667 init = vold.p.count; 1668 pr->u.p.ub = 0; 1669 pr->u.p.count = init + 1; 1670 pr->u.p.ub = vnew.p.count; 1671 #else 1672 init = vnew.p.ub; 1673 vold.p.count = init + 1; 1674 // TODO: is it safe and enough? 1675 *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b; 1676 #endif // KMP_ARCH_X86 1677 break; 1678 } // if 1679 KMP_CPU_PAUSE(); 1680 } // while (1) 1681 } // while 1682 } // if 1683 } // if 1684 if ( !status ) { 1685 *p_lb = 0; 1686 *p_ub = 0; 1687 if ( p_st != NULL ) *p_st = 0; 1688 } else { 1689 start = pr->u.p.parm2; 1690 init *= chunk; 1691 limit = chunk + init - 1; 1692 incr = pr->u.p.st; 1693 1694 KMP_DEBUG_ASSERT(init <= trip); 1695 if ( (last = (limit >= trip)) != 0 ) 1696 limit = trip; 1697 if ( p_st != NULL ) *p_st = incr; 1698 1699 if ( incr == 1 ) { 1700 *p_lb = start + init; 1701 *p_ub = start + limit; 1702 } else { 1703 *p_lb = start + init * incr; 1704 *p_ub = start + limit * incr; 1705 } 1706 1707 if ( pr->ordered ) { 1708 pr->u.p.ordered_lower = init; 1709 pr->u.p.ordered_upper = limit; 1710 #ifdef KMP_DEBUG 1711 { 1712 const char * buff; 1713 // create format specifiers before the debug output 1714 buff = __kmp_str_format( 1715 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1716 traits_t< UT >::spec, traits_t< UT >::spec ); 1717 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1718 __kmp_str_free( &buff ); 1719 } 1720 #endif 1721 } // if 1722 } // if 1723 break; 1724 } // case 1725 #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 1726 case kmp_sch_static_balanced: 1727 { 1728 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) ); 1729 if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */ 1730 pr->u.p.count = 1; 1731 *p_lb = pr->u.p.lb; 1732 *p_ub = pr->u.p.ub; 1733 last = pr->u.p.parm1; 1734 if ( p_st != NULL ) 1735 *p_st = pr->u.p.st; 1736 } else { /* no iterations to do */ 1737 pr->u.p.lb = pr->u.p.ub + pr->u.p.st; 1738 } 1739 if ( pr->ordered ) { 1740 #ifdef KMP_DEBUG 1741 { 1742 const char * buff; 1743 // create format specifiers before the debug output 1744 buff = __kmp_str_format( 1745 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1746 traits_t< UT >::spec, traits_t< UT >::spec ); 1747 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1748 __kmp_str_free( &buff ); 1749 } 1750 #endif 1751 } // if 1752 } // case 1753 break; 1754 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */ 1755 case kmp_sch_static_chunked: 1756 { 1757 T parm1; 1758 1759 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n", 1760 gtid ) ); 1761 parm1 = pr->u.p.parm1; 1762 1763 trip = pr->u.p.tc - 1; 1764 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid)); 1765 1766 if ( (status = (init <= trip)) != 0 ) { 1767 start = pr->u.p.lb; 1768 incr = pr->u.p.st; 1769 limit = parm1 + init - 1; 1770 1771 if ( (last = (limit >= trip)) != 0 ) 1772 limit = trip; 1773 1774 if ( p_st != NULL ) 
                    pr->u.p.count += team->t.t_nproc;

                    if ( incr == 1 ) {
                        *p_lb = start + init;
                        *p_ub = start + limit;
                    }
                    else {
                        *p_lb = start + init * incr;
                        *p_ub = start + limit * incr;
                    }

                    if ( pr->ordered ) {
                        pr->u.p.ordered_lower = init;
                        pr->u.p.ordered_upper = limit;
                        #ifdef KMP_DEBUG
                        {
                            const char * buff;
                            // create format specifiers before the debug output
                            buff = __kmp_str_format(
                                "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                                traits_t< UT >::spec, traits_t< UT >::spec );
                            KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                            __kmp_str_free( &buff );
                        }
                        #endif
                    } // if
                } // if
            } // case
            break;

            case kmp_sch_dynamic_chunked:
            {
                T chunk = pr->u.p.parm1;

                KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
                               gtid ) );

                init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
                trip = pr->u.p.tc - 1;

                if ( (status = (init <= trip)) == 0 ) {
                    *p_lb = 0;
                    *p_ub = 0;
                    if ( p_st != NULL ) *p_st = 0;
                } else {
                    start = pr->u.p.lb;
                    limit = chunk + init - 1;
                    incr  = pr->u.p.st;

                    if ( (last = (limit >= trip)) != 0 )
                        limit = trip;

                    if ( p_st != NULL ) *p_st = incr;

                    if ( incr == 1 ) {
                        *p_lb = start + init;
                        *p_ub = start + limit;
                    } else {
                        *p_lb = start + init * incr;
                        *p_ub = start + limit * incr;
                    }

                    if ( pr->ordered ) {
                        pr->u.p.ordered_lower = init;
                        pr->u.p.ordered_upper = limit;
                        #ifdef KMP_DEBUG
                        {
                            const char * buff;
                            // create format specifiers before the debug output
                            buff = __kmp_str_format(
                                "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                                traits_t< UT >::spec, traits_t< UT >::spec );
                            KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                            __kmp_str_free( &buff );
                        }
                        #endif
                    } // if
                } // if
            } // case
            break;

            case kmp_sch_guided_iterative_chunked:
            {
                T chunkspec = pr->u.p.parm1;
                KD_TRACE(100,
                    ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
                trip = pr->u.p.tc;
                // Start atomic part of calculations
                while(1) {
                    ST remaining;             // signed, because can be < 0
                    init = sh->u.s.iteration; // shared value
                    remaining = trip - init;
                    if ( remaining <= 0 ) {   // AC: need to compare with 0 first
                        // nothing to do, don't try atomic op
                        status = 0;
                        break;
                    }
                    if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
                        // use dynamic-style schedule
                        // atomically increment iterations, get old value
                        init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
                        remaining = trip - init;
                        if (remaining <= 0) {
                            status = 0;    // all iterations got by other threads
                        } else {
                            // got some iterations to work on
                            status = 1;
                            if ( (T)remaining > chunkspec ) {
                                limit = init + chunkspec - 1;
                            } else {
                                last = 1;   // the last chunk
                                limit = init + remaining - 1;
                            } // if
                        } // if
                        break;
                    } // if
                    limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
                    if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
                        // CAS was successful, chunk obtained
                        status = 1;
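                        // The CAS advanced sh->u.s.iteration from init up to limit, so this
                        // thread now owns iterations [init, limit); make limit inclusive below.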
                        --limit;
                        break;
                    } // if
                } // while
                if ( status != 0 ) {
                    start = pr->u.p.lb;
                    incr  = pr->u.p.st;
                    if ( p_st != NULL )
                        *p_st = incr;
                    *p_lb = start + init * incr;
                    *p_ub = start + limit * incr;
                    if ( pr->ordered ) {
                        pr->u.p.ordered_lower = init;
                        pr->u.p.ordered_upper = limit;
                        #ifdef KMP_DEBUG
                        {
                            const char * buff;
                            // create format specifiers before the debug output
                            buff = __kmp_str_format(
                                "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                                traits_t< UT >::spec, traits_t< UT >::spec );
                            KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                            __kmp_str_free( &buff );
                        }
                        #endif
                    } // if
                } else {
                    *p_lb = 0;
                    *p_ub = 0;
                    if ( p_st != NULL )
                        *p_st = 0;
                } // if
            } // case
            break;

            case kmp_sch_guided_analytical_chunked:
            {
                T  chunkspec = pr->u.p.parm1;
                UT chunkIdx;
                #if KMP_OS_WINDOWS && KMP_ARCH_X86
                /* for storing original FPCW value for Windows* OS on
                   IA-32 architecture 8-byte version */
                unsigned int oldFpcw;
                unsigned int fpcwSet = 0;
                #endif
                KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
                               gtid ) );

                trip = pr->u.p.tc;

                KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
                KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
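                // Note: parm2 appears to hold the cross-over chunk index computed at init
                // time; below that index the chunk sizes follow the analytically derived
                // (roughly geometric) guided sequence encoded by the long-double factor
                // stored in parm3, and from that index on the loop falls back to plain
                // dynamic scheduling with chunks of size chunkspec.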
                while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
                    chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
                    if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
                        --trip;
                        /* use dynamic-style scheduling */
                        init = chunkIdx * chunkspec + pr->u.p.count;
                        /* need to verify init > 0 in case of overflow in the above calculation */
                        if ( (status = (init > 0 && init <= trip)) != 0 ) {
                            limit = init + chunkspec - 1;

                            if ( (last = (limit >= trip)) != 0 )
                                limit = trip;
                        }
                        break;
                    } else {
                        /* use exponential-style scheduling */
                        /* The following check is to work around the lack of long double precision on Windows* OS.
                           This check works around the possible effect that init != 0 for chunkIdx == 0.
                         */
                        #if KMP_OS_WINDOWS && KMP_ARCH_X86
                        /* If we haven't already done so, save original
                           FPCW and set precision to 64-bit, as Windows* OS
                           on IA-32 architecture defaults to 53-bit */
                        if ( !fpcwSet ) {
                            oldFpcw = _control87(0,0);
                            _control87(_PC_64,_MCW_PC);
                            fpcwSet = 0x30000;
                        }
                        #endif
                        if ( chunkIdx ) {
                            init = __kmp_dispatch_guided_remaining< T >(
                                       trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
                            KMP_DEBUG_ASSERT(init);
                            init = trip - init;
                        } else
                            init = 0;
                        limit = trip - __kmp_dispatch_guided_remaining< T >(
                                           trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
                        KMP_ASSERT(init <= limit);
                        if ( init < limit ) {
                            KMP_DEBUG_ASSERT(limit <= trip);
                            --limit;
                            status = 1;
                            break;
                        } // if
                    } // if
                } // while (1)
                #if KMP_OS_WINDOWS && KMP_ARCH_X86
                /* restore FPCW if necessary
                   AC: check fpcwSet flag first because oldFpcw can be uninitialized here
                */
                if ( fpcwSet && ( oldFpcw & fpcwSet ) )
                    _control87(oldFpcw,_MCW_PC);
                #endif
                if ( status != 0 ) {
                    start = pr->u.p.lb;
                    incr  = pr->u.p.st;
                    if ( p_st != NULL )
                        *p_st = incr;
                    *p_lb = start + init * incr;
                    *p_ub = start + limit * incr;
                    if ( pr->ordered ) {
                        pr->u.p.ordered_lower = init;
                        pr->u.p.ordered_upper = limit;
                        #ifdef KMP_DEBUG
                        {
                            const char * buff;
                            // create format specifiers before the debug output
                            buff = __kmp_str_format(
                                "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                                traits_t< UT >::spec, traits_t< UT >::spec );
                            KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                            __kmp_str_free( &buff );
                        }
                        #endif
                    }
                } else {
                    *p_lb = 0;
                    *p_ub = 0;
                    if ( p_st != NULL )
                        *p_st = 0;
                }
            } // case
            break;

            case kmp_sch_trapezoidal:
            {
                UT index;
                T  parm2 = pr->u.p.parm2;
                T  parm3 = pr->u.p.parm3;
                T  parm4 = pr->u.p.parm4;
                KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
                               gtid ) );

                index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );

                init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
                trip = pr->u.p.tc - 1;

                if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
                    *p_lb = 0;
                    *p_ub = 0;
                    if ( p_st != NULL ) *p_st = 0;
                } else {
                    start = pr->u.p.lb;
                    limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
                    incr  = pr->u.p.st;

                    if ( (last = (limit >= trip)) != 0 )
                        limit = trip;

                    if ( p_st != NULL ) *p_st = incr;

                    if ( incr == 1 ) {
                        *p_lb = start + init;
                        *p_ub = start + limit;
                    } else {
                        *p_lb = start + init * incr;
                        *p_ub = start + limit * incr;
                    }

                    if ( pr->ordered ) {
                        pr->u.p.ordered_lower = init;
                        pr->u.p.ordered_upper = limit;
                        #ifdef KMP_DEBUG
                        {
                            const char * buff;
                            // create format specifiers before the debug output
                            buff = __kmp_str_format(
                                "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                                traits_t< UT >::spec, traits_t< UT >::spec );
                            KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                            __kmp_str_free( &buff );
                        }
                        #endif
                    } // if
                } // if
            } // case
            break;
            default:
            {
                status = 0; // to avoid complaints on uninitialized variable use
                __kmp_msg(
                    kmp_ms_fatal,                        // Severity
                    KMP_MSG( UnknownSchedTypeDetected ), // Primary message
                    KMP_HNT( GetNewerLibrary ),          // Hint
                    __kmp_msg_null                       // Variadic argument list terminator
                );
            }
            break;
            } // switch
        } // if tc == 0;

        if ( status == 0 ) {
            UT num_done;

            num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
            #ifdef KMP_DEBUG
            {
                const char * buff;
                // create format specifiers before the debug output
                buff = __kmp_str_format(
                    "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
                    traits_t< UT >::spec );
                KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
                __kmp_str_free( &buff );
            }
            #endif

            if ( (ST)num_done == team->t.t_nproc-1 ) {
                /* NOTE: release this buffer to be reused */

                KMP_MB();  /* Flush all pending memory write invalidates.  */

                sh->u.s.num_done = 0;
                sh->u.s.iteration = 0;

                /* TODO replace with general release procedure? */
                if ( pr->ordered ) {
                    sh->u.s.ordered_iteration = 0;
                }

                KMP_MB();  /* Flush all pending memory write invalidates.  */

                sh->buffer_index += KMP_MAX_DISP_BUF;
                KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
                               gtid, sh->buffer_index) );

                KMP_MB();  /* Flush all pending memory write invalidates.  */

            } // if
            if ( __kmp_env_consistency_check ) {
                if ( pr->pushed_ws != ct_none ) {
                    pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
                }
            }

            th->th.th_dispatch->th_deo_fcn = NULL;
            th->th.th_dispatch->th_dxo_fcn = NULL;
            th->th.th_dispatch->th_dispatch_sh_current = NULL;
            th->th.th_dispatch->th_dispatch_pr_current = NULL;
        } // if (status == 0)
        #if KMP_OS_WINDOWS
        else if ( last ) {
            pr->u.p.last_upper = pr->u.p.ub;
        }
        #endif /* KMP_OS_WINDOWS */
        if ( p_last != NULL && status != 0 )
            *p_last = last;
    } // if

    #ifdef KMP_DEBUG
    {
        const char * buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_next: T#%%d normal case: " \
            "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
            traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
        KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ?
                       *p_st : 0, p_last, status ) );
        __kmp_str_free( &buff );
    }
    #endif
    #if INCLUDE_SSC_MARKS
    SSC_MARK_DISPATCH_NEXT();
    #endif
    OMPT_LOOP_END;
    return status;
}

template< typename T >
static void
__kmp_dist_get_bounds(
    ident_t                          *loc,
    kmp_int32                         gtid,
    kmp_int32                        *plastiter,
    T                                *plower,
    T                                *pupper,
    typename traits_t< T >::signed_t  incr
) {
    typedef typename traits_t< T >::unsigned_t UT;
    typedef typename traits_t< T >::signed_t   ST;
    register kmp_uint32  team_id;
    register kmp_uint32  nteams;
    register UT          trip_count;
    register kmp_team_t *team;
    kmp_info_t          *th;

    KMP_DEBUG_ASSERT( plastiter && plower && pupper );
    KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
    #ifdef KMP_DEBUG
    {
        const char * buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
            "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
            traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
            traits_t< T >::spec );
        KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
        __kmp_str_free( &buff );
    }
    #endif

    if( __kmp_env_consistency_check ) {
        if( incr == 0 ) {
            __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
        }
        if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
            // The loop is illegal.
            // Some zero-trip loops maintained by compiler, e.g.:
            //   for(i=10;i<0;++i) // lower >= upper - run-time check
            //   for(i=0;i>10;--i) // lower <= upper - run-time check
            //   for(i=0;i>10;++i) // incr > 0       - compile-time check
            //   for(i=10;i<0;--i) // incr < 0       - compile-time check
            // Compiler does not check the following illegal loops:
            //   for(i=0;i<10;i+=incr)  // where incr<0
            //   for(i=10;i>0;i-=incr)  // where incr<0
            __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
        }
    }
    th = __kmp_threads[gtid];
    team = th->th.th_team;
    #if OMP_40_ENABLED
    KMP_DEBUG_ASSERT(th->th.th_teams_microtask);   // we are in the teams construct
    nteams = th->th.th_teams_size.nteams;
    #endif
    team_id = team->t.t_master_tid;
    KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);

    // compute global trip count
    if( incr == 1 ) {
        trip_count = *pupper - *plower + 1;
    } else if(incr == -1) {
        trip_count = *plower - *pupper + 1;
    } else {
        trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
    }

    if( trip_count <= nteams ) {
        KMP_DEBUG_ASSERT(
            __kmp_static == kmp_sch_static_greedy || \
            __kmp_static == kmp_sch_static_balanced
        ); // Unknown static scheduling type.
        // only some teams get single iteration, others get nothing
        if( team_id < trip_count ) {
            *pupper = *plower = *plower + team_id * incr;
        } else {
            *plower = *pupper + incr; // zero-trip loop
        }
        if( plastiter != NULL )
            *plastiter = ( team_id == trip_count - 1 );
    } else {
        if( __kmp_static == kmp_sch_static_balanced ) {
            register UT chunk = trip_count / nteams;
            register UT extras = trip_count % nteams;
            *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
            *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
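            // The first 'extras' teams receive chunk+1 iterations and the remaining teams
            // receive chunk; the ternaries above fold that extra iteration into both the
            // lower-bound offset and the upper bound.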
            if( plastiter != NULL )
                *plastiter = ( team_id == nteams - 1 );
        } else {
            register T chunk_inc_count =
                ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
            register T upper = *pupper;
            KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
                // Unknown static scheduling type.
            *plower += team_id * chunk_inc_count;
            *pupper = *plower + chunk_inc_count - incr;
            // Check/correct bounds if needed
            if( incr > 0 ) {
                if( *pupper < *plower )
                    *pupper = i_maxmin< T >::mx;
                if( plastiter != NULL )
                    *plastiter = *plower <= upper && *pupper > upper - incr;
                if( *pupper > upper )
                    *pupper = upper; // tracker C73258
            } else {
                if( *pupper > *plower )
                    *pupper = i_maxmin< T >::mn;
                if( plastiter != NULL )
                    *plastiter = *plower >= upper && *pupper < upper - incr;
                if( *pupper < upper )
                    *pupper = upper; // tracker C73258
            }
        }
    }
}

//-----------------------------------------------------------------------------------------
// Dispatch routines
//    Transfer call to template< type T >
//    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
//                         T lb, T ub, ST st, ST chunk )
extern "C" {

/*!
@ingroup WORK_SHARING
@{
@param loc Source location
@param gtid Global thread id
@param schedule Schedule type
@param lb  Lower bound
@param ub  Upper bound
@param st  Step (or increment if you prefer)
@param chunk The chunk size to block with

This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
These functions are all identical apart from the types of the arguments.
*/

void
__kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                        kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}
/*!
See @ref __kmpc_dispatch_init_4
*/
void
__kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                         kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void
__kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                        kmp_int64 lb, kmp_int64 ub,
                        kmp_int64 st, kmp_int64 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void
__kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                         kmp_uint64 lb, kmp_uint64 ub,
                         kmp_int64 st, kmp_int64 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

/*!
See @ref __kmpc_dispatch_init_4

These functions differ from the __kmpc_dispatch_init set in that they are called
for the composite distribute parallel for construct, so the per-team iteration
space has to be computed before the regular iterations are dispatched.

These functions are all identical apart from the types of the arguments.
*/
void
__kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
    kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
    __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

void
__kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
    kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
    __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

void
__kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
    kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
    __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

void
__kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
    kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
    __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

/*!
@param loc Source code location
@param gtid Global thread id
@param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
@param p_lb   Pointer to the lower bound for the next chunk of work
@param p_ub   Pointer to the upper bound for the next chunk of work
@param p_st   Pointer to the stride for the next chunk of work
@return one if there is work to be done, zero otherwise

Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then the lb, ub and stride need not be modified.
*/
int
__kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                        kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
{
    return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int
__kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                         kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
{
    return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int
__kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                        kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
{
    return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int
__kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                         kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
{
    return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}
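
/*
   Illustrative sketch only (not part of the library): a compiler targeting these
   entry points would typically drive a dynamically scheduled loop roughly as
   follows. The user-level names (N, chunk, body) are hypothetical.

       ident_t  *loc  = ...;  // source location descriptor emitted by the compiler
       kmp_int32 gtid = __kmpc_global_thread_num( loc );
       kmp_int32 last, lb, ub, st;

       __kmpc_dispatch_init_4( loc, gtid, kmp_sch_dynamic_chunked,
                               0, N - 1, 1, chunk );
       while ( __kmpc_dispatch_next_4( loc, gtid, &last, &lb, &ub, &st ) ) {
           for ( kmp_int32 i = lb; i <= ub; i += st )
               body( i );
       }

   For ordered loops the compiler additionally emits calls to the
   __kmpc_dispatch_fini_* functions declared below.
*/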
/*!
@param loc Source code location
@param gtid Global thread id

Mark the end of a dynamic loop.
*/
void
__kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void
__kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void
__kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void
__kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
}
/*! @} */

//-----------------------------------------------------------------------------------------
// Non-template routines from kmp_dispatch.c used in other sources

kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
    return value == checker;
}

kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
    return value != checker;
}

kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
    return value < checker;
}

kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
    return value >= checker;
}

kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
    return value <= checker;
}

kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
    return value == checker;
}

kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
    return value != checker;
}

kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
    return value < checker;
}

kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
    return value >= checker;
}

kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
    return value <= checker;
}
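
/*
   Note on the helpers below: __kmp_wait_yield_4/_8 spin until pred( *spinner, checker )
   becomes true, pausing and yielding when the machine appears oversubscribed. The
   comparison functions above are intended to be passed as the predicate. A sketch of
   typical usage elsewhere in the runtime (variable names here are hypothetical):

       // block until the shared flag reaches the expected value
       __kmp_wait_yield_4( &shared_flag, expected_value, __kmp_eq_4, NULL );
*/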
kmp_uint32
__kmp_wait_yield_4(volatile kmp_uint32 * spinner,
                   kmp_uint32            checker,
                   kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
                   , void * obj    // Higher-level synchronization object, or NULL.
                   )
{
    // note: we may not belong to a team at this point
    register volatile kmp_uint32 * spin  = spinner;
    register          kmp_uint32   check = checker;
    register          kmp_uint32   spins;
    register          kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
    register          kmp_uint32   r;

    KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
    KMP_INIT_YIELD( spins );
    // main wait spin loop
    while(!f(r = TCR_4(*spin), check)) {
        KMP_FSYNC_SPIN_PREPARE( obj );
        /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
           It causes problems with infinite recursion because of exit lock */
        /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
            __kmp_abort_thread(); */

        /* if we have waited a bit, or are oversubscribed, yield */
        /* pause is in the following code */
        KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
        KMP_YIELD_SPIN( spins );
    }
    KMP_FSYNC_SPIN_ACQUIRED( obj );
    return r;
}

kmp_uint64
__kmp_wait_yield_8( volatile kmp_uint64 * spinner,
                    kmp_uint64            checker,
                    kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
                    , void * obj    // Higher-level synchronization object, or NULL.
                    )
{
    // note: we may not belong to a team at this point
    register volatile kmp_uint64 * spin  = spinner;
    register          kmp_uint64   check = checker;
    register          kmp_uint32   spins;
    register          kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
    register          kmp_uint64   r;

    KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
    KMP_INIT_YIELD( spins );
    // main wait spin loop
    while(!f(r = *spin, check))
    {
        KMP_FSYNC_SPIN_PREPARE( obj );
        /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
           It causes problems with infinite recursion because of exit lock */
        /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
            __kmp_abort_thread(); */

        // if we are oversubscribed,
        // or have waited a bit (and KMP_LIBRARY=throughput), then yield
        // pause is in the following code
        KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
        KMP_YIELD_SPIN( spins );
    }
    KMP_FSYNC_SPIN_ACQUIRED( obj );
    return r;
}

} // extern "C"

#ifdef KMP_GOMP_COMPAT

void
__kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                           kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
                           kmp_int32 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
                                      push_ws );
}

void
__kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                            kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
                            kmp_int32 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
                                       push_ws );
}

void
__kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                           kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
                           kmp_int64 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
                                      push_ws );
}

void
__kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                            kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
                            kmp_int64 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
                                       push_ws );
}

void
__kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
}

void
__kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
}

void
__kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
}

void
__kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
}

#endif /* KMP_GOMP_COMPAT */

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */