/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 */


//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


/*
 * Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
 *       it may change values between parallel regions.  __kmp_max_nth
 *       is the largest value __kmp_nth may take, 1 is the smallest.
 *
 */

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_str.h"
#include "kmp_error.h"
#include "kmp_stats.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
#include <float.h>
#endif

#if OMPT_SUPPORT
#include "ompt-internal.h"
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

// template for type limits
template< typename T >
struct i_maxmin {
    static const T mx;
    static const T mn;
};
template<>
struct i_maxmin< int > {
    static const int mx = 0x7fffffff;
    static const int mn = 0x80000000;
};
template<>
struct i_maxmin< unsigned int > {
    static const unsigned int mx = 0xffffffff;
    static const unsigned int mn = 0x00000000;
};
template<>
struct i_maxmin< long long > {
    static const long long mx = 0x7fffffffffffffffLL;
    static const long long mn = 0x8000000000000000LL;
};
template<>
struct i_maxmin< unsigned long long > {
    static const unsigned long long mx = 0xffffffffffffffffLL;
    static const unsigned long long mn = 0x0000000000000000LL;
};
//-------------------------------------------------------------------------

#ifdef KMP_STATIC_STEAL_ENABLED

// replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
template< typename T >
struct dispatch_private_infoXX_template {
    typedef typename traits_t< T >::unsigned_t  UT;
    typedef typename traits_t< T >::signed_t    ST;
    UT count;                // unsigned
    T  ub;
    /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
    T  lb;
    ST st;                   // signed
    UT tc;                   // unsigned
    T  static_steal_counter; // for static_steal only; maybe better to put after ub

    /* parm[1-4] are used in different ways by different scheduling algorithms */

    // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
    //    a) parm3 is properly aligned and
    //    b) all parm1-4 are in the same cache line.
    // Because parm1-4 are used together, performance seems to be better
    // if they are in the same line (not measured though).
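    // Illustrative note: with an 8-byte T the four parms below occupy
    // sizeof(T)*4 = 32 bytes, so aligning the block to 32 bytes keeps parm1-4
    // within a single cache line on the common 64-byte line size (assumed here).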
96 97 struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4 98 T parm1; 99 T parm2; 100 T parm3; 101 T parm4; 102 }; 103 104 UT ordered_lower; // unsigned 105 UT ordered_upper; // unsigned 106 #if KMP_OS_WINDOWS 107 T last_upper; 108 #endif /* KMP_OS_WINDOWS */ 109 }; 110 111 #else /* KMP_STATIC_STEAL_ENABLED */ 112 113 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types 114 template< typename T > 115 struct dispatch_private_infoXX_template { 116 typedef typename traits_t< T >::unsigned_t UT; 117 typedef typename traits_t< T >::signed_t ST; 118 T lb; 119 T ub; 120 ST st; // signed 121 UT tc; // unsigned 122 123 T parm1; 124 T parm2; 125 T parm3; 126 T parm4; 127 128 UT count; // unsigned 129 130 UT ordered_lower; // unsigned 131 UT ordered_upper; // unsigned 132 #if KMP_OS_WINDOWS 133 T last_upper; 134 #endif /* KMP_OS_WINDOWS */ 135 }; 136 137 #endif /* KMP_STATIC_STEAL_ENABLED */ 138 139 // replaces dispatch_private_info structure and dispatch_private_info_t type 140 template< typename T > 141 struct KMP_ALIGN_CACHE dispatch_private_info_template { 142 // duplicate alignment here, otherwise size of structure is not correct in our compiler 143 union KMP_ALIGN_CACHE private_info_tmpl { 144 dispatch_private_infoXX_template< T > p; 145 dispatch_private_info64_t p64; 146 } u; 147 enum sched_type schedule; /* scheduling algorithm */ 148 kmp_uint32 ordered; /* ordered clause specified */ 149 kmp_uint32 ordered_bumped; 150 kmp_int32 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making order 151 dispatch_private_info * next; /* stack of buffers for nest of serial regions */ 152 kmp_uint32 nomerge; /* don't merge iters if serialized */ 153 kmp_uint32 type_size; 154 enum cons_type pushed_ws; 155 }; 156 157 158 // replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types 159 template< typename UT > 160 struct dispatch_shared_infoXX_template { 161 /* chunk index under dynamic, number of idle threads under static-steal; 162 iteration index otherwise */ 163 volatile UT iteration; 164 volatile UT num_done; 165 volatile UT ordered_iteration; 166 UT ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar 167 }; 168 169 // replaces dispatch_shared_info structure and dispatch_shared_info_t type 170 template< typename UT > 171 struct dispatch_shared_info_template { 172 // we need union here to keep the structure size 173 union shared_info_tmpl { 174 dispatch_shared_infoXX_template< UT > s; 175 dispatch_shared_info64_t s64; 176 } u; 177 volatile kmp_uint32 buffer_index; 178 }; 179 180 /* ------------------------------------------------------------------------ */ 181 /* ------------------------------------------------------------------------ */ 182 183 #undef USE_TEST_LOCKS 184 185 // test_then_add template (general template should NOT be used) 186 template< typename T > 187 static __forceinline T 188 test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); }; 189 190 template<> 191 __forceinline kmp_int32 192 test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d ) 193 { 194 kmp_int32 r; 195 r = KMP_TEST_THEN_ADD32( p, d ); 196 return r; 197 } 198 199 template<> 200 __forceinline kmp_int64 201 test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d ) 202 { 203 kmp_int64 r; 204 r = KMP_TEST_THEN_ADD64( p, d ); 205 return r; 206 } 207 208 // test_then_inc_acq template (general template should NOT be used) 209 template< typename T > 210 static 
__forceinline T
test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };

template<>
__forceinline kmp_int32
test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
{
    kmp_int32 r;
    r = KMP_TEST_THEN_INC_ACQ32( p );
    return r;
}

template<>
__forceinline kmp_int64
test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
{
    kmp_int64 r;
    r = KMP_TEST_THEN_INC_ACQ64( p );
    return r;
}

// test_then_inc template (general template should NOT be used)
template< typename T >
static __forceinline T
test_then_inc( volatile T *p ) { KMP_ASSERT(0); };

template<>
__forceinline kmp_int32
test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
{
    kmp_int32 r;
    r = KMP_TEST_THEN_INC32( p );
    return r;
}

template<>
__forceinline kmp_int64
test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
{
    kmp_int64 r;
    r = KMP_TEST_THEN_INC64( p );
    return r;
}

// compare_and_swap template (general template should NOT be used)
template< typename T >
static __forceinline kmp_int32
compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };

template<>
__forceinline kmp_int32
compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
{
    return KMP_COMPARE_AND_STORE_REL32( p, c, s );
}

template<>
__forceinline kmp_int32
compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
{
    return KMP_COMPARE_AND_STORE_REL64( p, c, s );
}

/*
    Spin wait loop that first does pause, then yield.
    Waits until function returns non-zero when called with *spinner and check.
    Does NOT put threads to sleep.
#if USE_ITT_BUILD
    Arguments:
        obj -- is higher-level synchronization object to report to ittnotify. It is used to report
            locks consistently. For example, if a lock is acquired immediately, its address is
            reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
            immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the
            same address, not the address of the low-level spinner.
#endif // USE_ITT_BUILD
*/
template< typename UT >
// ToDo: make inline function (move to header file for icl)
static UT  // unsigned 4- or 8-byte type
__kmp_wait_yield( volatile UT * spinner,
                  UT            checker,
                  kmp_uint32 (* pred)( UT, UT )
                  USE_ITT_BUILD_ARG(void * obj)   // Higher-level synchronization object, or NULL.
                  )
{
    // note: we may not belong to a team at this point
    register volatile UT         * spin  = spinner;
    register          UT           check = checker;
    register          kmp_uint32   spins;
    register          kmp_uint32 (*f) ( UT, UT ) = pred;
    register          UT           r;

    KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
    KMP_INIT_YIELD( spins );
    // main wait spin loop
    while(!f(r = *spin, check))
    {
        KMP_FSYNC_SPIN_PREPARE( obj );
        /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
309 It causes problems with infinite recursion because of exit lock */ 310 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 311 __kmp_abort_thread(); */ 312 313 // if we are oversubscribed, 314 // or have waited a bit (and KMP_LIBRARY=throughput, then yield 315 // pause is in the following code 316 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc ); 317 KMP_YIELD_SPIN( spins ); 318 } 319 KMP_FSYNC_SPIN_ACQUIRED( obj ); 320 return r; 321 } 322 323 template< typename UT > 324 static kmp_uint32 __kmp_eq( UT value, UT checker) { 325 return value == checker; 326 } 327 328 template< typename UT > 329 static kmp_uint32 __kmp_neq( UT value, UT checker) { 330 return value != checker; 331 } 332 333 template< typename UT > 334 static kmp_uint32 __kmp_lt( UT value, UT checker) { 335 return value < checker; 336 } 337 338 template< typename UT > 339 static kmp_uint32 __kmp_ge( UT value, UT checker) { 340 return value >= checker; 341 } 342 343 template< typename UT > 344 static kmp_uint32 __kmp_le( UT value, UT checker) { 345 return value <= checker; 346 } 347 348 349 /* ------------------------------------------------------------------------ */ 350 /* ------------------------------------------------------------------------ */ 351 352 static void 353 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 354 { 355 kmp_info_t *th; 356 357 KMP_DEBUG_ASSERT( gtid_ref ); 358 359 if ( __kmp_env_consistency_check ) { 360 th = __kmp_threads[*gtid_ref]; 361 if ( th -> th.th_root -> r.r_active 362 && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) { 363 #if KMP_USE_DYNAMIC_LOCK 364 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 ); 365 #else 366 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL ); 367 #endif 368 } 369 } 370 } 371 372 template< typename UT > 373 static void 374 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 375 { 376 typedef typename traits_t< UT >::signed_t ST; 377 dispatch_private_info_template< UT > * pr; 378 379 int gtid = *gtid_ref; 380 // int cid = *cid_ref; 381 kmp_info_t *th = __kmp_threads[ gtid ]; 382 KMP_DEBUG_ASSERT( th -> th.th_dispatch ); 383 384 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) ); 385 if ( __kmp_env_consistency_check ) { 386 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 387 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 388 if ( pr -> pushed_ws != ct_none ) { 389 #if KMP_USE_DYNAMIC_LOCK 390 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 ); 391 #else 392 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL ); 393 #endif 394 } 395 } 396 397 if ( ! th -> th.th_team -> t.t_serialized ) { 398 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 399 ( th -> th.th_dispatch -> th_dispatch_sh_current ); 400 UT lower; 401 402 if ( ! __kmp_env_consistency_check ) { 403 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 404 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 405 } 406 lower = pr->u.p.ordered_lower; 407 408 #if ! 
defined( KMP_GOMP_COMPAT ) 409 if ( __kmp_env_consistency_check ) { 410 if ( pr->ordered_bumped ) { 411 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons; 412 __kmp_error_construct2( 413 kmp_i18n_msg_CnsMultipleNesting, 414 ct_ordered_in_pdo, loc_ref, 415 & p->stack_data[ p->w_top ] 416 ); 417 } 418 } 419 #endif /* !defined(KMP_GOMP_COMPAT) */ 420 421 KMP_MB(); 422 #ifdef KMP_DEBUG 423 { 424 const char * buff; 425 // create format specifiers before the debug output 426 buff = __kmp_str_format( 427 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n", 428 traits_t< UT >::spec, traits_t< UT >::spec ); 429 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 430 __kmp_str_free( &buff ); 431 } 432 #endif 433 434 __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 435 USE_ITT_BUILD_ARG( NULL ) 436 ); 437 KMP_MB(); /* is this necessary? */ 438 #ifdef KMP_DEBUG 439 { 440 const char * buff; 441 // create format specifiers before the debug output 442 buff = __kmp_str_format( 443 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n", 444 traits_t< UT >::spec, traits_t< UT >::spec ); 445 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 446 __kmp_str_free( &buff ); 447 } 448 #endif 449 } 450 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) ); 451 } 452 453 static void 454 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 455 { 456 kmp_info_t *th; 457 458 if ( __kmp_env_consistency_check ) { 459 th = __kmp_threads[*gtid_ref]; 460 if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) { 461 __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref ); 462 } 463 } 464 } 465 466 template< typename UT > 467 static void 468 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 469 { 470 typedef typename traits_t< UT >::signed_t ST; 471 dispatch_private_info_template< UT > * pr; 472 473 int gtid = *gtid_ref; 474 // int cid = *cid_ref; 475 kmp_info_t *th = __kmp_threads[ gtid ]; 476 KMP_DEBUG_ASSERT( th -> th.th_dispatch ); 477 478 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) ); 479 if ( __kmp_env_consistency_check ) { 480 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 481 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 482 if ( pr -> pushed_ws != ct_none ) { 483 __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref ); 484 } 485 } 486 487 if ( ! th -> th.th_team -> t.t_serialized ) { 488 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 489 ( th -> th.th_dispatch -> th_dispatch_sh_current ); 490 491 if ( ! __kmp_env_consistency_check ) { 492 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 493 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 494 } 495 496 KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration ); 497 #if ! defined( KMP_GOMP_COMPAT ) 498 if ( __kmp_env_consistency_check ) { 499 if ( pr->ordered_bumped != 0 ) { 500 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons; 501 /* How to test it? - OM */ 502 __kmp_error_construct2( 503 kmp_i18n_msg_CnsMultipleNesting, 504 ct_ordered_in_pdo, loc_ref, 505 & p->stack_data[ p->w_top ] 506 ); 507 } 508 } 509 #endif /* !defined(KMP_GOMP_COMPAT) */ 510 511 KMP_MB(); /* Flush all pending memory write invalidates. 
*/

        pr->ordered_bumped += 1;

        KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
                        gtid, pr->ordered_bumped ) );

        KMP_MB();  /* Flush all pending memory write invalidates. */

        /* TODO use general release procedure? */
        test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );

        KMP_MB();  /* Flush all pending memory write invalidates. */
    }
    KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
}

/* Computes and returns x to the power of y, where y must be a non-negative integer */
template< typename UT >
static __forceinline long double
__kmp_pow(long double x, UT y) {
    long double s=1.0L;

    KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
    //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
    while(y) {
        if ( y & 1 )
            s *= x;
        x *= x;
        y >>= 1;
    }
    return s;
}

/* Computes and returns the number of unassigned iterations after idx chunks have been assigned
   (the total number of unassigned iterations in chunks with index greater than or equal to idx).
   __forceinline seems to be broken so that if we __forceinline this function, the behavior is wrong
   (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
*/
template< typename T >
static __inline typename traits_t< T >::unsigned_t
__kmp_dispatch_guided_remaining(
    T                                  tc,
    typename traits_t< T >::floating_t base,
    typename traits_t< T >::unsigned_t idx
) {
    /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
       least for ICL 8.1, long double arithmetic may not really have
       long double precision, even with /Qlong_double.  Currently, we
       workaround that in the caller code, by manipulating the FPCW for
       Windows* OS on IA-32 architecture.  The lack of precision is not
       expected to be a correctness issue, though.
    */
    typedef typename traits_t< T >::unsigned_t  UT;

    long double x = tc * __kmp_pow< UT >(base, idx);
    UT r = (UT) x;
    if ( x == r )
        return r;
    return r + 1;
}

// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
//   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// by default n = 2. For example with n = 3 the chunks distribution will be more flat.
// With n = 1 first chunk is the same as for static schedule, e.g. trip / nproc.
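// Worked example (illustrative values): with the default n = 2, a team of
// nproc = 4 threads and chunk = 7 gives
//     parm2 = 2 * 4 * (7 + 1) = 64     (switch to dynamic-style chunks of 7
//                                       once fewer than 64 iterations remain)
//     parm3 = 0.5 / 4         = 0.125  (until then, each grab takes roughly
//                                       1/8 of the remaining iterations)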
578 static int guided_int_param = 2; 579 static double guided_flt_param = 0.5;// = 1.0 / guided_int_param; 580 581 // UT - unsigned flavor of T, ST - signed flavor of T, 582 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8 583 template< typename T > 584 static void 585 __kmp_dispatch_init( 586 ident_t * loc, 587 int gtid, 588 enum sched_type schedule, 589 T lb, 590 T ub, 591 typename traits_t< T >::signed_t st, 592 typename traits_t< T >::signed_t chunk, 593 int push_ws 594 ) { 595 typedef typename traits_t< T >::unsigned_t UT; 596 typedef typename traits_t< T >::signed_t ST; 597 typedef typename traits_t< T >::floating_t DBL; 598 static const int ___kmp_size_type = sizeof( UT ); 599 600 int active; 601 T tc; 602 kmp_info_t * th; 603 kmp_team_t * team; 604 kmp_uint32 my_buffer_index; 605 dispatch_private_info_template< T > * pr; 606 dispatch_shared_info_template< UT > volatile * sh; 607 608 KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) ); 609 KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) ); 610 611 if ( ! TCR_4( __kmp_init_parallel ) ) 612 __kmp_parallel_initialize(); 613 614 #if INCLUDE_SSC_MARKS 615 SSC_MARK_DISPATCH_INIT(); 616 #endif 617 #ifdef KMP_DEBUG 618 { 619 const char * buff; 620 // create format specifiers before the debug output 621 buff = __kmp_str_format( 622 "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n", 623 traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 624 KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) ); 625 __kmp_str_free( &buff ); 626 } 627 #endif 628 /* setup data */ 629 th = __kmp_threads[ gtid ]; 630 team = th -> th.th_team; 631 active = ! team -> t.t_serialized; 632 th->th.th_ident = loc; 633 634 #if USE_ITT_BUILD 635 kmp_uint64 cur_chunk = chunk; 636 int itt_need_metadata_reporting = __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 637 KMP_MASTER_GTID(gtid) && 638 #if OMP_40_ENABLED 639 th->th.th_teams_microtask == NULL && 640 #endif 641 team->t.t_active_level == 1; 642 #endif 643 if ( ! active ) { 644 pr = reinterpret_cast< dispatch_private_info_template< T >* > 645 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */ 646 } else { 647 KMP_DEBUG_ASSERT( th->th.th_dispatch == 648 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 649 650 my_buffer_index = th->th.th_dispatch->th_disp_index ++; 651 652 /* What happens when number of threads changes, need to resize buffer? 
*/ 653 pr = reinterpret_cast< dispatch_private_info_template< T > * > 654 ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] ); 655 sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * > 656 ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] ); 657 } 658 659 /* Pick up the nomerge/ordered bits from the scheduling type */ 660 if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) { 661 pr->nomerge = TRUE; 662 schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); 663 } else { 664 pr->nomerge = FALSE; 665 } 666 pr->type_size = ___kmp_size_type; // remember the size of variables 667 if ( kmp_ord_lower & schedule ) { 668 pr->ordered = TRUE; 669 schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); 670 } else { 671 pr->ordered = FALSE; 672 } 673 674 if ( schedule == kmp_sch_static ) { 675 schedule = __kmp_static; 676 } else { 677 if ( schedule == kmp_sch_runtime ) { 678 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified) 679 schedule = team -> t.t_sched.r_sched_type; 680 // Detail the schedule if needed (global controls are differentiated appropriately) 681 if ( schedule == kmp_sch_guided_chunked ) { 682 schedule = __kmp_guided; 683 } else if ( schedule == kmp_sch_static ) { 684 schedule = __kmp_static; 685 } 686 // Use the chunk size specified by OMP_SCHEDULE (or default if not specified) 687 chunk = team -> t.t_sched.chunk; 688 689 #ifdef KMP_DEBUG 690 { 691 const char * buff; 692 // create format specifiers before the debug output 693 buff = __kmp_str_format( 694 "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n", 695 traits_t< ST >::spec ); 696 KD_TRACE(10, ( buff, gtid, schedule, chunk ) ); 697 __kmp_str_free( &buff ); 698 } 699 #endif 700 } else { 701 if ( schedule == kmp_sch_guided_chunked ) { 702 schedule = __kmp_guided; 703 } 704 if ( chunk <= 0 ) { 705 chunk = KMP_DEFAULT_CHUNK; 706 } 707 } 708 709 if ( schedule == kmp_sch_auto ) { 710 // mapping and differentiation: in the __kmp_do_serial_initialize() 711 schedule = __kmp_auto; 712 #ifdef KMP_DEBUG 713 { 714 const char * buff; 715 // create format specifiers before the debug output 716 buff = __kmp_str_format( 717 "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n", 718 traits_t< ST >::spec ); 719 KD_TRACE(10, ( buff, gtid, schedule, chunk ) ); 720 __kmp_str_free( &buff ); 721 } 722 #endif 723 } 724 725 /* guided analytical not safe for too many threads */ 726 if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) { 727 schedule = kmp_sch_guided_iterative_chunked; 728 KMP_WARNING( DispatchManyThreads ); 729 } 730 pr->u.p.parm1 = chunk; 731 } 732 KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper), 733 "unknown scheduling type" ); 734 735 pr->u.p.count = 0; 736 737 if ( __kmp_env_consistency_check ) { 738 if ( st == 0 ) { 739 __kmp_error_construct( 740 kmp_i18n_msg_CnsLoopIncrZeroProhibited, 741 ( pr->ordered ? 
ct_pdo_ordered : ct_pdo ), loc 742 ); 743 } 744 } 745 746 tc = ( ub - lb + st ); 747 if ( st != 1 ) { 748 if ( st < 0 ) { 749 if ( lb < ub ) { 750 tc = 0; // zero-trip 751 } else { // lb >= ub 752 tc = (ST)tc / st; // convert to signed division 753 } 754 } else { // st > 0 755 if ( ub < lb ) { 756 tc = 0; // zero-trip 757 } else { // lb >= ub 758 tc /= st; 759 } 760 } 761 } else if ( ub < lb ) { // st == 1 762 tc = 0; // zero-trip 763 } 764 765 // Any half-decent optimizer will remove this test when the blocks are empty since the macros expand to nothing 766 // when statistics are disabled. 767 if (schedule == __kmp_static) 768 { 769 KMP_COUNT_BLOCK(OMP_FOR_static); 770 KMP_COUNT_VALUE(FOR_static_iterations, tc); 771 } 772 else 773 { 774 KMP_COUNT_BLOCK(OMP_FOR_dynamic); 775 KMP_COUNT_VALUE(FOR_dynamic_iterations, tc); 776 } 777 778 pr->u.p.lb = lb; 779 pr->u.p.ub = ub; 780 pr->u.p.st = st; 781 pr->u.p.tc = tc; 782 783 #if KMP_OS_WINDOWS 784 pr->u.p.last_upper = ub + st; 785 #endif /* KMP_OS_WINDOWS */ 786 787 /* NOTE: only the active parallel region(s) has active ordered sections */ 788 789 if ( active ) { 790 if ( pr->ordered == 0 ) { 791 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error; 792 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error; 793 } else { 794 pr->ordered_bumped = 0; 795 796 pr->u.p.ordered_lower = 1; 797 pr->u.p.ordered_upper = 0; 798 799 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >; 800 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >; 801 } 802 } 803 804 if ( __kmp_env_consistency_check ) { 805 enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo; 806 if ( push_ws ) { 807 __kmp_push_workshare( gtid, ws, loc ); 808 pr->pushed_ws = ws; 809 } else { 810 __kmp_check_workshare( gtid, ws, loc ); 811 pr->pushed_ws = ct_none; 812 } 813 } 814 815 switch ( schedule ) { 816 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 817 case kmp_sch_static_steal: 818 { 819 T nproc = team->t.t_nproc; 820 T ntc, init; 821 822 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) ); 823 824 ntc = (tc % chunk ? 1 : 0) + tc / chunk; 825 if ( nproc > 1 && ntc >= nproc ) { 826 T id = __kmp_tid_from_gtid(gtid); 827 T small_chunk, extras; 828 829 small_chunk = ntc / nproc; 830 extras = ntc % nproc; 831 832 init = id * small_chunk + ( id < extras ? id : extras ); 833 pr->u.p.count = init; 834 pr->u.p.ub = init + small_chunk + ( id < extras ? 
1 : 0 ); 835 836 pr->u.p.parm2 = lb; 837 //pr->pfields.parm3 = 0; // it's not used in static_steal 838 pr->u.p.parm4 = id; 839 pr->u.p.st = st; 840 break; 841 } else { 842 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n", 843 gtid ) ); 844 schedule = kmp_sch_static_balanced; 845 /* too few iterations: fall-through to kmp_sch_static_balanced */ 846 } // if 847 /* FALL-THROUGH to static balanced */ 848 } // case 849 #endif 850 case kmp_sch_static_balanced: 851 { 852 T nproc = team->t.t_nproc; 853 T init, limit; 854 855 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n", 856 gtid ) ); 857 858 if ( nproc > 1 ) { 859 T id = __kmp_tid_from_gtid(gtid); 860 861 if ( tc < nproc ) { 862 if ( id < tc ) { 863 init = id; 864 limit = id; 865 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ 866 } else { 867 pr->u.p.count = 1; /* means no more chunks to execute */ 868 pr->u.p.parm1 = FALSE; 869 break; 870 } 871 } else { 872 T small_chunk = tc / nproc; 873 T extras = tc % nproc; 874 init = id * small_chunk + (id < extras ? id : extras); 875 limit = init + small_chunk - (id < extras ? 0 : 1); 876 pr->u.p.parm1 = (id == nproc - 1); 877 } 878 } else { 879 if ( tc > 0 ) { 880 init = 0; 881 limit = tc - 1; 882 pr->u.p.parm1 = TRUE; 883 } else { 884 // zero trip count 885 pr->u.p.count = 1; /* means no more chunks to execute */ 886 pr->u.p.parm1 = FALSE; 887 break; 888 } 889 } 890 #if USE_ITT_BUILD 891 // Calculate chunk for metadata report 892 if ( itt_need_metadata_reporting ) 893 cur_chunk = limit - init + 1; 894 #endif 895 if ( st == 1 ) { 896 pr->u.p.lb = lb + init; 897 pr->u.p.ub = lb + limit; 898 } else { 899 T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound 900 pr->u.p.lb = lb + init * st; 901 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly 902 if ( st > 0 ) { 903 pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp ); 904 } else { 905 pr->u.p.ub = ( ub_tmp + st < ub ? 
ub : ub_tmp ); 906 } 907 } 908 if ( pr->ordered ) { 909 pr->u.p.ordered_lower = init; 910 pr->u.p.ordered_upper = limit; 911 } 912 break; 913 } // case 914 case kmp_sch_guided_iterative_chunked : 915 { 916 T nproc = team->t.t_nproc; 917 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid)); 918 919 if ( nproc > 1 ) { 920 if ( (2L * chunk + 1 ) * nproc >= tc ) { 921 /* chunk size too large, switch to dynamic */ 922 schedule = kmp_sch_dynamic_chunked; 923 } else { 924 // when remaining iters become less than parm2 - switch to dynamic 925 pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 ); 926 *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4 927 } 928 } else { 929 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid)); 930 schedule = kmp_sch_static_greedy; 931 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 932 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid)); 933 pr->u.p.parm1 = tc; 934 } // if 935 } // case 936 break; 937 case kmp_sch_guided_analytical_chunked: 938 { 939 T nproc = team->t.t_nproc; 940 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid)); 941 942 if ( nproc > 1 ) { 943 if ( (2L * chunk + 1 ) * nproc >= tc ) { 944 /* chunk size too large, switch to dynamic */ 945 schedule = kmp_sch_dynamic_chunked; 946 } else { 947 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 948 DBL x; 949 950 #if KMP_OS_WINDOWS && KMP_ARCH_X86 951 /* Linux* OS already has 64-bit computation by default for 952 long double, and on Windows* OS on Intel(R) 64, 953 /Qlong_double doesn't work. On Windows* OS 954 on IA-32 architecture, we need to set precision to 955 64-bit instead of the default 53-bit. Even though long 956 double doesn't work on Windows* OS on Intel(R) 64, the 957 resulting lack of precision is not expected to impact 958 the correctness of the algorithm, but this has not been 959 mathematically proven. 
960 */ 961 // save original FPCW and set precision to 64-bit, as 962 // Windows* OS on IA-32 architecture defaults to 53-bit 963 unsigned int oldFpcw = _control87(0,0); 964 _control87(_PC_64,_MCW_PC); // 0,0x30000 965 #endif 966 /* value used for comparison in solver for cross-over point */ 967 long double target = ((long double)chunk * 2 + 1) * nproc / tc; 968 969 /* crossover point--chunk indexes equal to or greater than 970 this point switch to dynamic-style scheduling */ 971 UT cross; 972 973 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 974 x = (long double)1.0 - (long double)0.5 / nproc; 975 976 #ifdef KMP_DEBUG 977 { // test natural alignment 978 struct _test_a { 979 char a; 980 union { 981 char b; 982 DBL d; 983 }; 984 } t; 985 ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; 986 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment ); 987 KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 ); 988 } 989 #endif // KMP_DEBUG 990 991 /* save the term in thread private dispatch structure */ 992 *(DBL*)&pr->u.p.parm3 = x; 993 994 /* solve for the crossover point to the nearest integer i for which C_i <= chunk */ 995 { 996 UT left, right, mid; 997 long double p; 998 999 /* estimate initial upper and lower bound */ 1000 1001 /* doesn't matter what value right is as long as it is positive, but 1002 it affects performance of the solver 1003 */ 1004 right = 229; 1005 p = __kmp_pow< UT >(x,right); 1006 if ( p > target ) { 1007 do{ 1008 p *= p; 1009 right <<= 1; 1010 } while(p>target && right < (1<<27)); 1011 left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */ 1012 } else { 1013 left = 0; 1014 } 1015 1016 /* bisection root-finding method */ 1017 while ( left + 1 < right ) { 1018 mid = (left + right) / 2; 1019 if ( __kmp_pow< UT >(x,mid) > target ) { 1020 left = mid; 1021 } else { 1022 right = mid; 1023 } 1024 } // while 1025 cross = right; 1026 } 1027 /* assert sanity of computed crossover point */ 1028 KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target); 1029 1030 /* save the crossover point in thread private dispatch structure */ 1031 pr->u.p.parm2 = cross; 1032 1033 // C75803 1034 #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) ) 1035 #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3) 1036 #else 1037 #define GUIDED_ANALYTICAL_WORKAROUND (x) 1038 #endif 1039 /* dynamic-style scheduling offset */ 1040 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk; 1041 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1042 // restore FPCW 1043 _control87(oldFpcw,_MCW_PC); 1044 #endif 1045 } // if 1046 } else { 1047 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n", 1048 gtid ) ); 1049 schedule = kmp_sch_static_greedy; 1050 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 1051 pr->u.p.parm1 = tc; 1052 } // if 1053 } // case 1054 break; 1055 case kmp_sch_static_greedy: 1056 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid)); 1057 pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ? 
1058 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc : 1059 tc; 1060 break; 1061 case kmp_sch_static_chunked : 1062 case kmp_sch_dynamic_chunked : 1063 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid)); 1064 break; 1065 case kmp_sch_trapezoidal : 1066 { 1067 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ 1068 1069 T parm1, parm2, parm3, parm4; 1070 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) ); 1071 1072 parm1 = chunk; 1073 1074 /* F : size of the first cycle */ 1075 parm2 = ( tc / (2 * team->t.t_nproc) ); 1076 1077 if ( parm2 < 1 ) { 1078 parm2 = 1; 1079 } 1080 1081 /* L : size of the last cycle. Make sure the last cycle 1082 * is not larger than the first cycle. 1083 */ 1084 if ( parm1 < 1 ) { 1085 parm1 = 1; 1086 } else if ( parm1 > parm2 ) { 1087 parm1 = parm2; 1088 } 1089 1090 /* N : number of cycles */ 1091 parm3 = ( parm2 + parm1 ); 1092 parm3 = ( 2 * tc + parm3 - 1) / parm3; 1093 1094 if ( parm3 < 2 ) { 1095 parm3 = 2; 1096 } 1097 1098 /* sigma : decreasing incr of the trapezoid */ 1099 parm4 = ( parm3 - 1 ); 1100 parm4 = ( parm2 - parm1 ) / parm4; 1101 1102 // pointless check, because parm4 >= 0 always 1103 //if ( parm4 < 0 ) { 1104 // parm4 = 0; 1105 //} 1106 1107 pr->u.p.parm1 = parm1; 1108 pr->u.p.parm2 = parm2; 1109 pr->u.p.parm3 = parm3; 1110 pr->u.p.parm4 = parm4; 1111 } // case 1112 break; 1113 1114 default: 1115 { 1116 __kmp_msg( 1117 kmp_ms_fatal, // Severity 1118 KMP_MSG( UnknownSchedTypeDetected ), // Primary message 1119 KMP_HNT( GetNewerLibrary ), // Hint 1120 __kmp_msg_null // Variadic argument list terminator 1121 ); 1122 } 1123 break; 1124 } // switch 1125 pr->schedule = schedule; 1126 if ( active ) { 1127 /* The name of this buffer should be my_buffer_index when it's free to use it */ 1128 1129 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n", 1130 gtid, my_buffer_index, sh->buffer_index) ); 1131 __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 > 1132 USE_ITT_BUILD_ARG( NULL ) 1133 ); 1134 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are 1135 // *always* 32-bit integers. 1136 KMP_MB(); /* is this necessary? */ 1137 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n", 1138 gtid, my_buffer_index, sh->buffer_index) ); 1139 1140 th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr; 1141 th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh; 1142 #if USE_ITT_BUILD 1143 if ( pr->ordered ) { 1144 __kmp_itt_ordered_init( gtid ); 1145 }; // if 1146 // Report loop metadata 1147 if ( itt_need_metadata_reporting ) { 1148 // Only report metadata by master of active team at level 1 1149 kmp_uint64 schedtype = 0; 1150 switch ( schedule ) { 1151 case kmp_sch_static_chunked: 1152 case kmp_sch_static_balanced:// Chunk is calculated in the switch above 1153 break; 1154 case kmp_sch_static_greedy: 1155 cur_chunk = pr->u.p.parm1; 1156 break; 1157 case kmp_sch_dynamic_chunked: 1158 schedtype = 1; 1159 break; 1160 case kmp_sch_guided_iterative_chunked: 1161 case kmp_sch_guided_analytical_chunked: 1162 schedtype = 2; 1163 break; 1164 default: 1165 // Should we put this case under "static"? 
1166 // case kmp_sch_static_steal: 1167 schedtype = 3; 1168 break; 1169 } 1170 __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk); 1171 } 1172 #endif /* USE_ITT_BUILD */ 1173 }; // if 1174 1175 #ifdef KMP_DEBUG 1176 { 1177 const char * buff; 1178 // create format specifiers before the debug output 1179 buff = __kmp_str_format( 1180 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \ 1181 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \ 1182 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n", 1183 traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec, 1184 traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec, 1185 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec, 1186 traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec ); 1187 KD_TRACE(10, ( buff, 1188 gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub, 1189 pr->u.p.st, pr->u.p.tc, pr->u.p.count, 1190 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1, 1191 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) ); 1192 __kmp_str_free( &buff ); 1193 } 1194 #endif 1195 #if ( KMP_STATIC_STEAL_ENABLED ) 1196 if ( ___kmp_size_type < 8 ) { 1197 // It cannot be guaranteed that after execution of a loop with some other schedule kind 1198 // all the parm3 variables will contain the same value. 1199 // Even if all parm3 will be the same, it still exists a bad case like using 0 and 1 1200 // rather than program life-time increment. 1201 // So the dedicated variable is required. The 'static_steal_counter' is used. 1202 if( schedule == kmp_sch_static_steal ) { 1203 // Other threads will inspect this variable when searching for a victim. 1204 // This is a flag showing that other threads may steal from this thread since then. 1205 volatile T * p = &pr->u.p.static_steal_counter; 1206 *p = *p + 1; 1207 } 1208 } 1209 #endif // ( KMP_STATIC_STEAL_ENABLED && USE_STEALING ) 1210 1211 #if OMPT_SUPPORT && OMPT_TRACE 1212 if (ompt_enabled && 1213 ompt_callbacks.ompt_callback(ompt_event_loop_begin)) { 1214 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 1215 ompt_task_info_t *task_info = __ompt_get_taskinfo(0); 1216 ompt_callbacks.ompt_callback(ompt_event_loop_begin)( 1217 team_info->parallel_id, task_info->task_id, team_info->microtask); 1218 } 1219 #endif 1220 } 1221 1222 /* 1223 * For ordered loops, either __kmp_dispatch_finish() should be called after 1224 * every iteration, or __kmp_dispatch_finish_chunk() should be called after 1225 * every chunk of iterations. If the ordered section(s) were not executed 1226 * for this iteration (or every iteration in this chunk), we need to set the 1227 * ordered iteration counters so that the next thread can proceed. 1228 */ 1229 template< typename UT > 1230 static void 1231 __kmp_dispatch_finish( int gtid, ident_t *loc ) 1232 { 1233 typedef typename traits_t< UT >::signed_t ST; 1234 kmp_info_t *th = __kmp_threads[ gtid ]; 1235 1236 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) ); 1237 if ( ! 
th -> th.th_team -> t.t_serialized ) { 1238 1239 dispatch_private_info_template< UT > * pr = 1240 reinterpret_cast< dispatch_private_info_template< UT >* > 1241 ( th->th.th_dispatch->th_dispatch_pr_current ); 1242 dispatch_shared_info_template< UT > volatile * sh = 1243 reinterpret_cast< dispatch_shared_info_template< UT >volatile* > 1244 ( th->th.th_dispatch->th_dispatch_sh_current ); 1245 KMP_DEBUG_ASSERT( pr ); 1246 KMP_DEBUG_ASSERT( sh ); 1247 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1248 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1249 1250 if ( pr->ordered_bumped ) { 1251 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1252 gtid ) ); 1253 pr->ordered_bumped = 0; 1254 } else { 1255 UT lower = pr->u.p.ordered_lower; 1256 1257 #ifdef KMP_DEBUG 1258 { 1259 const char * buff; 1260 // create format specifiers before the debug output 1261 buff = __kmp_str_format( 1262 "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n", 1263 traits_t< UT >::spec, traits_t< UT >::spec ); 1264 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 1265 __kmp_str_free( &buff ); 1266 } 1267 #endif 1268 1269 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 1270 USE_ITT_BUILD_ARG(NULL) 1271 ); 1272 KMP_MB(); /* is this necessary? */ 1273 #ifdef KMP_DEBUG 1274 { 1275 const char * buff; 1276 // create format specifiers before the debug output 1277 buff = __kmp_str_format( 1278 "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n", 1279 traits_t< UT >::spec, traits_t< UT >::spec ); 1280 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 1281 __kmp_str_free( &buff ); 1282 } 1283 #endif 1284 1285 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration ); 1286 } // if 1287 } // if 1288 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) ); 1289 } 1290 1291 #ifdef KMP_GOMP_COMPAT 1292 1293 template< typename UT > 1294 static void 1295 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc ) 1296 { 1297 typedef typename traits_t< UT >::signed_t ST; 1298 kmp_info_t *th = __kmp_threads[ gtid ]; 1299 1300 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) ); 1301 if ( ! 
th -> th.th_team -> t.t_serialized ) { 1302 // int cid; 1303 dispatch_private_info_template< UT > * pr = 1304 reinterpret_cast< dispatch_private_info_template< UT >* > 1305 ( th->th.th_dispatch->th_dispatch_pr_current ); 1306 dispatch_shared_info_template< UT > volatile * sh = 1307 reinterpret_cast< dispatch_shared_info_template< UT >volatile* > 1308 ( th->th.th_dispatch->th_dispatch_sh_current ); 1309 KMP_DEBUG_ASSERT( pr ); 1310 KMP_DEBUG_ASSERT( sh ); 1311 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1312 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1313 1314 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) { 1315 UT lower = pr->u.p.ordered_lower; 1316 UT upper = pr->u.p.ordered_upper; 1317 UT inc = upper - lower + 1; 1318 1319 if ( pr->ordered_bumped == inc ) { 1320 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1321 gtid ) ); 1322 pr->ordered_bumped = 0; 1323 } else { 1324 inc -= pr->ordered_bumped; 1325 1326 #ifdef KMP_DEBUG 1327 { 1328 const char * buff; 1329 // create format specifiers before the debug output 1330 buff = __kmp_str_format( 1331 "__kmp_dispatch_finish_chunk: T#%%d before wait: " \ 1332 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n", 1333 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec ); 1334 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) ); 1335 __kmp_str_free( &buff ); 1336 } 1337 #endif 1338 1339 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 1340 USE_ITT_BUILD_ARG(NULL) 1341 ); 1342 1343 KMP_MB(); /* is this necessary? */ 1344 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n", 1345 gtid ) ); 1346 pr->ordered_bumped = 0; 1347 //!!!!! TODO check if the inc should be unsigned, or signed??? 1348 #ifdef KMP_DEBUG 1349 { 1350 const char * buff; 1351 // create format specifiers before the debug output 1352 buff = __kmp_str_format( 1353 "__kmp_dispatch_finish_chunk: T#%%d after wait: " \ 1354 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n", 1355 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec ); 1356 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) ); 1357 __kmp_str_free( &buff ); 1358 } 1359 #endif 1360 1361 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc); 1362 } 1363 // } 1364 } 1365 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) ); 1366 } 1367 1368 #endif /* KMP_GOMP_COMPAT */ 1369 1370 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 1371 * (no more work), then tell OMPT the loop is over. In some cases 1372 * kmp_dispatch_fini() is not called. 
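 * The status == 0 guard means the ompt_event_loop_end callback fires only when
 * __kmp_dispatch_next() returns with no more work for the calling thread.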
 */
#if OMPT_SUPPORT && OMPT_TRACE
#define OMPT_LOOP_END                                                        \
    if (status == 0) {                                                       \
        if (ompt_enabled &&                                                  \
            ompt_callbacks.ompt_callback(ompt_event_loop_end)) {             \
            ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);      \
            ompt_task_info_t *task_info = __ompt_get_taskinfo(0);            \
            ompt_callbacks.ompt_callback(ompt_event_loop_end)(               \
                team_info->parallel_id, task_info->task_id);                 \
        }                                                                    \
    }
#else
#define OMPT_LOOP_END // no-op
#endif

template< typename T >
static int
__kmp_dispatch_next(
    ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
) {

    typedef typename traits_t< T >::unsigned_t  UT;
    typedef typename traits_t< T >::signed_t    ST;
    typedef typename traits_t< T >::floating_t  DBL;
#if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
    static const int ___kmp_size_type = sizeof( UT );
#endif

    // This is potentially slightly misleading, schedule(runtime) will appear here even if the actual
    // runtime schedule is static. (Which points out a disadvantage of schedule(runtime): even when
    // static scheduling is used it costs more than a compile time choice to use static scheduling would.)
    KMP_TIME_BLOCK(FOR_dynamic_scheduling);

    int status;
    dispatch_private_info_template< T > * pr;
    kmp_info_t * th   = __kmp_threads[ gtid ];
    kmp_team_t * team = th -> th.th_team;

    KMP_DEBUG_ASSERT( p_lb && p_ub && p_st ); // AC: these cannot be NULL
#ifdef KMP_DEBUG
    {
        const char * buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
            traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
        KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ?
*p_st : 0, p_last ) ); 1420 __kmp_str_free( &buff ); 1421 } 1422 #endif 1423 1424 if ( team -> t.t_serialized ) { 1425 /* NOTE: serialize this dispatch becase we are not at the active level */ 1426 pr = reinterpret_cast< dispatch_private_info_template< T >* > 1427 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */ 1428 KMP_DEBUG_ASSERT( pr ); 1429 1430 if ( (status = (pr->u.p.tc != 0)) == 0 ) { 1431 *p_lb = 0; 1432 *p_ub = 0; 1433 // if ( p_last != NULL ) 1434 // *p_last = 0; 1435 if ( p_st != NULL ) 1436 *p_st = 0; 1437 if ( __kmp_env_consistency_check ) { 1438 if ( pr->pushed_ws != ct_none ) { 1439 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 1440 } 1441 } 1442 } else if ( pr->nomerge ) { 1443 kmp_int32 last; 1444 T start; 1445 UT limit, trip, init; 1446 ST incr; 1447 T chunk = pr->u.p.parm1; 1448 1449 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) ); 1450 1451 init = chunk * pr->u.p.count++; 1452 trip = pr->u.p.tc - 1; 1453 1454 if ( (status = (init <= trip)) == 0 ) { 1455 *p_lb = 0; 1456 *p_ub = 0; 1457 // if ( p_last != NULL ) 1458 // *p_last = 0; 1459 if ( p_st != NULL ) 1460 *p_st = 0; 1461 if ( __kmp_env_consistency_check ) { 1462 if ( pr->pushed_ws != ct_none ) { 1463 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 1464 } 1465 } 1466 } else { 1467 start = pr->u.p.lb; 1468 limit = chunk + init - 1; 1469 incr = pr->u.p.st; 1470 1471 if ( (last = (limit >= trip)) != 0 ) { 1472 limit = trip; 1473 #if KMP_OS_WINDOWS 1474 pr->u.p.last_upper = pr->u.p.ub; 1475 #endif /* KMP_OS_WINDOWS */ 1476 } 1477 if ( p_last != NULL ) 1478 *p_last = last; 1479 if ( p_st != NULL ) 1480 *p_st = incr; 1481 if ( incr == 1 ) { 1482 *p_lb = start + init; 1483 *p_ub = start + limit; 1484 } else { 1485 *p_lb = start + init * incr; 1486 *p_ub = start + limit * incr; 1487 } 1488 1489 if ( pr->ordered ) { 1490 pr->u.p.ordered_lower = init; 1491 pr->u.p.ordered_upper = limit; 1492 #ifdef KMP_DEBUG 1493 { 1494 const char * buff; 1495 // create format specifiers before the debug output 1496 buff = __kmp_str_format( 1497 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1498 traits_t< UT >::spec, traits_t< UT >::spec ); 1499 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1500 __kmp_str_free( &buff ); 1501 } 1502 #endif 1503 } // if 1504 } // if 1505 } else { 1506 pr->u.p.tc = 0; 1507 *p_lb = pr->u.p.lb; 1508 *p_ub = pr->u.p.ub; 1509 #if KMP_OS_WINDOWS 1510 pr->u.p.last_upper = *p_ub; 1511 #endif /* KMP_OS_WINDOWS */ 1512 if ( p_last != NULL ) 1513 *p_last = TRUE; 1514 if ( p_st != NULL ) 1515 *p_st = pr->u.p.st; 1516 } // if 1517 #ifdef KMP_DEBUG 1518 { 1519 const char * buff; 1520 // create format specifiers before the debug output 1521 buff = __kmp_str_format( 1522 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \ 1523 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", 1524 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 1525 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) ); 1526 __kmp_str_free( &buff ); 1527 } 1528 #endif 1529 #if INCLUDE_SSC_MARKS 1530 SSC_MARK_DISPATCH_NEXT(); 1531 #endif 1532 OMPT_LOOP_END; 1533 return status; 1534 } else { 1535 kmp_int32 last = 0; 1536 dispatch_shared_info_template< UT > *sh; 1537 T start; 1538 ST incr; 1539 UT limit, trip, init; 1540 1541 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1542 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1543 1544 pr = reinterpret_cast< 
dispatch_private_info_template< T >* > 1545 ( th->th.th_dispatch->th_dispatch_pr_current ); 1546 KMP_DEBUG_ASSERT( pr ); 1547 sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 1548 ( th->th.th_dispatch->th_dispatch_sh_current ); 1549 KMP_DEBUG_ASSERT( sh ); 1550 1551 if ( pr->u.p.tc == 0 ) { 1552 // zero trip count 1553 status = 0; 1554 } else { 1555 switch (pr->schedule) { 1556 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 1557 case kmp_sch_static_steal: 1558 { 1559 T chunk = pr->u.p.parm1; 1560 1561 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) ); 1562 1563 trip = pr->u.p.tc - 1; 1564 1565 if ( ___kmp_size_type > 4 ) { 1566 // Other threads do not look into the data of this thread, 1567 // so it's not necessary to make volatile casting. 1568 init = ( pr->u.p.count )++; 1569 status = ( init < (UT)pr->u.p.ub ); 1570 } else { 1571 typedef union { 1572 struct { 1573 UT count; 1574 T ub; 1575 } p; 1576 kmp_int64 b; 1577 } union_i4; 1578 // All operations on 'count' or 'ub' must be combined atomically together. 1579 // stealing implemented only for 4-byte indexes 1580 { 1581 union_i4 vold, vnew; 1582 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count); 1583 vnew = vold; 1584 vnew.p.count++; 1585 while( ! KMP_COMPARE_AND_STORE_ACQ64( 1586 ( volatile kmp_int64* )&pr->u.p.count, 1587 *VOLATILE_CAST(kmp_int64 *)&vold.b, 1588 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) { 1589 KMP_CPU_PAUSE(); 1590 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count); 1591 vnew = vold; 1592 vnew.p.count++; 1593 } 1594 vnew = vold; 1595 init = vnew.p.count; 1596 status = ( init < (UT)vnew.p.ub ) ; 1597 } 1598 1599 if( !status ) { 1600 kmp_info_t **other_threads = team->t.t_threads; 1601 int while_limit = 10; 1602 int while_index = 0; 1603 1604 // TODO: algorithm of searching for a victim 1605 // should be cleaned up and measured 1606 while ( ( !status ) && ( while_limit != ++while_index ) ) { 1607 union_i4 vold, vnew; 1608 kmp_int32 remaining; // kmp_int32 because KMP_I4 only 1609 T victimIdx = pr->u.p.parm4; 1610 T oldVictimIdx = victimIdx; 1611 dispatch_private_info_template< T > * victim; 1612 1613 do { 1614 if( !victimIdx ) { 1615 victimIdx = team->t.t_nproc - 1; 1616 } else { 1617 --victimIdx; 1618 } 1619 victim = reinterpret_cast< dispatch_private_info_template< T >* > 1620 ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current ); 1621 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx ); 1622 // TODO: think about a proper place of this test 1623 if ( ( !victim ) || 1624 ( (*( volatile T * )&victim->u.p.static_steal_counter) != 1625 (*( volatile T * )&pr->u.p.static_steal_counter) ) ) { 1626 // TODO: delay would be nice 1627 continue; 1628 // the victim is not ready yet to participate in stealing 1629 // because the victim is still in kmp_init_dispatch 1630 } 1631 if ( oldVictimIdx == victimIdx ) { 1632 break; 1633 } 1634 pr->u.p.parm4 = victimIdx; 1635 1636 while( 1 ) { 1637 vold.b = *( volatile kmp_int64 * )( &victim->u.p.count ); 1638 vnew = vold; 1639 1640 KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip ); 1641 if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) { 1642 break; 1643 } 1644 vnew.p.ub -= (remaining >> 2); 1645 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1646 #pragma warning( push ) 1647 // disable warning on pointless comparison of unsigned with 0 1648 #pragma warning( disable: 186 ) 1649 KMP_DEBUG_ASSERT(vnew.p.ub >= 0); 1650 #pragma warning( pop ) 1651 // TODO: Should 
this be acquire or release? 1652 if ( KMP_COMPARE_AND_STORE_ACQ64( 1653 ( volatile kmp_int64 * )&victim->u.p.count, 1654 *VOLATILE_CAST(kmp_int64 *)&vold.b, 1655 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) { 1656 status = 1; 1657 while_index = 0; 1658 // now update own count and ub 1659 #if KMP_ARCH_X86 1660 // stealing executed on non-KMP_ARCH_X86 only 1661 // Atomic 64-bit write on ia32 is 1662 // unavailable, so we do this in steps. 1663 // This code is not tested. 1664 init = vold.p.count; 1665 pr->u.p.ub = 0; 1666 pr->u.p.count = init + 1; 1667 pr->u.p.ub = vnew.p.count; 1668 #else 1669 init = vnew.p.ub; 1670 vold.p.count = init + 1; 1671 // TODO: is it safe and enough? 1672 *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b; 1673 #endif // KMP_ARCH_X86 1674 break; 1675 } // if 1676 KMP_CPU_PAUSE(); 1677 } // while (1) 1678 } // while 1679 } // if 1680 } // if 1681 if ( !status ) { 1682 *p_lb = 0; 1683 *p_ub = 0; 1684 if ( p_st != NULL ) *p_st = 0; 1685 } else { 1686 start = pr->u.p.parm2; 1687 init *= chunk; 1688 limit = chunk + init - 1; 1689 incr = pr->u.p.st; 1690 1691 KMP_DEBUG_ASSERT(init <= trip); 1692 if ( (last = (limit >= trip)) != 0 ) 1693 limit = trip; 1694 if ( p_st != NULL ) *p_st = incr; 1695 1696 if ( incr == 1 ) { 1697 *p_lb = start + init; 1698 *p_ub = start + limit; 1699 } else { 1700 *p_lb = start + init * incr; 1701 *p_ub = start + limit * incr; 1702 } 1703 1704 if ( pr->ordered ) { 1705 pr->u.p.ordered_lower = init; 1706 pr->u.p.ordered_upper = limit; 1707 #ifdef KMP_DEBUG 1708 { 1709 const char * buff; 1710 // create format specifiers before the debug output 1711 buff = __kmp_str_format( 1712 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1713 traits_t< UT >::spec, traits_t< UT >::spec ); 1714 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1715 __kmp_str_free( &buff ); 1716 } 1717 #endif 1718 } // if 1719 } // if 1720 break; 1721 } // case 1722 #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 1723 case kmp_sch_static_balanced: 1724 { 1725 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) ); 1726 if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */ 1727 pr->u.p.count = 1; 1728 *p_lb = pr->u.p.lb; 1729 *p_ub = pr->u.p.ub; 1730 last = pr->u.p.parm1; 1731 if ( p_st != NULL ) 1732 *p_st = pr->u.p.st; 1733 } else { /* no iterations to do */ 1734 pr->u.p.lb = pr->u.p.ub + pr->u.p.st; 1735 } 1736 if ( pr->ordered ) { 1737 #ifdef KMP_DEBUG 1738 { 1739 const char * buff; 1740 // create format specifiers before the debug output 1741 buff = __kmp_str_format( 1742 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1743 traits_t< UT >::spec, traits_t< UT >::spec ); 1744 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1745 __kmp_str_free( &buff ); 1746 } 1747 #endif 1748 } // if 1749 } // case 1750 break; 1751 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */ 1752 case kmp_sch_static_chunked: 1753 { 1754 T parm1; 1755 1756 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n", 1757 gtid ) ); 1758 parm1 = pr->u.p.parm1; 1759 1760 trip = pr->u.p.tc - 1; 1761 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid)); 1762 1763 if ( (status = (init <= trip)) != 0 ) { 1764 start = pr->u.p.lb; 1765 incr = pr->u.p.st; 1766 limit = parm1 + init - 1; 1767 1768 if ( (last = (limit >= trip)) != 0 ) 1769 limit = trip; 1770 1771 if ( p_st != NULL ) 
*p_st = incr; 1772 1773 pr->u.p.count += team->t.t_nproc; 1774 1775 if ( incr == 1 ) { 1776 *p_lb = start + init; 1777 *p_ub = start + limit; 1778 } 1779 else { 1780 *p_lb = start + init * incr; 1781 *p_ub = start + limit * incr; 1782 } 1783 1784 if ( pr->ordered ) { 1785 pr->u.p.ordered_lower = init; 1786 pr->u.p.ordered_upper = limit; 1787 #ifdef KMP_DEBUG 1788 { 1789 const char * buff; 1790 // create format specifiers before the debug output 1791 buff = __kmp_str_format( 1792 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1793 traits_t< UT >::spec, traits_t< UT >::spec ); 1794 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1795 __kmp_str_free( &buff ); 1796 } 1797 #endif 1798 } // if 1799 } // if 1800 } // case 1801 break; 1802 1803 case kmp_sch_dynamic_chunked: 1804 { 1805 T chunk = pr->u.p.parm1; 1806 1807 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", 1808 gtid ) ); 1809 1810 init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration ); 1811 trip = pr->u.p.tc - 1; 1812 1813 if ( (status = (init <= trip)) == 0 ) { 1814 *p_lb = 0; 1815 *p_ub = 0; 1816 if ( p_st != NULL ) *p_st = 0; 1817 } else { 1818 start = pr->u.p.lb; 1819 limit = chunk + init - 1; 1820 incr = pr->u.p.st; 1821 1822 if ( (last = (limit >= trip)) != 0 ) 1823 limit = trip; 1824 1825 if ( p_st != NULL ) *p_st = incr; 1826 1827 if ( incr == 1 ) { 1828 *p_lb = start + init; 1829 *p_ub = start + limit; 1830 } else { 1831 *p_lb = start + init * incr; 1832 *p_ub = start + limit * incr; 1833 } 1834 1835 if ( pr->ordered ) { 1836 pr->u.p.ordered_lower = init; 1837 pr->u.p.ordered_upper = limit; 1838 #ifdef KMP_DEBUG 1839 { 1840 const char * buff; 1841 // create format specifiers before the debug output 1842 buff = __kmp_str_format( 1843 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1844 traits_t< UT >::spec, traits_t< UT >::spec ); 1845 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1846 __kmp_str_free( &buff ); 1847 } 1848 #endif 1849 } // if 1850 } // if 1851 } // case 1852 break; 1853 1854 case kmp_sch_guided_iterative_chunked: 1855 { 1856 T chunkspec = pr->u.p.parm1; 1857 KD_TRACE(100, 1858 ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid)); 1859 trip = pr->u.p.tc; 1860 // Start atomic part of calculations 1861 while(1) { 1862 ST remaining; // signed, because can be < 0 1863 init = sh->u.s.iteration; // shared value 1864 remaining = trip - init; 1865 if ( remaining <= 0 ) { // AC: need to compare with 0 first 1866 // nothing to do, don't try atomic op 1867 status = 0; 1868 break; 1869 } 1870 if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default 1871 // use dynamic-style schedule 1872 // atomically increment iterations, get old value 1873 init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec ); 1874 remaining = trip - init; 1875 if (remaining <= 0) { 1876 status = 0; // all iterations were taken by other threads 1877 } else { 1878 // got some iterations to work on 1879 status = 1; 1880 if ( (T)remaining > chunkspec ) { 1881 limit = init + chunkspec - 1; 1882 } else { 1883 last = 1; // the last chunk 1884 limit = init + remaining - 1; 1885 } // if 1886 } // if 1887 break; 1888 } // if 1889 limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc 1890 if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) { 1891 // CAS was successful, chunk obtained 1892 status = 1;
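                    // Note: the successful CAS above published `limit` as the new value of
                    // sh->u.s.iteration, i.e. the first iteration left for the other threads;
                    // the decrement below converts it into this thread's inclusive upper bound.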
1893 --limit; 1894 break; 1895 } // if 1896 } // while 1897 if ( status != 0 ) { 1898 start = pr->u.p.lb; 1899 incr = pr->u.p.st; 1900 if ( p_st != NULL ) 1901 *p_st = incr; 1902 *p_lb = start + init * incr; 1903 *p_ub = start + limit * incr; 1904 if ( pr->ordered ) { 1905 pr->u.p.ordered_lower = init; 1906 pr->u.p.ordered_upper = limit; 1907 #ifdef KMP_DEBUG 1908 { 1909 const char * buff; 1910 // create format specifiers before the debug output 1911 buff = __kmp_str_format( 1912 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1913 traits_t< UT >::spec, traits_t< UT >::spec ); 1914 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1915 __kmp_str_free( &buff ); 1916 } 1917 #endif 1918 } // if 1919 } else { 1920 *p_lb = 0; 1921 *p_ub = 0; 1922 if ( p_st != NULL ) 1923 *p_st = 0; 1924 } // if 1925 } // case 1926 break; 1927 1928 case kmp_sch_guided_analytical_chunked: 1929 { 1930 T chunkspec = pr->u.p.parm1; 1931 UT chunkIdx; 1932 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1933 /* for storing original FPCW value for Windows* OS on 1934 IA-32 architecture 8-byte version */ 1935 unsigned int oldFpcw; 1936 unsigned int fpcwSet = 0; 1937 #endif 1938 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n", 1939 gtid ) ); 1940 1941 trip = pr->u.p.tc; 1942 1943 KMP_DEBUG_ASSERT(team->t.t_nproc > 1); 1944 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip); 1945 1946 while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */ 1947 chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration ); 1948 if ( chunkIdx >= (UT)pr->u.p.parm2 ) { 1949 --trip; 1950 /* use dynamic-style scheduling */ 1951 init = chunkIdx * chunkspec + pr->u.p.count; 1952 /* need to verify init > 0 in case of overflow in the above calculation */ 1953 if ( (status = (init > 0 && init <= trip)) != 0 ) { 1954 limit = init + chunkspec -1; 1955 1956 if ( (last = (limit >= trip)) != 0 ) 1957 limit = trip; 1958 } 1959 break; 1960 } else { 1961 /* use exponential-style scheduling */ 1962 /* The following check is to workaround the lack of long double precision on Windows* OS. 1963 This check works around the possible effect that init != 0 for chunkIdx == 0. 
1964 */ 1965 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1966 /* If we haven't already done so, save original 1967 FPCW and set precision to 64-bit, as Windows* OS 1968 on IA-32 architecture defaults to 53-bit */ 1969 if ( !fpcwSet ) { 1970 oldFpcw = _control87(0,0); 1971 _control87(_PC_64,_MCW_PC); 1972 fpcwSet = 0x30000; 1973 } 1974 #endif 1975 if ( chunkIdx ) { 1976 init = __kmp_dispatch_guided_remaining< T >( 1977 trip, *( DBL * )&pr->u.p.parm3, chunkIdx ); 1978 KMP_DEBUG_ASSERT(init); 1979 init = trip - init; 1980 } else 1981 init = 0; 1982 limit = trip - __kmp_dispatch_guided_remaining< T >( 1983 trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 ); 1984 KMP_ASSERT(init <= limit); 1985 if ( init < limit ) { 1986 KMP_DEBUG_ASSERT(limit <= trip); 1987 --limit; 1988 status = 1; 1989 break; 1990 } // if 1991 } // if 1992 } // while (1) 1993 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1994 /* restore FPCW if necessary 1995 AC: check fpcwSet flag first because oldFpcw can be uninitialized here 1996 */ 1997 if ( fpcwSet && ( oldFpcw & fpcwSet ) ) 1998 _control87(oldFpcw,_MCW_PC); 1999 #endif 2000 if ( status != 0 ) { 2001 start = pr->u.p.lb; 2002 incr = pr->u.p.st; 2003 if ( p_st != NULL ) 2004 *p_st = incr; 2005 *p_lb = start + init * incr; 2006 *p_ub = start + limit * incr; 2007 if ( pr->ordered ) { 2008 pr->u.p.ordered_lower = init; 2009 pr->u.p.ordered_upper = limit; 2010 #ifdef KMP_DEBUG 2011 { 2012 const char * buff; 2013 // create format specifiers before the debug output 2014 buff = __kmp_str_format( 2015 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 2016 traits_t< UT >::spec, traits_t< UT >::spec ); 2017 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 2018 __kmp_str_free( &buff ); 2019 } 2020 #endif 2021 } 2022 } else { 2023 *p_lb = 0; 2024 *p_ub = 0; 2025 if ( p_st != NULL ) 2026 *p_st = 0; 2027 } 2028 } // case 2029 break; 2030 2031 case kmp_sch_trapezoidal: 2032 { 2033 UT index; 2034 T parm2 = pr->u.p.parm2; 2035 T parm3 = pr->u.p.parm3; 2036 T parm4 = pr->u.p.parm4; 2037 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n", 2038 gtid ) ); 2039 2040 index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration ); 2041 2042 init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2; 2043 trip = pr->u.p.tc - 1; 2044 2045 if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) { 2046 *p_lb = 0; 2047 *p_ub = 0; 2048 if ( p_st != NULL ) *p_st = 0; 2049 } else { 2050 start = pr->u.p.lb; 2051 limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1; 2052 incr = pr->u.p.st; 2053 2054 if ( (last = (limit >= trip)) != 0 ) 2055 limit = trip; 2056 2057 if ( p_st != NULL ) *p_st = incr; 2058 2059 if ( incr == 1 ) { 2060 *p_lb = start + init; 2061 *p_ub = start + limit; 2062 } else { 2063 *p_lb = start + init * incr; 2064 *p_ub = start + limit * incr; 2065 } 2066 2067 if ( pr->ordered ) { 2068 pr->u.p.ordered_lower = init; 2069 pr->u.p.ordered_upper = limit; 2070 #ifdef KMP_DEBUG 2071 { 2072 const char * buff; 2073 // create format specifiers before the debug output 2074 buff = __kmp_str_format( 2075 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 2076 traits_t< UT >::spec, traits_t< UT >::spec ); 2077 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 2078 __kmp_str_free( &buff ); 2079 } 2080 #endif 2081 } // if 2082 } // if 2083 } // case 2084 break; 2085 default: 2086 { 2087 status = 0; // to avoid complaints on uninitialized variable use 2088 __kmp_msg( 2089 kmp_ms_fatal, // Severity 2090 
KMP_MSG( UnknownSchedTypeDetected ), // Primary message 2091 KMP_HNT( GetNewerLibrary ), // Hint 2092 __kmp_msg_null // Variadic argument list terminator 2093 ); 2094 } 2095 break; 2096 } // switch 2097 } // if tc == 0; 2098 2099 if ( status == 0 ) { 2100 UT num_done; 2101 2102 num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done ); 2103 #ifdef KMP_DEBUG 2104 { 2105 const char * buff; 2106 // create format specifiers before the debug output 2107 buff = __kmp_str_format( 2108 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", 2109 traits_t< UT >::spec ); 2110 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) ); 2111 __kmp_str_free( &buff ); 2112 } 2113 #endif 2114 2115 if ( (ST)num_done == team->t.t_nproc-1 ) { 2116 /* NOTE: release this buffer to be reused */ 2117 2118 KMP_MB(); /* Flush all pending memory write invalidates. */ 2119 2120 sh->u.s.num_done = 0; 2121 sh->u.s.iteration = 0; 2122 2123 /* TODO replace with general release procedure? */ 2124 if ( pr->ordered ) { 2125 sh->u.s.ordered_iteration = 0; 2126 } 2127 2128 KMP_MB(); /* Flush all pending memory write invalidates. */ 2129 2130 sh -> buffer_index += KMP_MAX_DISP_BUF; 2131 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", 2132 gtid, sh->buffer_index) ); 2133 2134 KMP_MB(); /* Flush all pending memory write invalidates. */ 2135 2136 } // if 2137 if ( __kmp_env_consistency_check ) { 2138 if ( pr->pushed_ws != ct_none ) { 2139 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 2140 } 2141 } 2142 2143 th -> th.th_dispatch -> th_deo_fcn = NULL; 2144 th -> th.th_dispatch -> th_dxo_fcn = NULL; 2145 th -> th.th_dispatch -> th_dispatch_sh_current = NULL; 2146 th -> th.th_dispatch -> th_dispatch_pr_current = NULL; 2147 } // if (status == 0) 2148 #if KMP_OS_WINDOWS 2149 else if ( last ) { 2150 pr->u.p.last_upper = pr->u.p.ub; 2151 } 2152 #endif /* KMP_OS_WINDOWS */ 2153 if ( p_last != NULL && status != 0 ) 2154 *p_last = last; 2155 } // if 2156 2157 #ifdef KMP_DEBUG 2158 { 2159 const char * buff; 2160 // create format specifiers before the debug output 2161 buff = __kmp_str_format( 2162 "__kmp_dispatch_next: T#%%d normal case: " \ 2163 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n", 2164 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 2165 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? 
*p_st : 0, p_last, status ) ); 2166 __kmp_str_free( &buff ); 2167 } 2168 #endif 2169 #if INCLUDE_SSC_MARKS 2170 SSC_MARK_DISPATCH_NEXT(); 2171 #endif 2172 OMPT_LOOP_END; 2173 return status; 2174 } 2175 2176 template< typename T > 2177 static void 2178 __kmp_dist_get_bounds( 2179 ident_t *loc, 2180 kmp_int32 gtid, 2181 kmp_int32 *plastiter, 2182 T *plower, 2183 T *pupper, 2184 typename traits_t< T >::signed_t incr 2185 ) { 2186 typedef typename traits_t< T >::unsigned_t UT; 2187 typedef typename traits_t< T >::signed_t ST; 2188 register kmp_uint32 team_id; 2189 register kmp_uint32 nteams; 2190 register UT trip_count; 2191 register kmp_team_t *team; 2192 kmp_info_t * th; 2193 2194 KMP_DEBUG_ASSERT( plastiter && plower && pupper ); 2195 KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid)); 2196 #ifdef KMP_DEBUG 2197 { 2198 const char * buff; 2199 // create format specifiers before the debug output 2200 buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\ 2201 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n", 2202 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec, 2203 traits_t< T >::spec ); 2204 KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) ); 2205 __kmp_str_free( &buff ); 2206 } 2207 #endif 2208 2209 if( __kmp_env_consistency_check ) { 2210 if( incr == 0 ) { 2211 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc ); 2212 } 2213 if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) { 2214 // The loop is illegal. 2215 // Some zero-trip loops maintained by compiler, e.g.: 2216 // for(i=10;i<0;++i) // lower >= upper - run-time check 2217 // for(i=0;i>10;--i) // lower <= upper - run-time check 2218 // for(i=0;i>10;++i) // incr > 0 - compile-time check 2219 // for(i=10;i<0;--i) // incr < 0 - compile-time check 2220 // Compiler does not check the following illegal loops: 2221 // for(i=0;i<10;i+=incr) // where incr<0 2222 // for(i=10;i>0;i-=incr) // where incr<0 2223 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc ); 2224 } 2225 } 2226 th = __kmp_threads[gtid]; 2227 team = th->th.th_team; 2228 #if OMP_40_ENABLED 2229 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct 2230 nteams = th->th.th_teams_size.nteams; 2231 #endif 2232 team_id = team->t.t_master_tid; 2233 KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc); 2234 2235 // compute global trip count 2236 if( incr == 1 ) { 2237 trip_count = *pupper - *plower + 1; 2238 } else if(incr == -1) { 2239 trip_count = *plower - *pupper + 1; 2240 } else { 2241 trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case 2242 } 2243 2244 if( trip_count <= nteams ) { 2245 KMP_DEBUG_ASSERT( 2246 __kmp_static == kmp_sch_static_greedy || \ 2247 __kmp_static == kmp_sch_static_balanced 2248 ); // Unknown static scheduling type. 2249 // only some teams get single iteration, others get nothing 2250 if( team_id < trip_count ) { 2251 *pupper = *plower = *plower + team_id * incr; 2252 } else { 2253 *plower = *pupper + incr; // zero-trip loop 2254 } 2255 if( plastiter != NULL ) 2256 *plastiter = ( team_id == trip_count - 1 ); 2257 } else { 2258 if( __kmp_static == kmp_sch_static_balanced ) { 2259 register UT chunk = trip_count / nteams; 2260 register UT extras = trip_count % nteams; 2261 *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) ); 2262 *pupper = *plower + chunk * incr - ( team_id < extras ? 
0 : incr ); 2263 if( plastiter != NULL ) 2264 *plastiter = ( team_id == nteams - 1 ); 2265 } else { 2266 register T chunk_inc_count = 2267 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr; 2268 register T upper = *pupper; 2269 KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy ); 2270 // Unknown static scheduling type. 2271 *plower += team_id * chunk_inc_count; 2272 *pupper = *plower + chunk_inc_count - incr; 2273 // Check/correct bounds if needed 2274 if( incr > 0 ) { 2275 if( *pupper < *plower ) 2276 *pupper = i_maxmin< T >::mx; 2277 if( plastiter != NULL ) 2278 *plastiter = *plower <= upper && *pupper > upper - incr; 2279 if( *pupper > upper ) 2280 *pupper = upper; // tracker C73258 2281 } else { 2282 if( *pupper > *plower ) 2283 *pupper = i_maxmin< T >::mn; 2284 if( plastiter != NULL ) 2285 *plastiter = *plower >= upper && *pupper < upper - incr; 2286 if( *pupper < upper ) 2287 *pupper = upper; // tracker C73258 2288 } 2289 } 2290 } 2291 } 2292 2293 //----------------------------------------------------------------------------------------- 2294 // Dispatch routines 2295 // Transfer call to template< type T > 2296 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule, 2297 // T lb, T ub, ST st, ST chunk ) 2298 extern "C" { 2299 2300 /*! 2301 @ingroup WORK_SHARING 2302 @{ 2303 @param loc Source location 2304 @param gtid Global thread id 2305 @param schedule Schedule type 2306 @param lb Lower bound 2307 @param ub Upper bound 2308 @param st Step (or increment if you prefer) 2309 @param chunk The chunk size to block with 2310 2311 This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments. 2312 These functions are all identical apart from the types of the arguments. 2313 */ 2314 2315 void 2316 __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2317 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk ) 2318 { 2319 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2320 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2321 } 2322 /*! 2323 See @ref __kmpc_dispatch_init_4 2324 */ 2325 void 2326 __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2327 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk ) 2328 { 2329 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2330 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2331 } 2332 2333 /*! 2334 See @ref __kmpc_dispatch_init_4 2335 */ 2336 void 2337 __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2338 kmp_int64 lb, kmp_int64 ub, 2339 kmp_int64 st, kmp_int64 chunk ) 2340 { 2341 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2342 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2343 } 2344 2345 /*! 2346 See @ref __kmpc_dispatch_init_4 2347 */ 2348 void 2349 __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2350 kmp_uint64 lb, kmp_uint64 ub, 2351 kmp_int64 st, kmp_int64 chunk ) 2352 { 2353 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2354 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2355 } 2356 2357 /*! 2358 See @ref __kmpc_dispatch_init_4 2359 2360 Difference from __kmpc_dispatch_init set of functions is these functions 2361 are called for composite distribute parallel for construct. Thus before 2362 regular iterations dispatching we need to calc per-team iteration space. 
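Each of these entry points first narrows [lb, ub] to the calling team's sub-range via
__kmp_dist_get_bounds() and then initializes regular dispatching on that sub-range, as the
definitions below show.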
2363 2364 These functions are all identical apart from the types of the arguments. 2365 */ 2366 void 2367 __kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2368 kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk ) 2369 { 2370 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2371 __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st ); 2372 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2373 } 2374 2375 void 2376 __kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2377 kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk ) 2378 { 2379 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2380 __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st ); 2381 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2382 } 2383 2384 void 2385 __kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2386 kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk ) 2387 { 2388 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2389 __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st ); 2390 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2391 } 2392 2393 void 2394 __kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2395 kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk ) 2396 { 2397 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2398 __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st ); 2399 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2400 } 2401 2402 /*! 2403 @param loc Source code location 2404 @param gtid Global thread id 2405 @param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise 2406 @param p_lb Pointer to the lower bound for the next chunk of work 2407 @param p_ub Pointer to the upper bound for the next chunk of work 2408 @param p_st Pointer to the stride for the next chunk of work 2409 @return one if there is work to be done, zero otherwise 2410 2411 Get the next dynamically allocated chunk of work for this thread. 2412 If there is no more work, then the lb,ub and stride need not be modified. 2413 */ 2414 int 2415 __kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2416 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st ) 2417 { 2418 return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st ); 2419 } 2420 2421 /*! 2422 See @ref __kmpc_dispatch_next_4 2423 */ 2424 int 2425 __kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2426 kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st ) 2427 { 2428 return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st ); 2429 } 2430 2431 /*! 2432 See @ref __kmpc_dispatch_next_4 2433 */ 2434 int 2435 __kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2436 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st ) 2437 { 2438 return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st ); 2439 } 2440 2441 /*! 2442 See @ref __kmpc_dispatch_next_4 2443 */ 2444 int 2445 __kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2446 kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st ) 2447 { 2448 return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st ); 2449 } 2450 2451 /*! 
2452 @param loc Source code location 2453 @param gtid Global thread id 2454 2455 Mark the end of a dynamic loop. 2456 */ 2457 void 2458 __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid ) 2459 { 2460 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc ); 2461 } 2462 2463 /*! 2464 See @ref __kmpc_dispatch_fini_4 2465 */ 2466 void 2467 __kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid ) 2468 { 2469 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc ); 2470 } 2471 2472 /*! 2473 See @ref __kmpc_dispatch_fini_4 2474 */ 2475 void 2476 __kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid ) 2477 { 2478 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc ); 2479 } 2480 2481 /*! 2482 See @ref __kmpc_dispatch_fini_4 2483 */ 2484 void 2485 __kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid ) 2486 { 2487 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc ); 2488 } 2489 /*! @} */ 2490 2491 //----------------------------------------------------------------------------------------- 2492 //Non-template routines from kmp_dispatch.c used in other sources 2493 2494 kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) { 2495 return value == checker; 2496 } 2497 2498 kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) { 2499 return value != checker; 2500 } 2501 2502 kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) { 2503 return value < checker; 2504 } 2505 2506 kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) { 2507 return value >= checker; 2508 } 2509 2510 kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) { 2511 return value <= checker; 2512 } 2513 kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) { 2514 return value == checker; 2515 } 2516 2517 kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) { 2518 return value != checker; 2519 } 2520 2521 kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) { 2522 return value < checker; 2523 } 2524 2525 kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) { 2526 return value >= checker; 2527 } 2528 2529 kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) { 2530 return value <= checker; 2531 } 2532 2533 kmp_uint32 2534 __kmp_wait_yield_4(volatile kmp_uint32 * spinner, 2535 kmp_uint32 checker, 2536 kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 ) 2537 , void * obj // Higher-level synchronization object, or NULL. 2538 ) 2539 { 2540 // note: we may not belong to a team at this point 2541 register volatile kmp_uint32 * spin = spinner; 2542 register kmp_uint32 check = checker; 2543 register kmp_uint32 spins; 2544 register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred; 2545 register kmp_uint32 r; 2546 2547 KMP_FSYNC_SPIN_INIT( obj, (void*) spin ); 2548 KMP_INIT_YIELD( spins ); 2549 // main wait spin loop 2550 while(!f(r = TCR_4(*spin), check)) { 2551 KMP_FSYNC_SPIN_PREPARE( obj ); 2552 /* GEH - remove this since it was accidentally introduced when kmp_wait was split. 
2553 It causes problems with infinite recursion because of exit lock */ 2554 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 2555 __kmp_abort_thread(); */ 2556 2557 /* if we have waited a bit, or are oversubscribed, yield */ 2558 /* pause is in the following code */ 2559 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc ); 2560 KMP_YIELD_SPIN( spins ); 2561 } 2562 KMP_FSYNC_SPIN_ACQUIRED( obj ); 2563 return r; 2564 } 2565 2566 kmp_uint64 2567 __kmp_wait_yield_8( volatile kmp_uint64 * spinner, 2568 kmp_uint64 checker, 2569 kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 ) 2570 , void * obj // Higher-level synchronization object, or NULL. 2571 ) 2572 { 2573 // note: we may not belong to a team at this point 2574 register volatile kmp_uint64 * spin = spinner; 2575 register kmp_uint64 check = checker; 2576 register kmp_uint32 spins; 2577 register kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred; 2578 register kmp_uint64 r; 2579 2580 KMP_FSYNC_SPIN_INIT( obj, (void*) spin ); 2581 KMP_INIT_YIELD( spins ); 2582 // main wait spin loop 2583 while(!f(r = *spin, check)) 2584 { 2585 KMP_FSYNC_SPIN_PREPARE( obj ); 2586 /* GEH - remove this since it was accidentally introduced when kmp_wait was split. 2587 It causes problems with infinite recursion because of exit lock */ 2588 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 2589 __kmp_abort_thread(); */ 2590 2591 // if we are oversubscribed, 2592 // or have waited a bit (and KMP_LIBARRY=throughput, then yield 2593 // pause is in the following code 2594 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc ); 2595 KMP_YIELD_SPIN( spins ); 2596 } 2597 KMP_FSYNC_SPIN_ACQUIRED( obj ); 2598 return r; 2599 } 2600 2601 } // extern "C" 2602 2603 #ifdef KMP_GOMP_COMPAT 2604 2605 void 2606 __kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2607 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, 2608 kmp_int32 chunk, int push_ws ) 2609 { 2610 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, 2611 push_ws ); 2612 } 2613 2614 void 2615 __kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2616 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, 2617 kmp_int32 chunk, int push_ws ) 2618 { 2619 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, 2620 push_ws ); 2621 } 2622 2623 void 2624 __kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2625 kmp_int64 lb, kmp_int64 ub, kmp_int64 st, 2626 kmp_int64 chunk, int push_ws ) 2627 { 2628 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, 2629 push_ws ); 2630 } 2631 2632 void 2633 __kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2634 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, 2635 kmp_int64 chunk, int push_ws ) 2636 { 2637 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, 2638 push_ws ); 2639 } 2640 2641 void 2642 __kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid ) 2643 { 2644 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc ); 2645 } 2646 2647 void 2648 __kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid ) 2649 { 2650 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc ); 2651 } 2652 2653 void 2654 __kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid ) 2655 { 2656 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc ); 2657 } 2658 2659 void 2660 __kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid ) 2661 { 2662 __kmp_dispatch_finish_chunk< kmp_uint64 >( 
gtid, loc ); 2663 } 2664 2665 #endif /* KMP_GOMP_COMPAT */ 2666 2667 /* ------------------------------------------------------------------------ */ 2668 /* ------------------------------------------------------------------------ */ 2669 2670
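/*
 * Illustrative sketch only (kept out of the build, not part of the library): the dispatch
 * entry points above are normally driven by compiler-generated code. For a loop such as
 *     #pragma omp for schedule(dynamic, chunk)
 * the generated pattern is roughly one init call followed by a dispatch_next loop, as in
 * the hypothetical helper below. All local names here are for illustration only.
 */
#if 0
static void example_dynamic_loop_driver( ident_t *loc, kmp_int32 gtid,
                                         kmp_int32 lower, kmp_int32 upper,
                                         kmp_int32 stride, kmp_int32 chunk )
{
    kmp_int32 lb, ub, st, last;
    __kmpc_dispatch_init_4( loc, gtid, kmp_sch_dynamic_chunked,
                            lower, upper, stride, chunk );
    // Each successful __kmpc_dispatch_next_4() call yields one chunk [lb, ub]
    // (inclusive bounds) with stride st; a zero return means no work is left.
    while ( __kmpc_dispatch_next_4( loc, gtid, &last, &lb, &ub, &st ) ) {
        for ( kmp_int32 i = lb; i <= ub; i += st ) {
            /* loop body */
        }
    }
}
#endif // illustrative sketch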