/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 */


//===----------------------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


/*
 * Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
 *       it may change values between parallel regions.  __kmp_max_nth
 *       is the largest value __kmp_nth may take, 1 is the smallest.
 *
 */

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_str.h"
#include "kmp_error.h"
#include "kmp_stats.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
#include <float.h>
#endif

#if OMPT_SUPPORT
#include "ompt-internal.h"
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

// template for type limits
template< typename T >
struct i_maxmin {
    static const T mx;
    static const T mn;
};
template<>
struct i_maxmin< int > {
    static const int mx = 0x7fffffff;
    static const int mn = 0x80000000;
};
template<>
struct i_maxmin< unsigned int > {
    static const unsigned int mx = 0xffffffff;
    static const unsigned int mn = 0x00000000;
};
template<>
struct i_maxmin< long long > {
    static const long long mx = 0x7fffffffffffffffLL;
    static const long long mn = 0x8000000000000000LL;
};
template<>
struct i_maxmin< unsigned long long > {
    static const unsigned long long mx = 0xffffffffffffffffLL;
    static const unsigned long long mn = 0x0000000000000000LL;
};
//-------------------------------------------------------------------------

#ifdef KMP_STATIC_STEAL_ENABLED

// replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
template< typename T >
struct dispatch_private_infoXX_template {
    typedef typename traits_t< T >::unsigned_t  UT;
    typedef typename traits_t< T >::signed_t    ST;
    UT count;                // unsigned
    T  ub;
    /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
    T  lb;
    ST st;                   // signed
    UT tc;                   // unsigned
    T  static_steal_counter; // for static_steal only; maybe better to put after ub

    /* parm[1-4] are used in different ways by different scheduling algorithms */

    // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
    //    a) parm3 is properly aligned and
    //    b) all parm1-4 are in the same cache line.
    // Because parm1-4 are used together, performance seems to be better
    // if they are in the same line (not measured though).
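    // Added note (illustrative, not from the original source): with a typical
    // 64-byte cache line, the parm1-4 group below spans 16 bytes for 4-byte
    // index types and 32 bytes for 8-byte index types, so aligning the group to
    // a 32-byte boundary is enough to keep it from straddling a line boundary.
    // A minimal sketch of that arithmetic, assuming a 64-byte line:
#if 0
    // Hypothetical compile-time check, deliberately excluded from the build.
    static_assert( 4 * sizeof( long long ) <= 32 && 64 % 32 == 0,
                   "a 32-byte aligned group of four 8-byte parms fits in one 64-byte cache line" );
#endif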
96 97 struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4 98 T parm1; 99 T parm2; 100 T parm3; 101 T parm4; 102 }; 103 104 UT ordered_lower; // unsigned 105 UT ordered_upper; // unsigned 106 #if KMP_OS_WINDOWS 107 T last_upper; 108 #endif /* KMP_OS_WINDOWS */ 109 }; 110 111 #else /* KMP_STATIC_STEAL_ENABLED */ 112 113 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types 114 template< typename T > 115 struct dispatch_private_infoXX_template { 116 typedef typename traits_t< T >::unsigned_t UT; 117 typedef typename traits_t< T >::signed_t ST; 118 T lb; 119 T ub; 120 ST st; // signed 121 UT tc; // unsigned 122 123 T parm1; 124 T parm2; 125 T parm3; 126 T parm4; 127 128 UT count; // unsigned 129 130 UT ordered_lower; // unsigned 131 UT ordered_upper; // unsigned 132 #if KMP_OS_WINDOWS 133 T last_upper; 134 #endif /* KMP_OS_WINDOWS */ 135 }; 136 137 #endif /* KMP_STATIC_STEAL_ENABLED */ 138 139 // replaces dispatch_private_info structure and dispatch_private_info_t type 140 template< typename T > 141 struct KMP_ALIGN_CACHE dispatch_private_info_template { 142 // duplicate alignment here, otherwise size of structure is not correct in our compiler 143 union KMP_ALIGN_CACHE private_info_tmpl { 144 dispatch_private_infoXX_template< T > p; 145 dispatch_private_info64_t p64; 146 } u; 147 enum sched_type schedule; /* scheduling algorithm */ 148 kmp_uint32 ordered; /* ordered clause specified */ 149 kmp_uint32 ordered_bumped; 150 kmp_int32 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making order 151 dispatch_private_info * next; /* stack of buffers for nest of serial regions */ 152 kmp_uint32 nomerge; /* don't merge iters if serialized */ 153 kmp_uint32 type_size; 154 enum cons_type pushed_ws; 155 }; 156 157 158 // replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types 159 template< typename UT > 160 struct dispatch_shared_infoXX_template { 161 /* chunk index under dynamic, number of idle threads under static-steal; 162 iteration index otherwise */ 163 volatile UT iteration; 164 volatile UT num_done; 165 volatile UT ordered_iteration; 166 UT ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar 167 }; 168 169 // replaces dispatch_shared_info structure and dispatch_shared_info_t type 170 template< typename UT > 171 struct dispatch_shared_info_template { 172 // we need union here to keep the structure size 173 union shared_info_tmpl { 174 dispatch_shared_infoXX_template< UT > s; 175 dispatch_shared_info64_t s64; 176 } u; 177 volatile kmp_uint32 buffer_index; 178 }; 179 180 /* ------------------------------------------------------------------------ */ 181 /* ------------------------------------------------------------------------ */ 182 183 #undef USE_TEST_LOCKS 184 185 // test_then_add template (general template should NOT be used) 186 template< typename T > 187 static __forceinline T 188 test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); }; 189 190 template<> 191 __forceinline kmp_int32 192 test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d ) 193 { 194 kmp_int32 r; 195 r = KMP_TEST_THEN_ADD32( p, d ); 196 return r; 197 } 198 199 template<> 200 __forceinline kmp_int64 201 test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d ) 202 { 203 kmp_int64 r; 204 r = KMP_TEST_THEN_ADD64( p, d ); 205 return r; 206 } 207 208 // test_then_inc_acq template (general template should NOT be used) 209 template< typename T > 210 static 
__forceinline T
test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };

template<>
__forceinline kmp_int32
test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
{
    kmp_int32 r;
    r = KMP_TEST_THEN_INC_ACQ32( p );
    return r;
}

template<>
__forceinline kmp_int64
test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
{
    kmp_int64 r;
    r = KMP_TEST_THEN_INC_ACQ64( p );
    return r;
}

// test_then_inc template (general template should NOT be used)
template< typename T >
static __forceinline T
test_then_inc( volatile T *p ) { KMP_ASSERT(0); };

template<>
__forceinline kmp_int32
test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
{
    kmp_int32 r;
    r = KMP_TEST_THEN_INC32( p );
    return r;
}

template<>
__forceinline kmp_int64
test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
{
    kmp_int64 r;
    r = KMP_TEST_THEN_INC64( p );
    return r;
}

// compare_and_swap template (general template should NOT be used)
template< typename T >
static __forceinline kmp_int32
compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };

template<>
__forceinline kmp_int32
compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
{
    return KMP_COMPARE_AND_STORE_REL32( p, c, s );
}

template<>
__forceinline kmp_int32
compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
{
    return KMP_COMPARE_AND_STORE_REL64( p, c, s );
}

/*
    Spin wait loop that first does pause, then yield.
    Waits until the predicate returns non-zero when called with *spinner and check.
    Does NOT put threads to sleep.
#if USE_ITT_BUILD
    Arguments:
        obj -- the higher-level synchronization object to report to ittnotify. It is used to report
            locks consistently. For example, if the lock is acquired immediately, its address is
            reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
            immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the same
            address, not the address of the low-level spinner.
#endif // USE_ITT_BUILD
*/
template< typename UT >
// ToDo: make inline function (move to header file for icl)
static UT  // unsigned 4- or 8-byte type
__kmp_wait_yield( volatile UT * spinner,
                  UT            checker,
                  kmp_uint32 (* pred)( UT, UT )
                  USE_ITT_BUILD_ARG(void * obj)   // Higher-level synchronization object, or NULL.
                  )
{
    // note: we may not belong to a team at this point
    register volatile UT * spin = spinner;
    register UT check = checker;
    register kmp_uint32 spins;
    register kmp_uint32 (*f) ( UT, UT ) = pred;
    register UT r;

    KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
    KMP_INIT_YIELD( spins );
    // main wait spin loop
    while(!f(r = *spin, check))
    {
        KMP_FSYNC_SPIN_PREPARE( obj );
        /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
309 It causes problems with infinite recursion because of exit lock */ 310 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 311 __kmp_abort_thread(); */ 312 313 // if we are oversubscribed, 314 // or have waited a bit (and KMP_LIBRARY=throughput, then yield 315 // pause is in the following code 316 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc ); 317 KMP_YIELD_SPIN( spins ); 318 } 319 KMP_FSYNC_SPIN_ACQUIRED( obj ); 320 return r; 321 } 322 323 template< typename UT > 324 static kmp_uint32 __kmp_eq( UT value, UT checker) { 325 return value == checker; 326 } 327 328 template< typename UT > 329 static kmp_uint32 __kmp_neq( UT value, UT checker) { 330 return value != checker; 331 } 332 333 template< typename UT > 334 static kmp_uint32 __kmp_lt( UT value, UT checker) { 335 return value < checker; 336 } 337 338 template< typename UT > 339 static kmp_uint32 __kmp_ge( UT value, UT checker) { 340 return value >= checker; 341 } 342 343 template< typename UT > 344 static kmp_uint32 __kmp_le( UT value, UT checker) { 345 return value <= checker; 346 } 347 348 349 /* ------------------------------------------------------------------------ */ 350 /* ------------------------------------------------------------------------ */ 351 352 static void 353 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 354 { 355 kmp_info_t *th; 356 357 KMP_DEBUG_ASSERT( gtid_ref ); 358 359 if ( __kmp_env_consistency_check ) { 360 th = __kmp_threads[*gtid_ref]; 361 if ( th -> th.th_root -> r.r_active 362 && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) { 363 #if KMP_USE_DYNAMIC_LOCK 364 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 ); 365 #else 366 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL ); 367 #endif 368 } 369 } 370 } 371 372 template< typename UT > 373 static void 374 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 375 { 376 typedef typename traits_t< UT >::signed_t ST; 377 dispatch_private_info_template< UT > * pr; 378 379 int gtid = *gtid_ref; 380 // int cid = *cid_ref; 381 kmp_info_t *th = __kmp_threads[ gtid ]; 382 KMP_DEBUG_ASSERT( th -> th.th_dispatch ); 383 384 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) ); 385 if ( __kmp_env_consistency_check ) { 386 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 387 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 388 if ( pr -> pushed_ws != ct_none ) { 389 #if KMP_USE_DYNAMIC_LOCK 390 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 ); 391 #else 392 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL ); 393 #endif 394 } 395 } 396 397 if ( ! th -> th.th_team -> t.t_serialized ) { 398 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 399 ( th -> th.th_dispatch -> th_dispatch_sh_current ); 400 UT lower; 401 402 if ( ! __kmp_env_consistency_check ) { 403 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 404 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 405 } 406 lower = pr->u.p.ordered_lower; 407 408 #if ! 
defined( KMP_GOMP_COMPAT ) 409 if ( __kmp_env_consistency_check ) { 410 if ( pr->ordered_bumped ) { 411 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons; 412 __kmp_error_construct2( 413 kmp_i18n_msg_CnsMultipleNesting, 414 ct_ordered_in_pdo, loc_ref, 415 & p->stack_data[ p->w_top ] 416 ); 417 } 418 } 419 #endif /* !defined(KMP_GOMP_COMPAT) */ 420 421 KMP_MB(); 422 #ifdef KMP_DEBUG 423 { 424 const char * buff; 425 // create format specifiers before the debug output 426 buff = __kmp_str_format( 427 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n", 428 traits_t< UT >::spec, traits_t< UT >::spec ); 429 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 430 __kmp_str_free( &buff ); 431 } 432 #endif 433 434 __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 435 USE_ITT_BUILD_ARG( NULL ) 436 ); 437 KMP_MB(); /* is this necessary? */ 438 #ifdef KMP_DEBUG 439 { 440 const char * buff; 441 // create format specifiers before the debug output 442 buff = __kmp_str_format( 443 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n", 444 traits_t< UT >::spec, traits_t< UT >::spec ); 445 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 446 __kmp_str_free( &buff ); 447 } 448 #endif 449 } 450 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) ); 451 } 452 453 static void 454 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 455 { 456 kmp_info_t *th; 457 458 if ( __kmp_env_consistency_check ) { 459 th = __kmp_threads[*gtid_ref]; 460 if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) { 461 __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref ); 462 } 463 } 464 } 465 466 template< typename UT > 467 static void 468 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 469 { 470 typedef typename traits_t< UT >::signed_t ST; 471 dispatch_private_info_template< UT > * pr; 472 473 int gtid = *gtid_ref; 474 // int cid = *cid_ref; 475 kmp_info_t *th = __kmp_threads[ gtid ]; 476 KMP_DEBUG_ASSERT( th -> th.th_dispatch ); 477 478 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) ); 479 if ( __kmp_env_consistency_check ) { 480 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 481 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 482 if ( pr -> pushed_ws != ct_none ) { 483 __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref ); 484 } 485 } 486 487 if ( ! th -> th.th_team -> t.t_serialized ) { 488 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 489 ( th -> th.th_dispatch -> th_dispatch_sh_current ); 490 491 if ( ! __kmp_env_consistency_check ) { 492 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 493 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 494 } 495 496 KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration ); 497 #if ! defined( KMP_GOMP_COMPAT ) 498 if ( __kmp_env_consistency_check ) { 499 if ( pr->ordered_bumped != 0 ) { 500 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons; 501 /* How to test it? - OM */ 502 __kmp_error_construct2( 503 kmp_i18n_msg_CnsMultipleNesting, 504 ct_ordered_in_pdo, loc_ref, 505 & p->stack_data[ p->w_top ] 506 ); 507 } 508 } 509 #endif /* !defined(KMP_GOMP_COMPAT) */ 510 511 KMP_MB(); /* Flush all pending memory write invalidates. 
*/

        pr->ordered_bumped += 1;

        KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
                        gtid, pr->ordered_bumped ) );

        KMP_MB();  /* Flush all pending memory write invalidates. */

        /* TODO use general release procedure? */
        test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );

        KMP_MB();  /* Flush all pending memory write invalidates. */
    }
    KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
}

/* Computes and returns x to the power of y, where y must be a non-negative integer */
template< typename UT >
static __forceinline long double
__kmp_pow(long double x, UT y) {
    long double s=1.0L;

    KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
    //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
    while(y) {
        if ( y & 1 )
            s *= x;
        x *= x;
        y >>= 1;
    }
    return s;
}

/* Computes and returns the number of unassigned iterations after idx chunks have been assigned
   (the total number of unassigned iterations in chunks with index greater than or equal to idx).
   __forceinline seems to be broken so that if we __forceinline this function, the behavior is wrong
   (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
*/
template< typename T >
static __inline typename traits_t< T >::unsigned_t
__kmp_dispatch_guided_remaining(
    T                                  tc,
    typename traits_t< T >::floating_t base,
    typename traits_t< T >::unsigned_t idx
) {
    /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
       least for ICL 8.1, long double arithmetic may not really have
       long double precision, even with /Qlong_double.  Currently, we
       work around that in the caller code, by manipulating the FPCW for
       Windows* OS on IA-32 architecture.  The lack of precision is not
       expected to be a correctness issue, though.
    */
    typedef typename traits_t< T >::unsigned_t  UT;

    long double x = tc * __kmp_pow< UT >(base, idx);
    UT r = (UT) x;
    if ( x == r )
        return r;
    return r + 1;
}

// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
//   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// by default n = 2. For example, with n = 3 the chunk distribution will be flatter.
// With n = 1 the first chunk is the same as for a static schedule, i.e. trip / nproc.
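// Added illustration (hypothetical standalone sketch, not part of the library):
// with the defaults defined just below (n = guided_int_param = 2,
// guided_flt_param = 1/n), each grab under the guided-iterative scheme takes
// roughly remaining / (n * nproc) iterations until fewer than
// n * nproc * (chunk + 1) iterations remain, after which scheduling falls back
// to plain dynamic chunks of size 'chunk'.  The nproc/chunk/trip values below
// are made up for the example.
#if 0
#include <cstdio>
int main() {
    const long n = 2, nproc = 4, chunk = 4;
    long remaining = 1000;                          // unassigned iterations
    const long switch_point = n * nproc * (chunk + 1);
    while (remaining > 0) {
        long next = (remaining < switch_point)
                        ? (remaining < chunk ? remaining : chunk)  // dynamic tail
                        : remaining / (n * nproc);                 // guided part
        std::printf("grab %ld iterations, %ld left\n", next, remaining - next);
        remaining -= next;
    }
    return 0;
}
#endif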
578 static int guided_int_param = 2; 579 static double guided_flt_param = 0.5;// = 1.0 / guided_int_param; 580 581 // UT - unsigned flavor of T, ST - signed flavor of T, 582 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8 583 template< typename T > 584 static void 585 __kmp_dispatch_init( 586 ident_t * loc, 587 int gtid, 588 enum sched_type schedule, 589 T lb, 590 T ub, 591 typename traits_t< T >::signed_t st, 592 typename traits_t< T >::signed_t chunk, 593 int push_ws 594 ) { 595 typedef typename traits_t< T >::unsigned_t UT; 596 typedef typename traits_t< T >::signed_t ST; 597 typedef typename traits_t< T >::floating_t DBL; 598 static const int ___kmp_size_type = sizeof( UT ); 599 600 int active; 601 T tc; 602 kmp_info_t * th; 603 kmp_team_t * team; 604 kmp_uint32 my_buffer_index; 605 dispatch_private_info_template< T > * pr; 606 dispatch_shared_info_template< UT > volatile * sh; 607 608 KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) ); 609 KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) ); 610 611 if ( ! TCR_4( __kmp_init_parallel ) ) 612 __kmp_parallel_initialize(); 613 614 #if INCLUDE_SSC_MARKS 615 SSC_MARK_DISPATCH_INIT(); 616 #endif 617 #ifdef KMP_DEBUG 618 { 619 const char * buff; 620 // create format specifiers before the debug output 621 buff = __kmp_str_format( 622 "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n", 623 traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 624 KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) ); 625 __kmp_str_free( &buff ); 626 } 627 #endif 628 /* setup data */ 629 th = __kmp_threads[ gtid ]; 630 team = th -> th.th_team; 631 active = ! team -> t.t_serialized; 632 th->th.th_ident = loc; 633 634 #if USE_ITT_BUILD 635 kmp_uint64 cur_chunk = chunk; 636 int itt_need_metadata_reporting = __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 637 KMP_MASTER_GTID(gtid) && 638 #if OMP_40_ENABLED 639 th->th.th_teams_microtask == NULL && 640 #endif 641 team->t.t_active_level == 1; 642 #endif 643 if ( ! active ) { 644 pr = reinterpret_cast< dispatch_private_info_template< T >* > 645 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */ 646 } else { 647 KMP_DEBUG_ASSERT( th->th.th_dispatch == 648 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 649 650 my_buffer_index = th->th.th_dispatch->th_disp_index ++; 651 652 /* What happens when number of threads changes, need to resize buffer? 
*/ 653 pr = reinterpret_cast< dispatch_private_info_template< T > * > 654 ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] ); 655 sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * > 656 ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] ); 657 } 658 659 /* Pick up the nomerge/ordered bits from the scheduling type */ 660 if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) { 661 pr->nomerge = TRUE; 662 schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); 663 } else { 664 pr->nomerge = FALSE; 665 } 666 pr->type_size = ___kmp_size_type; // remember the size of variables 667 if ( kmp_ord_lower & schedule ) { 668 pr->ordered = TRUE; 669 schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); 670 } else { 671 pr->ordered = FALSE; 672 } 673 674 if ( schedule == kmp_sch_static ) { 675 schedule = __kmp_static; 676 } else { 677 if ( schedule == kmp_sch_runtime ) { 678 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified) 679 schedule = team -> t.t_sched.r_sched_type; 680 // Detail the schedule if needed (global controls are differentiated appropriately) 681 if ( schedule == kmp_sch_guided_chunked ) { 682 schedule = __kmp_guided; 683 } else if ( schedule == kmp_sch_static ) { 684 schedule = __kmp_static; 685 } 686 // Use the chunk size specified by OMP_SCHEDULE (or default if not specified) 687 chunk = team -> t.t_sched.chunk; 688 #if USE_ITT_BUILD 689 cur_chunk = chunk; 690 #endif 691 #ifdef KMP_DEBUG 692 { 693 const char * buff; 694 // create format specifiers before the debug output 695 buff = __kmp_str_format( 696 "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n", 697 traits_t< ST >::spec ); 698 KD_TRACE(10, ( buff, gtid, schedule, chunk ) ); 699 __kmp_str_free( &buff ); 700 } 701 #endif 702 } else { 703 if ( schedule == kmp_sch_guided_chunked ) { 704 schedule = __kmp_guided; 705 } 706 if ( chunk <= 0 ) { 707 chunk = KMP_DEFAULT_CHUNK; 708 } 709 } 710 711 if ( schedule == kmp_sch_auto ) { 712 // mapping and differentiation: in the __kmp_do_serial_initialize() 713 schedule = __kmp_auto; 714 #ifdef KMP_DEBUG 715 { 716 const char * buff; 717 // create format specifiers before the debug output 718 buff = __kmp_str_format( 719 "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n", 720 traits_t< ST >::spec ); 721 KD_TRACE(10, ( buff, gtid, schedule, chunk ) ); 722 __kmp_str_free( &buff ); 723 } 724 #endif 725 } 726 727 /* guided analytical not safe for too many threads */ 728 if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) { 729 schedule = kmp_sch_guided_iterative_chunked; 730 KMP_WARNING( DispatchManyThreads ); 731 } 732 pr->u.p.parm1 = chunk; 733 } 734 KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper), 735 "unknown scheduling type" ); 736 737 pr->u.p.count = 0; 738 739 if ( __kmp_env_consistency_check ) { 740 if ( st == 0 ) { 741 __kmp_error_construct( 742 kmp_i18n_msg_CnsLoopIncrZeroProhibited, 743 ( pr->ordered ? 
ct_pdo_ordered : ct_pdo ), loc 744 ); 745 } 746 } 747 748 tc = ( ub - lb + st ); 749 if ( st != 1 ) { 750 if ( st < 0 ) { 751 if ( lb < ub ) { 752 tc = 0; // zero-trip 753 } else { // lb >= ub 754 tc = (ST)tc / st; // convert to signed division 755 } 756 } else { // st > 0 757 if ( ub < lb ) { 758 tc = 0; // zero-trip 759 } else { // lb >= ub 760 tc /= st; 761 } 762 } 763 } else if ( ub < lb ) { // st == 1 764 tc = 0; // zero-trip 765 } 766 767 // Any half-decent optimizer will remove this test when the blocks are empty since the macros expand to nothing 768 // when statistics are disabled. 769 if (schedule == __kmp_static) 770 { 771 KMP_COUNT_BLOCK(OMP_FOR_static); 772 KMP_COUNT_VALUE(FOR_static_iterations, tc); 773 } 774 else 775 { 776 KMP_COUNT_BLOCK(OMP_FOR_dynamic); 777 KMP_COUNT_VALUE(FOR_dynamic_iterations, tc); 778 } 779 780 pr->u.p.lb = lb; 781 pr->u.p.ub = ub; 782 pr->u.p.st = st; 783 pr->u.p.tc = tc; 784 785 #if KMP_OS_WINDOWS 786 pr->u.p.last_upper = ub + st; 787 #endif /* KMP_OS_WINDOWS */ 788 789 /* NOTE: only the active parallel region(s) has active ordered sections */ 790 791 if ( active ) { 792 if ( pr->ordered == 0 ) { 793 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error; 794 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error; 795 } else { 796 pr->ordered_bumped = 0; 797 798 pr->u.p.ordered_lower = 1; 799 pr->u.p.ordered_upper = 0; 800 801 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >; 802 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >; 803 } 804 } 805 806 if ( __kmp_env_consistency_check ) { 807 enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo; 808 if ( push_ws ) { 809 __kmp_push_workshare( gtid, ws, loc ); 810 pr->pushed_ws = ws; 811 } else { 812 __kmp_check_workshare( gtid, ws, loc ); 813 pr->pushed_ws = ct_none; 814 } 815 } 816 817 switch ( schedule ) { 818 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 819 case kmp_sch_static_steal: 820 { 821 T nproc = team->t.t_nproc; 822 T ntc, init; 823 824 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) ); 825 826 ntc = (tc % chunk ? 1 : 0) + tc / chunk; 827 if ( nproc > 1 && ntc >= nproc ) { 828 T id = __kmp_tid_from_gtid(gtid); 829 T small_chunk, extras; 830 831 small_chunk = ntc / nproc; 832 extras = ntc % nproc; 833 834 init = id * small_chunk + ( id < extras ? id : extras ); 835 pr->u.p.count = init; 836 pr->u.p.ub = init + small_chunk + ( id < extras ? 
1 : 0 ); 837 838 pr->u.p.parm2 = lb; 839 //pr->pfields.parm3 = 0; // it's not used in static_steal 840 pr->u.p.parm4 = id; 841 pr->u.p.st = st; 842 break; 843 } else { 844 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n", 845 gtid ) ); 846 schedule = kmp_sch_static_balanced; 847 /* too few iterations: fall-through to kmp_sch_static_balanced */ 848 } // if 849 /* FALL-THROUGH to static balanced */ 850 } // case 851 #endif 852 case kmp_sch_static_balanced: 853 { 854 T nproc = team->t.t_nproc; 855 T init, limit; 856 857 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n", 858 gtid ) ); 859 860 if ( nproc > 1 ) { 861 T id = __kmp_tid_from_gtid(gtid); 862 863 if ( tc < nproc ) { 864 if ( id < tc ) { 865 init = id; 866 limit = id; 867 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ 868 } else { 869 pr->u.p.count = 1; /* means no more chunks to execute */ 870 pr->u.p.parm1 = FALSE; 871 break; 872 } 873 } else { 874 T small_chunk = tc / nproc; 875 T extras = tc % nproc; 876 init = id * small_chunk + (id < extras ? id : extras); 877 limit = init + small_chunk - (id < extras ? 0 : 1); 878 pr->u.p.parm1 = (id == nproc - 1); 879 } 880 } else { 881 if ( tc > 0 ) { 882 init = 0; 883 limit = tc - 1; 884 pr->u.p.parm1 = TRUE; 885 } else { 886 // zero trip count 887 pr->u.p.count = 1; /* means no more chunks to execute */ 888 pr->u.p.parm1 = FALSE; 889 break; 890 } 891 } 892 #if USE_ITT_BUILD 893 // Calculate chunk for metadata report 894 if ( itt_need_metadata_reporting ) 895 cur_chunk = limit - init + 1; 896 #endif 897 if ( st == 1 ) { 898 pr->u.p.lb = lb + init; 899 pr->u.p.ub = lb + limit; 900 } else { 901 T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound 902 pr->u.p.lb = lb + init * st; 903 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly 904 if ( st > 0 ) { 905 pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp ); 906 } else { 907 pr->u.p.ub = ( ub_tmp + st < ub ? 
ub : ub_tmp ); 908 } 909 } 910 if ( pr->ordered ) { 911 pr->u.p.ordered_lower = init; 912 pr->u.p.ordered_upper = limit; 913 } 914 break; 915 } // case 916 case kmp_sch_guided_iterative_chunked : 917 { 918 T nproc = team->t.t_nproc; 919 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid)); 920 921 if ( nproc > 1 ) { 922 if ( (2L * chunk + 1 ) * nproc >= tc ) { 923 /* chunk size too large, switch to dynamic */ 924 schedule = kmp_sch_dynamic_chunked; 925 } else { 926 // when remaining iters become less than parm2 - switch to dynamic 927 pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 ); 928 *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4 929 } 930 } else { 931 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid)); 932 schedule = kmp_sch_static_greedy; 933 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 934 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid)); 935 pr->u.p.parm1 = tc; 936 } // if 937 } // case 938 break; 939 case kmp_sch_guided_analytical_chunked: 940 { 941 T nproc = team->t.t_nproc; 942 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid)); 943 944 if ( nproc > 1 ) { 945 if ( (2L * chunk + 1 ) * nproc >= tc ) { 946 /* chunk size too large, switch to dynamic */ 947 schedule = kmp_sch_dynamic_chunked; 948 } else { 949 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 950 DBL x; 951 952 #if KMP_OS_WINDOWS && KMP_ARCH_X86 953 /* Linux* OS already has 64-bit computation by default for 954 long double, and on Windows* OS on Intel(R) 64, 955 /Qlong_double doesn't work. On Windows* OS 956 on IA-32 architecture, we need to set precision to 957 64-bit instead of the default 53-bit. Even though long 958 double doesn't work on Windows* OS on Intel(R) 64, the 959 resulting lack of precision is not expected to impact 960 the correctness of the algorithm, but this has not been 961 mathematically proven. 
962 */ 963 // save original FPCW and set precision to 64-bit, as 964 // Windows* OS on IA-32 architecture defaults to 53-bit 965 unsigned int oldFpcw = _control87(0,0); 966 _control87(_PC_64,_MCW_PC); // 0,0x30000 967 #endif 968 /* value used for comparison in solver for cross-over point */ 969 long double target = ((long double)chunk * 2 + 1) * nproc / tc; 970 971 /* crossover point--chunk indexes equal to or greater than 972 this point switch to dynamic-style scheduling */ 973 UT cross; 974 975 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 976 x = (long double)1.0 - (long double)0.5 / nproc; 977 978 #ifdef KMP_DEBUG 979 { // test natural alignment 980 struct _test_a { 981 char a; 982 union { 983 char b; 984 DBL d; 985 }; 986 } t; 987 ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; 988 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment ); 989 KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 ); 990 } 991 #endif // KMP_DEBUG 992 993 /* save the term in thread private dispatch structure */ 994 *(DBL*)&pr->u.p.parm3 = x; 995 996 /* solve for the crossover point to the nearest integer i for which C_i <= chunk */ 997 { 998 UT left, right, mid; 999 long double p; 1000 1001 /* estimate initial upper and lower bound */ 1002 1003 /* doesn't matter what value right is as long as it is positive, but 1004 it affects performance of the solver 1005 */ 1006 right = 229; 1007 p = __kmp_pow< UT >(x,right); 1008 if ( p > target ) { 1009 do{ 1010 p *= p; 1011 right <<= 1; 1012 } while(p>target && right < (1<<27)); 1013 left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */ 1014 } else { 1015 left = 0; 1016 } 1017 1018 /* bisection root-finding method */ 1019 while ( left + 1 < right ) { 1020 mid = (left + right) / 2; 1021 if ( __kmp_pow< UT >(x,mid) > target ) { 1022 left = mid; 1023 } else { 1024 right = mid; 1025 } 1026 } // while 1027 cross = right; 1028 } 1029 /* assert sanity of computed crossover point */ 1030 KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target); 1031 1032 /* save the crossover point in thread private dispatch structure */ 1033 pr->u.p.parm2 = cross; 1034 1035 // C75803 1036 #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) ) 1037 #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3) 1038 #else 1039 #define GUIDED_ANALYTICAL_WORKAROUND (x) 1040 #endif 1041 /* dynamic-style scheduling offset */ 1042 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk; 1043 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1044 // restore FPCW 1045 _control87(oldFpcw,_MCW_PC); 1046 #endif 1047 } // if 1048 } else { 1049 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n", 1050 gtid ) ); 1051 schedule = kmp_sch_static_greedy; 1052 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 1053 pr->u.p.parm1 = tc; 1054 } // if 1055 } // case 1056 break; 1057 case kmp_sch_static_greedy: 1058 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid)); 1059 pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ? 
1060 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc : 1061 tc; 1062 break; 1063 case kmp_sch_static_chunked : 1064 case kmp_sch_dynamic_chunked : 1065 if ( pr->u.p.parm1 <= 0 ) { 1066 pr->u.p.parm1 = KMP_DEFAULT_CHUNK; 1067 } 1068 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid)); 1069 break; 1070 case kmp_sch_trapezoidal : 1071 { 1072 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ 1073 1074 T parm1, parm2, parm3, parm4; 1075 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) ); 1076 1077 parm1 = chunk; 1078 1079 /* F : size of the first cycle */ 1080 parm2 = ( tc / (2 * team->t.t_nproc) ); 1081 1082 if ( parm2 < 1 ) { 1083 parm2 = 1; 1084 } 1085 1086 /* L : size of the last cycle. Make sure the last cycle 1087 * is not larger than the first cycle. 1088 */ 1089 if ( parm1 < 1 ) { 1090 parm1 = 1; 1091 } else if ( parm1 > parm2 ) { 1092 parm1 = parm2; 1093 } 1094 1095 /* N : number of cycles */ 1096 parm3 = ( parm2 + parm1 ); 1097 parm3 = ( 2 * tc + parm3 - 1) / parm3; 1098 1099 if ( parm3 < 2 ) { 1100 parm3 = 2; 1101 } 1102 1103 /* sigma : decreasing incr of the trapezoid */ 1104 parm4 = ( parm3 - 1 ); 1105 parm4 = ( parm2 - parm1 ) / parm4; 1106 1107 // pointless check, because parm4 >= 0 always 1108 //if ( parm4 < 0 ) { 1109 // parm4 = 0; 1110 //} 1111 1112 pr->u.p.parm1 = parm1; 1113 pr->u.p.parm2 = parm2; 1114 pr->u.p.parm3 = parm3; 1115 pr->u.p.parm4 = parm4; 1116 } // case 1117 break; 1118 1119 default: 1120 { 1121 __kmp_msg( 1122 kmp_ms_fatal, // Severity 1123 KMP_MSG( UnknownSchedTypeDetected ), // Primary message 1124 KMP_HNT( GetNewerLibrary ), // Hint 1125 __kmp_msg_null // Variadic argument list terminator 1126 ); 1127 } 1128 break; 1129 } // switch 1130 pr->schedule = schedule; 1131 if ( active ) { 1132 /* The name of this buffer should be my_buffer_index when it's free to use it */ 1133 1134 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n", 1135 gtid, my_buffer_index, sh->buffer_index) ); 1136 __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 > 1137 USE_ITT_BUILD_ARG( NULL ) 1138 ); 1139 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are 1140 // *always* 32-bit integers. 1141 KMP_MB(); /* is this necessary? */ 1142 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n", 1143 gtid, my_buffer_index, sh->buffer_index) ); 1144 1145 th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr; 1146 th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh; 1147 #if USE_ITT_BUILD 1148 if ( pr->ordered ) { 1149 __kmp_itt_ordered_init( gtid ); 1150 }; // if 1151 // Report loop metadata 1152 if ( itt_need_metadata_reporting ) { 1153 // Only report metadata by master of active team at level 1 1154 kmp_uint64 schedtype = 0; 1155 switch ( schedule ) { 1156 case kmp_sch_static_chunked: 1157 case kmp_sch_static_balanced:// Chunk is calculated in the switch above 1158 break; 1159 case kmp_sch_static_greedy: 1160 cur_chunk = pr->u.p.parm1; 1161 break; 1162 case kmp_sch_dynamic_chunked: 1163 schedtype = 1; 1164 break; 1165 case kmp_sch_guided_iterative_chunked: 1166 case kmp_sch_guided_analytical_chunked: 1167 schedtype = 2; 1168 break; 1169 default: 1170 // Should we put this case under "static"? 
1171 // case kmp_sch_static_steal: 1172 schedtype = 3; 1173 break; 1174 } 1175 __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk); 1176 } 1177 #endif /* USE_ITT_BUILD */ 1178 }; // if 1179 1180 #ifdef KMP_DEBUG 1181 { 1182 const char * buff; 1183 // create format specifiers before the debug output 1184 buff = __kmp_str_format( 1185 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \ 1186 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \ 1187 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n", 1188 traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec, 1189 traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec, 1190 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec, 1191 traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec ); 1192 KD_TRACE(10, ( buff, 1193 gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub, 1194 pr->u.p.st, pr->u.p.tc, pr->u.p.count, 1195 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1, 1196 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) ); 1197 __kmp_str_free( &buff ); 1198 } 1199 #endif 1200 #if ( KMP_STATIC_STEAL_ENABLED ) 1201 if ( ___kmp_size_type < 8 ) { 1202 // It cannot be guaranteed that after execution of a loop with some other schedule kind 1203 // all the parm3 variables will contain the same value. 1204 // Even if all parm3 will be the same, it still exists a bad case like using 0 and 1 1205 // rather than program life-time increment. 1206 // So the dedicated variable is required. The 'static_steal_counter' is used. 1207 if( schedule == kmp_sch_static_steal ) { 1208 // Other threads will inspect this variable when searching for a victim. 1209 // This is a flag showing that other threads may steal from this thread since then. 1210 volatile T * p = &pr->u.p.static_steal_counter; 1211 *p = *p + 1; 1212 } 1213 } 1214 #endif // ( KMP_STATIC_STEAL_ENABLED && USE_STEALING ) 1215 1216 #if OMPT_SUPPORT && OMPT_TRACE 1217 if (ompt_enabled && 1218 ompt_callbacks.ompt_callback(ompt_event_loop_begin)) { 1219 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 1220 ompt_task_info_t *task_info = __ompt_get_taskinfo(0); 1221 ompt_callbacks.ompt_callback(ompt_event_loop_begin)( 1222 team_info->parallel_id, task_info->task_id, team_info->microtask); 1223 } 1224 #endif 1225 } 1226 1227 /* 1228 * For ordered loops, either __kmp_dispatch_finish() should be called after 1229 * every iteration, or __kmp_dispatch_finish_chunk() should be called after 1230 * every chunk of iterations. If the ordered section(s) were not executed 1231 * for this iteration (or every iteration in this chunk), we need to set the 1232 * ordered iteration counters so that the next thread can proceed. 1233 */ 1234 template< typename UT > 1235 static void 1236 __kmp_dispatch_finish( int gtid, ident_t *loc ) 1237 { 1238 typedef typename traits_t< UT >::signed_t ST; 1239 kmp_info_t *th = __kmp_threads[ gtid ]; 1240 1241 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) ); 1242 if ( ! 
th -> th.th_team -> t.t_serialized ) { 1243 1244 dispatch_private_info_template< UT > * pr = 1245 reinterpret_cast< dispatch_private_info_template< UT >* > 1246 ( th->th.th_dispatch->th_dispatch_pr_current ); 1247 dispatch_shared_info_template< UT > volatile * sh = 1248 reinterpret_cast< dispatch_shared_info_template< UT >volatile* > 1249 ( th->th.th_dispatch->th_dispatch_sh_current ); 1250 KMP_DEBUG_ASSERT( pr ); 1251 KMP_DEBUG_ASSERT( sh ); 1252 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1253 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1254 1255 if ( pr->ordered_bumped ) { 1256 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1257 gtid ) ); 1258 pr->ordered_bumped = 0; 1259 } else { 1260 UT lower = pr->u.p.ordered_lower; 1261 1262 #ifdef KMP_DEBUG 1263 { 1264 const char * buff; 1265 // create format specifiers before the debug output 1266 buff = __kmp_str_format( 1267 "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n", 1268 traits_t< UT >::spec, traits_t< UT >::spec ); 1269 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 1270 __kmp_str_free( &buff ); 1271 } 1272 #endif 1273 1274 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 1275 USE_ITT_BUILD_ARG(NULL) 1276 ); 1277 KMP_MB(); /* is this necessary? */ 1278 #ifdef KMP_DEBUG 1279 { 1280 const char * buff; 1281 // create format specifiers before the debug output 1282 buff = __kmp_str_format( 1283 "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n", 1284 traits_t< UT >::spec, traits_t< UT >::spec ); 1285 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 1286 __kmp_str_free( &buff ); 1287 } 1288 #endif 1289 1290 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration ); 1291 } // if 1292 } // if 1293 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) ); 1294 } 1295 1296 #ifdef KMP_GOMP_COMPAT 1297 1298 template< typename UT > 1299 static void 1300 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc ) 1301 { 1302 typedef typename traits_t< UT >::signed_t ST; 1303 kmp_info_t *th = __kmp_threads[ gtid ]; 1304 1305 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) ); 1306 if ( ! 
th -> th.th_team -> t.t_serialized ) { 1307 // int cid; 1308 dispatch_private_info_template< UT > * pr = 1309 reinterpret_cast< dispatch_private_info_template< UT >* > 1310 ( th->th.th_dispatch->th_dispatch_pr_current ); 1311 dispatch_shared_info_template< UT > volatile * sh = 1312 reinterpret_cast< dispatch_shared_info_template< UT >volatile* > 1313 ( th->th.th_dispatch->th_dispatch_sh_current ); 1314 KMP_DEBUG_ASSERT( pr ); 1315 KMP_DEBUG_ASSERT( sh ); 1316 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1317 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1318 1319 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) { 1320 UT lower = pr->u.p.ordered_lower; 1321 UT upper = pr->u.p.ordered_upper; 1322 UT inc = upper - lower + 1; 1323 1324 if ( pr->ordered_bumped == inc ) { 1325 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1326 gtid ) ); 1327 pr->ordered_bumped = 0; 1328 } else { 1329 inc -= pr->ordered_bumped; 1330 1331 #ifdef KMP_DEBUG 1332 { 1333 const char * buff; 1334 // create format specifiers before the debug output 1335 buff = __kmp_str_format( 1336 "__kmp_dispatch_finish_chunk: T#%%d before wait: " \ 1337 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n", 1338 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec ); 1339 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) ); 1340 __kmp_str_free( &buff ); 1341 } 1342 #endif 1343 1344 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 1345 USE_ITT_BUILD_ARG(NULL) 1346 ); 1347 1348 KMP_MB(); /* is this necessary? */ 1349 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n", 1350 gtid ) ); 1351 pr->ordered_bumped = 0; 1352 //!!!!! TODO check if the inc should be unsigned, or signed??? 1353 #ifdef KMP_DEBUG 1354 { 1355 const char * buff; 1356 // create format specifiers before the debug output 1357 buff = __kmp_str_format( 1358 "__kmp_dispatch_finish_chunk: T#%%d after wait: " \ 1359 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n", 1360 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec ); 1361 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) ); 1362 __kmp_str_free( &buff ); 1363 } 1364 #endif 1365 1366 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc); 1367 } 1368 // } 1369 } 1370 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) ); 1371 } 1372 1373 #endif /* KMP_GOMP_COMPAT */ 1374 1375 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 1376 * (no more work), then tell OMPT the loop is over. In some cases 1377 * kmp_dispatch_fini() is not called. 
*/
#if OMPT_SUPPORT && OMPT_TRACE
#define OMPT_LOOP_END \
    if (status == 0) { \
        if (ompt_enabled && \
            ompt_callbacks.ompt_callback(ompt_event_loop_end)) { \
            ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
            ompt_task_info_t *task_info = __ompt_get_taskinfo(0); \
            ompt_callbacks.ompt_callback(ompt_event_loop_end)( \
                team_info->parallel_id, task_info->task_id); \
        } \
    }
#else
#define OMPT_LOOP_END // no-op
#endif

template< typename T >
static int
__kmp_dispatch_next(
    ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
) {

    typedef typename traits_t< T >::unsigned_t  UT;
    typedef typename traits_t< T >::signed_t    ST;
    typedef typename traits_t< T >::floating_t  DBL;
#if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
    static const int ___kmp_size_type = sizeof( UT );
#endif

    // This is potentially slightly misleading: schedule(runtime) will appear here even if the actual runtime schedule
    // is static. (Which points out a disadvantage of schedule(runtime): even when static scheduling is used it costs
    // more than a compile time choice to use static scheduling would.)
    KMP_TIME_BLOCK(FOR_dynamic_scheduling);

    int status;
    dispatch_private_info_template< T > * pr;
    kmp_info_t * th   = __kmp_threads[ gtid ];
    kmp_team_t * team = th -> th.th_team;

    KMP_DEBUG_ASSERT( p_lb && p_ub && p_st ); // AC: these cannot be NULL
    #ifdef KMP_DEBUG
    {
        const char * buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
            traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
        KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ?
*p_st : 0, p_last ) ); 1425 __kmp_str_free( &buff ); 1426 } 1427 #endif 1428 1429 if ( team -> t.t_serialized ) { 1430 /* NOTE: serialize this dispatch becase we are not at the active level */ 1431 pr = reinterpret_cast< dispatch_private_info_template< T >* > 1432 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */ 1433 KMP_DEBUG_ASSERT( pr ); 1434 1435 if ( (status = (pr->u.p.tc != 0)) == 0 ) { 1436 *p_lb = 0; 1437 *p_ub = 0; 1438 // if ( p_last != NULL ) 1439 // *p_last = 0; 1440 if ( p_st != NULL ) 1441 *p_st = 0; 1442 if ( __kmp_env_consistency_check ) { 1443 if ( pr->pushed_ws != ct_none ) { 1444 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 1445 } 1446 } 1447 } else if ( pr->nomerge ) { 1448 kmp_int32 last; 1449 T start; 1450 UT limit, trip, init; 1451 ST incr; 1452 T chunk = pr->u.p.parm1; 1453 1454 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) ); 1455 1456 init = chunk * pr->u.p.count++; 1457 trip = pr->u.p.tc - 1; 1458 1459 if ( (status = (init <= trip)) == 0 ) { 1460 *p_lb = 0; 1461 *p_ub = 0; 1462 // if ( p_last != NULL ) 1463 // *p_last = 0; 1464 if ( p_st != NULL ) 1465 *p_st = 0; 1466 if ( __kmp_env_consistency_check ) { 1467 if ( pr->pushed_ws != ct_none ) { 1468 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 1469 } 1470 } 1471 } else { 1472 start = pr->u.p.lb; 1473 limit = chunk + init - 1; 1474 incr = pr->u.p.st; 1475 1476 if ( (last = (limit >= trip)) != 0 ) { 1477 limit = trip; 1478 #if KMP_OS_WINDOWS 1479 pr->u.p.last_upper = pr->u.p.ub; 1480 #endif /* KMP_OS_WINDOWS */ 1481 } 1482 if ( p_last != NULL ) 1483 *p_last = last; 1484 if ( p_st != NULL ) 1485 *p_st = incr; 1486 if ( incr == 1 ) { 1487 *p_lb = start + init; 1488 *p_ub = start + limit; 1489 } else { 1490 *p_lb = start + init * incr; 1491 *p_ub = start + limit * incr; 1492 } 1493 1494 if ( pr->ordered ) { 1495 pr->u.p.ordered_lower = init; 1496 pr->u.p.ordered_upper = limit; 1497 #ifdef KMP_DEBUG 1498 { 1499 const char * buff; 1500 // create format specifiers before the debug output 1501 buff = __kmp_str_format( 1502 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1503 traits_t< UT >::spec, traits_t< UT >::spec ); 1504 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1505 __kmp_str_free( &buff ); 1506 } 1507 #endif 1508 } // if 1509 } // if 1510 } else { 1511 pr->u.p.tc = 0; 1512 *p_lb = pr->u.p.lb; 1513 *p_ub = pr->u.p.ub; 1514 #if KMP_OS_WINDOWS 1515 pr->u.p.last_upper = *p_ub; 1516 #endif /* KMP_OS_WINDOWS */ 1517 if ( p_last != NULL ) 1518 *p_last = TRUE; 1519 if ( p_st != NULL ) 1520 *p_st = pr->u.p.st; 1521 } // if 1522 #ifdef KMP_DEBUG 1523 { 1524 const char * buff; 1525 // create format specifiers before the debug output 1526 buff = __kmp_str_format( 1527 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \ 1528 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", 1529 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 1530 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) ); 1531 __kmp_str_free( &buff ); 1532 } 1533 #endif 1534 #if INCLUDE_SSC_MARKS 1535 SSC_MARK_DISPATCH_NEXT(); 1536 #endif 1537 OMPT_LOOP_END; 1538 return status; 1539 } else { 1540 kmp_int32 last = 0; 1541 dispatch_shared_info_template< UT > *sh; 1542 T start; 1543 ST incr; 1544 UT limit, trip, init; 1545 1546 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1547 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1548 1549 pr = reinterpret_cast< 
dispatch_private_info_template< T >* > 1550 ( th->th.th_dispatch->th_dispatch_pr_current ); 1551 KMP_DEBUG_ASSERT( pr ); 1552 sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 1553 ( th->th.th_dispatch->th_dispatch_sh_current ); 1554 KMP_DEBUG_ASSERT( sh ); 1555 1556 if ( pr->u.p.tc == 0 ) { 1557 // zero trip count 1558 status = 0; 1559 } else { 1560 switch (pr->schedule) { 1561 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 1562 case kmp_sch_static_steal: 1563 { 1564 T chunk = pr->u.p.parm1; 1565 1566 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) ); 1567 1568 trip = pr->u.p.tc - 1; 1569 1570 if ( ___kmp_size_type > 4 ) { 1571 // Other threads do not look into the data of this thread, 1572 // so it's not necessary to make volatile casting. 1573 init = ( pr->u.p.count )++; 1574 status = ( init < (UT)pr->u.p.ub ); 1575 } else { 1576 typedef union { 1577 struct { 1578 UT count; 1579 T ub; 1580 } p; 1581 kmp_int64 b; 1582 } union_i4; 1583 // All operations on 'count' or 'ub' must be combined atomically together. 1584 // stealing implemented only for 4-byte indexes 1585 { 1586 union_i4 vold, vnew; 1587 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count); 1588 vnew = vold; 1589 vnew.p.count++; 1590 while( ! KMP_COMPARE_AND_STORE_ACQ64( 1591 ( volatile kmp_int64* )&pr->u.p.count, 1592 *VOLATILE_CAST(kmp_int64 *)&vold.b, 1593 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) { 1594 KMP_CPU_PAUSE(); 1595 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count); 1596 vnew = vold; 1597 vnew.p.count++; 1598 } 1599 vnew = vold; 1600 init = vnew.p.count; 1601 status = ( init < (UT)vnew.p.ub ) ; 1602 } 1603 1604 if( !status ) { 1605 kmp_info_t **other_threads = team->t.t_threads; 1606 int while_limit = 10; 1607 int while_index = 0; 1608 1609 // TODO: algorithm of searching for a victim 1610 // should be cleaned up and measured 1611 while ( ( !status ) && ( while_limit != ++while_index ) ) { 1612 union_i4 vold, vnew; 1613 kmp_int32 remaining; // kmp_int32 because KMP_I4 only 1614 T victimIdx = pr->u.p.parm4; 1615 T oldVictimIdx = victimIdx; 1616 dispatch_private_info_template< T > * victim; 1617 1618 do { 1619 if( !victimIdx ) { 1620 victimIdx = team->t.t_nproc - 1; 1621 } else { 1622 --victimIdx; 1623 } 1624 victim = reinterpret_cast< dispatch_private_info_template< T >* > 1625 ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current ); 1626 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx ); 1627 // TODO: think about a proper place of this test 1628 if ( ( !victim ) || 1629 ( (*( volatile T * )&victim->u.p.static_steal_counter) != 1630 (*( volatile T * )&pr->u.p.static_steal_counter) ) ) { 1631 // TODO: delay would be nice 1632 continue; 1633 // the victim is not ready yet to participate in stealing 1634 // because the victim is still in kmp_init_dispatch 1635 } 1636 if ( oldVictimIdx == victimIdx ) { 1637 break; 1638 } 1639 pr->u.p.parm4 = victimIdx; 1640 1641 while( 1 ) { 1642 vold.b = *( volatile kmp_int64 * )( &victim->u.p.count ); 1643 vnew = vold; 1644 1645 KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip ); 1646 if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) { 1647 break; 1648 } 1649 vnew.p.ub -= (remaining >> 2); 1650 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1651 #pragma warning( push ) 1652 // disable warning on pointless comparison of unsigned with 0 1653 #pragma warning( disable: 186 ) 1654 KMP_DEBUG_ASSERT(vnew.p.ub >= 0); 1655 #pragma warning( pop ) 1656 // TODO: Should 
this be acquire or release? 1657 if ( KMP_COMPARE_AND_STORE_ACQ64( 1658 ( volatile kmp_int64 * )&victim->u.p.count, 1659 *VOLATILE_CAST(kmp_int64 *)&vold.b, 1660 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) { 1661 status = 1; 1662 while_index = 0; 1663 // now update own count and ub 1664 #if KMP_ARCH_X86 1665 // stealing executed on non-KMP_ARCH_X86 only 1666 // Atomic 64-bit write on ia32 is 1667 // unavailable, so we do this in steps. 1668 // This code is not tested. 1669 init = vold.p.count; 1670 pr->u.p.ub = 0; 1671 pr->u.p.count = init + 1; 1672 pr->u.p.ub = vnew.p.count; 1673 #else 1674 init = vnew.p.ub; 1675 vold.p.count = init + 1; 1676 // TODO: is it safe and enough? 1677 *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b; 1678 #endif // KMP_ARCH_X86 1679 break; 1680 } // if 1681 KMP_CPU_PAUSE(); 1682 } // while (1) 1683 } // while 1684 } // if 1685 } // if 1686 if ( !status ) { 1687 *p_lb = 0; 1688 *p_ub = 0; 1689 if ( p_st != NULL ) *p_st = 0; 1690 } else { 1691 start = pr->u.p.parm2; 1692 init *= chunk; 1693 limit = chunk + init - 1; 1694 incr = pr->u.p.st; 1695 1696 KMP_DEBUG_ASSERT(init <= trip); 1697 if ( (last = (limit >= trip)) != 0 ) 1698 limit = trip; 1699 if ( p_st != NULL ) *p_st = incr; 1700 1701 if ( incr == 1 ) { 1702 *p_lb = start + init; 1703 *p_ub = start + limit; 1704 } else { 1705 *p_lb = start + init * incr; 1706 *p_ub = start + limit * incr; 1707 } 1708 1709 if ( pr->ordered ) { 1710 pr->u.p.ordered_lower = init; 1711 pr->u.p.ordered_upper = limit; 1712 #ifdef KMP_DEBUG 1713 { 1714 const char * buff; 1715 // create format specifiers before the debug output 1716 buff = __kmp_str_format( 1717 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1718 traits_t< UT >::spec, traits_t< UT >::spec ); 1719 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1720 __kmp_str_free( &buff ); 1721 } 1722 #endif 1723 } // if 1724 } // if 1725 break; 1726 } // case 1727 #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 1728 case kmp_sch_static_balanced: 1729 { 1730 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) ); 1731 if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */ 1732 pr->u.p.count = 1; 1733 *p_lb = pr->u.p.lb; 1734 *p_ub = pr->u.p.ub; 1735 last = pr->u.p.parm1; 1736 if ( p_st != NULL ) 1737 *p_st = pr->u.p.st; 1738 } else { /* no iterations to do */ 1739 pr->u.p.lb = pr->u.p.ub + pr->u.p.st; 1740 } 1741 if ( pr->ordered ) { 1742 #ifdef KMP_DEBUG 1743 { 1744 const char * buff; 1745 // create format specifiers before the debug output 1746 buff = __kmp_str_format( 1747 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1748 traits_t< UT >::spec, traits_t< UT >::spec ); 1749 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1750 __kmp_str_free( &buff ); 1751 } 1752 #endif 1753 } // if 1754 } // case 1755 break; 1756 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */ 1757 case kmp_sch_static_chunked: 1758 { 1759 T parm1; 1760 1761 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n", 1762 gtid ) ); 1763 parm1 = pr->u.p.parm1; 1764 1765 trip = pr->u.p.tc - 1; 1766 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid)); 1767 1768 if ( (status = (init <= trip)) != 0 ) { 1769 start = pr->u.p.lb; 1770 incr = pr->u.p.st; 1771 limit = parm1 + init - 1; 1772 1773 if ( (last = (limit >= trip)) != 0 ) 1774 limit = trip; 1775 1776 if ( p_st != NULL ) 
*p_st = incr; 1777 1778 pr->u.p.count += team->t.t_nproc; 1779 1780 if ( incr == 1 ) { 1781 *p_lb = start + init; 1782 *p_ub = start + limit; 1783 } 1784 else { 1785 *p_lb = start + init * incr; 1786 *p_ub = start + limit * incr; 1787 } 1788 1789 if ( pr->ordered ) { 1790 pr->u.p.ordered_lower = init; 1791 pr->u.p.ordered_upper = limit; 1792 #ifdef KMP_DEBUG 1793 { 1794 const char * buff; 1795 // create format specifiers before the debug output 1796 buff = __kmp_str_format( 1797 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1798 traits_t< UT >::spec, traits_t< UT >::spec ); 1799 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1800 __kmp_str_free( &buff ); 1801 } 1802 #endif 1803 } // if 1804 } // if 1805 } // case 1806 break; 1807 1808 case kmp_sch_dynamic_chunked: 1809 { 1810 T chunk = pr->u.p.parm1; 1811 1812 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", 1813 gtid ) ); 1814 1815 init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration ); 1816 trip = pr->u.p.tc - 1; 1817 1818 if ( (status = (init <= trip)) == 0 ) { 1819 *p_lb = 0; 1820 *p_ub = 0; 1821 if ( p_st != NULL ) *p_st = 0; 1822 } else { 1823 start = pr->u.p.lb; 1824 limit = chunk + init - 1; 1825 incr = pr->u.p.st; 1826 1827 if ( (last = (limit >= trip)) != 0 ) 1828 limit = trip; 1829 1830 if ( p_st != NULL ) *p_st = incr; 1831 1832 if ( incr == 1 ) { 1833 *p_lb = start + init; 1834 *p_ub = start + limit; 1835 } else { 1836 *p_lb = start + init * incr; 1837 *p_ub = start + limit * incr; 1838 } 1839 1840 if ( pr->ordered ) { 1841 pr->u.p.ordered_lower = init; 1842 pr->u.p.ordered_upper = limit; 1843 #ifdef KMP_DEBUG 1844 { 1845 const char * buff; 1846 // create format specifiers before the debug output 1847 buff = __kmp_str_format( 1848 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1849 traits_t< UT >::spec, traits_t< UT >::spec ); 1850 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1851 __kmp_str_free( &buff ); 1852 } 1853 #endif 1854 } // if 1855 } // if 1856 } // case 1857 break; 1858 1859 case kmp_sch_guided_iterative_chunked: 1860 { 1861 T chunkspec = pr->u.p.parm1; 1862 KD_TRACE(100, 1863 ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid)); 1864 trip = pr->u.p.tc; 1865 // Start atomic part of calculations 1866 while(1) { 1867 ST remaining; // signed, because it can be < 0 1868 init = sh->u.s.iteration; // shared value 1869 remaining = trip - init; 1870 if ( remaining <= 0 ) { // AC: need to compare with 0 first 1871 // nothing to do, don't try atomic op 1872 status = 0; 1873 break; 1874 } 1875 if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default 1876 // use dynamic-style schedule 1877 // atomically increment iterations, get old value 1878 init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec ); 1879 remaining = trip - init; 1880 if (remaining <= 0) { 1881 status = 0; // all iterations were taken by other threads 1882 } else { 1883 // got some iterations to work on 1884 status = 1; 1885 if ( (T)remaining > chunkspec ) { 1886 limit = init + chunkspec - 1; 1887 } else { 1888 last = 1; // the last chunk 1889 limit = init + remaining - 1; 1890 } // if 1891 } // if 1892 break; 1893 } // if 1894 limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc 1895 if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) { 1896 // CAS was successful, chunk obtained 1897 status = 1;
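                    // On this CAS path 'limit' is the value just installed into the shared
                    // iteration counter, i.e. one past the last iteration of the chunk this
                    // thread claimed ([init, limit-1]); the decrement that follows turns it
                    // into the inclusive upper bound. The chunk size, remaining * parm3, uses
                    // the guided coefficient stored as a double in parm3 at init time (per the
                    // comments above, roughly remaining divided by K*nproc with K=2 by default).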
1898 --limit; 1899 break; 1900 } // if 1901 } // while 1902 if ( status != 0 ) { 1903 start = pr->u.p.lb; 1904 incr = pr->u.p.st; 1905 if ( p_st != NULL ) 1906 *p_st = incr; 1907 *p_lb = start + init * incr; 1908 *p_ub = start + limit * incr; 1909 if ( pr->ordered ) { 1910 pr->u.p.ordered_lower = init; 1911 pr->u.p.ordered_upper = limit; 1912 #ifdef KMP_DEBUG 1913 { 1914 const char * buff; 1915 // create format specifiers before the debug output 1916 buff = __kmp_str_format( 1917 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1918 traits_t< UT >::spec, traits_t< UT >::spec ); 1919 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1920 __kmp_str_free( &buff ); 1921 } 1922 #endif 1923 } // if 1924 } else { 1925 *p_lb = 0; 1926 *p_ub = 0; 1927 if ( p_st != NULL ) 1928 *p_st = 0; 1929 } // if 1930 } // case 1931 break; 1932 1933 case kmp_sch_guided_analytical_chunked: 1934 { 1935 T chunkspec = pr->u.p.parm1; 1936 UT chunkIdx; 1937 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1938 /* for storing original FPCW value for Windows* OS on 1939 IA-32 architecture 8-byte version */ 1940 unsigned int oldFpcw; 1941 unsigned int fpcwSet = 0; 1942 #endif 1943 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n", 1944 gtid ) ); 1945 1946 trip = pr->u.p.tc; 1947 1948 KMP_DEBUG_ASSERT(team->t.t_nproc > 1); 1949 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip); 1950 1951 while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */ 1952 chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration ); 1953 if ( chunkIdx >= (UT)pr->u.p.parm2 ) { 1954 --trip; 1955 /* use dynamic-style scheduling */ 1956 init = chunkIdx * chunkspec + pr->u.p.count; 1957 /* need to verify init > 0 in case of overflow in the above calculation */ 1958 if ( (status = (init > 0 && init <= trip)) != 0 ) { 1959 limit = init + chunkspec -1; 1960 1961 if ( (last = (limit >= trip)) != 0 ) 1962 limit = trip; 1963 } 1964 break; 1965 } else { 1966 /* use exponential-style scheduling */ 1967 /* The following check is to workaround the lack of long double precision on Windows* OS. 1968 This check works around the possible effect that init != 0 for chunkIdx == 0. 
1969 */ 1970 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1971 /* If we haven't already done so, save original 1972 FPCW and set precision to 64-bit, as Windows* OS 1973 on IA-32 architecture defaults to 53-bit */ 1974 if ( !fpcwSet ) { 1975 oldFpcw = _control87(0,0); 1976 _control87(_PC_64,_MCW_PC); 1977 fpcwSet = 0x30000; 1978 } 1979 #endif 1980 if ( chunkIdx ) { 1981 init = __kmp_dispatch_guided_remaining< T >( 1982 trip, *( DBL * )&pr->u.p.parm3, chunkIdx ); 1983 KMP_DEBUG_ASSERT(init); 1984 init = trip - init; 1985 } else 1986 init = 0; 1987 limit = trip - __kmp_dispatch_guided_remaining< T >( 1988 trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 ); 1989 KMP_ASSERT(init <= limit); 1990 if ( init < limit ) { 1991 KMP_DEBUG_ASSERT(limit <= trip); 1992 --limit; 1993 status = 1; 1994 break; 1995 } // if 1996 } // if 1997 } // while (1) 1998 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1999 /* restore FPCW if necessary 2000 AC: check fpcwSet flag first because oldFpcw can be uninitialized here 2001 */ 2002 if ( fpcwSet && ( oldFpcw & fpcwSet ) ) 2003 _control87(oldFpcw,_MCW_PC); 2004 #endif 2005 if ( status != 0 ) { 2006 start = pr->u.p.lb; 2007 incr = pr->u.p.st; 2008 if ( p_st != NULL ) 2009 *p_st = incr; 2010 *p_lb = start + init * incr; 2011 *p_ub = start + limit * incr; 2012 if ( pr->ordered ) { 2013 pr->u.p.ordered_lower = init; 2014 pr->u.p.ordered_upper = limit; 2015 #ifdef KMP_DEBUG 2016 { 2017 const char * buff; 2018 // create format specifiers before the debug output 2019 buff = __kmp_str_format( 2020 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 2021 traits_t< UT >::spec, traits_t< UT >::spec ); 2022 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 2023 __kmp_str_free( &buff ); 2024 } 2025 #endif 2026 } 2027 } else { 2028 *p_lb = 0; 2029 *p_ub = 0; 2030 if ( p_st != NULL ) 2031 *p_st = 0; 2032 } 2033 } // case 2034 break; 2035 2036 case kmp_sch_trapezoidal: 2037 { 2038 UT index; 2039 T parm2 = pr->u.p.parm2; 2040 T parm3 = pr->u.p.parm3; 2041 T parm4 = pr->u.p.parm4; 2042 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n", 2043 gtid ) ); 2044 2045 index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration ); 2046 2047 init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2; 2048 trip = pr->u.p.tc - 1; 2049 2050 if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) { 2051 *p_lb = 0; 2052 *p_ub = 0; 2053 if ( p_st != NULL ) *p_st = 0; 2054 } else { 2055 start = pr->u.p.lb; 2056 limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1; 2057 incr = pr->u.p.st; 2058 2059 if ( (last = (limit >= trip)) != 0 ) 2060 limit = trip; 2061 2062 if ( p_st != NULL ) *p_st = incr; 2063 2064 if ( incr == 1 ) { 2065 *p_lb = start + init; 2066 *p_ub = start + limit; 2067 } else { 2068 *p_lb = start + init * incr; 2069 *p_ub = start + limit * incr; 2070 } 2071 2072 if ( pr->ordered ) { 2073 pr->u.p.ordered_lower = init; 2074 pr->u.p.ordered_upper = limit; 2075 #ifdef KMP_DEBUG 2076 { 2077 const char * buff; 2078 // create format specifiers before the debug output 2079 buff = __kmp_str_format( 2080 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 2081 traits_t< UT >::spec, traits_t< UT >::spec ); 2082 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 2083 __kmp_str_free( &buff ); 2084 } 2085 #endif 2086 } // if 2087 } // if 2088 } // case 2089 break; 2090 default: 2091 { 2092 status = 0; // to avoid complaints on uninitialized variable use 2093 __kmp_msg( 2094 kmp_ms_fatal, // Severity 2095 
KMP_MSG( UnknownSchedTypeDetected ), // Primary message 2096 KMP_HNT( GetNewerLibrary ), // Hint 2097 __kmp_msg_null // Variadic argument list terminator 2098 ); 2099 } 2100 break; 2101 } // switch 2102 } // if tc == 0; 2103 2104 if ( status == 0 ) { 2105 UT num_done; 2106 2107 num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done ); 2108 #ifdef KMP_DEBUG 2109 { 2110 const char * buff; 2111 // create format specifiers before the debug output 2112 buff = __kmp_str_format( 2113 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", 2114 traits_t< UT >::spec ); 2115 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) ); 2116 __kmp_str_free( &buff ); 2117 } 2118 #endif 2119 2120 if ( (ST)num_done == team->t.t_nproc-1 ) { 2121 /* NOTE: release this buffer to be reused */ 2122 2123 KMP_MB(); /* Flush all pending memory write invalidates. */ 2124 2125 sh->u.s.num_done = 0; 2126 sh->u.s.iteration = 0; 2127 2128 /* TODO replace with general release procedure? */ 2129 if ( pr->ordered ) { 2130 sh->u.s.ordered_iteration = 0; 2131 } 2132 2133 KMP_MB(); /* Flush all pending memory write invalidates. */ 2134 2135 sh -> buffer_index += KMP_MAX_DISP_BUF; 2136 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", 2137 gtid, sh->buffer_index) ); 2138 2139 KMP_MB(); /* Flush all pending memory write invalidates. */ 2140 2141 } // if 2142 if ( __kmp_env_consistency_check ) { 2143 if ( pr->pushed_ws != ct_none ) { 2144 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 2145 } 2146 } 2147 2148 th -> th.th_dispatch -> th_deo_fcn = NULL; 2149 th -> th.th_dispatch -> th_dxo_fcn = NULL; 2150 th -> th.th_dispatch -> th_dispatch_sh_current = NULL; 2151 th -> th.th_dispatch -> th_dispatch_pr_current = NULL; 2152 } // if (status == 0) 2153 #if KMP_OS_WINDOWS 2154 else if ( last ) { 2155 pr->u.p.last_upper = pr->u.p.ub; 2156 } 2157 #endif /* KMP_OS_WINDOWS */ 2158 if ( p_last != NULL && status != 0 ) 2159 *p_last = last; 2160 } // if 2161 2162 #ifdef KMP_DEBUG 2163 { 2164 const char * buff; 2165 // create format specifiers before the debug output 2166 buff = __kmp_str_format( 2167 "__kmp_dispatch_next: T#%%d normal case: " \ 2168 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n", 2169 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 2170 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? 
*p_st : 0, p_last, status ) ); 2171 __kmp_str_free( &buff ); 2172 } 2173 #endif 2174 #if INCLUDE_SSC_MARKS 2175 SSC_MARK_DISPATCH_NEXT(); 2176 #endif 2177 OMPT_LOOP_END; 2178 return status; 2179 } 2180 2181 template< typename T > 2182 static void 2183 __kmp_dist_get_bounds( 2184 ident_t *loc, 2185 kmp_int32 gtid, 2186 kmp_int32 *plastiter, 2187 T *plower, 2188 T *pupper, 2189 typename traits_t< T >::signed_t incr 2190 ) { 2191 typedef typename traits_t< T >::unsigned_t UT; 2192 typedef typename traits_t< T >::signed_t ST; 2193 register kmp_uint32 team_id; 2194 register kmp_uint32 nteams; 2195 register UT trip_count; 2196 register kmp_team_t *team; 2197 kmp_info_t * th; 2198 2199 KMP_DEBUG_ASSERT( plastiter && plower && pupper ); 2200 KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid)); 2201 #ifdef KMP_DEBUG 2202 { 2203 const char * buff; 2204 // create format specifiers before the debug output 2205 buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\ 2206 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n", 2207 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec, 2208 traits_t< T >::spec ); 2209 KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) ); 2210 __kmp_str_free( &buff ); 2211 } 2212 #endif 2213 2214 if( __kmp_env_consistency_check ) { 2215 if( incr == 0 ) { 2216 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc ); 2217 } 2218 if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) { 2219 // The loop is illegal. 2220 // Some zero-trip loops maintained by compiler, e.g.: 2221 // for(i=10;i<0;++i) // lower >= upper - run-time check 2222 // for(i=0;i>10;--i) // lower <= upper - run-time check 2223 // for(i=0;i>10;++i) // incr > 0 - compile-time check 2224 // for(i=10;i<0;--i) // incr < 0 - compile-time check 2225 // Compiler does not check the following illegal loops: 2226 // for(i=0;i<10;i+=incr) // where incr<0 2227 // for(i=10;i>0;i-=incr) // where incr<0 2228 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc ); 2229 } 2230 } 2231 th = __kmp_threads[gtid]; 2232 team = th->th.th_team; 2233 #if OMP_40_ENABLED 2234 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct 2235 nteams = th->th.th_teams_size.nteams; 2236 #endif 2237 team_id = team->t.t_master_tid; 2238 KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc); 2239 2240 // compute global trip count 2241 if( incr == 1 ) { 2242 trip_count = *pupper - *plower + 1; 2243 } else if(incr == -1) { 2244 trip_count = *plower - *pupper + 1; 2245 } else { 2246 trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case 2247 } 2248 2249 if( trip_count <= nteams ) { 2250 KMP_DEBUG_ASSERT( 2251 __kmp_static == kmp_sch_static_greedy || \ 2252 __kmp_static == kmp_sch_static_balanced 2253 ); // Unknown static scheduling type. 2254 // only some teams get single iteration, others get nothing 2255 if( team_id < trip_count ) { 2256 *pupper = *plower = *plower + team_id * incr; 2257 } else { 2258 *plower = *pupper + incr; // zero-trip loop 2259 } 2260 if( plastiter != NULL ) 2261 *plastiter = ( team_id == trip_count - 1 ); 2262 } else { 2263 if( __kmp_static == kmp_sch_static_balanced ) { 2264 register UT chunk = trip_count / nteams; 2265 register UT extras = trip_count % nteams; 2266 *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) ); 2267 *pupper = *plower + chunk * incr - ( team_id < extras ? 
0 : incr ); 2268 if( plastiter != NULL ) 2269 *plastiter = ( team_id == nteams - 1 ); 2270 } else { 2271 register T chunk_inc_count = 2272 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr; 2273 register T upper = *pupper; 2274 KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy ); 2275 // Unknown static scheduling type. 2276 *plower += team_id * chunk_inc_count; 2277 *pupper = *plower + chunk_inc_count - incr; 2278 // Check/correct bounds if needed 2279 if( incr > 0 ) { 2280 if( *pupper < *plower ) 2281 *pupper = i_maxmin< T >::mx; 2282 if( plastiter != NULL ) 2283 *plastiter = *plower <= upper && *pupper > upper - incr; 2284 if( *pupper > upper ) 2285 *pupper = upper; // tracker C73258 2286 } else { 2287 if( *pupper > *plower ) 2288 *pupper = i_maxmin< T >::mn; 2289 if( plastiter != NULL ) 2290 *plastiter = *plower >= upper && *pupper < upper - incr; 2291 if( *pupper < upper ) 2292 *pupper = upper; // tracker C73258 2293 } 2294 } 2295 } 2296 } 2297 2298 //----------------------------------------------------------------------------------------- 2299 // Dispatch routines 2300 // Transfer call to template< typename T > 2301 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule, 2302 // T lb, T ub, ST st, ST chunk ) 2303 extern "C" { 2304 2305 /*! 2306 @ingroup WORK_SHARING 2307 @{ 2308 @param loc Source location 2309 @param gtid Global thread id 2310 @param schedule Schedule type 2311 @param lb Lower bound 2312 @param ub Upper bound 2313 @param st Step (or increment if you prefer) 2314 @param chunk The chunk size to block with 2315 2316 This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments. 2317 These functions are all identical apart from the types of the arguments. 2318 */ 2319 2320 void 2321 __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2322 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk ) 2323 { 2324 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2325 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2326 } 2327 /*! 2328 See @ref __kmpc_dispatch_init_4 2329 */ 2330 void 2331 __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2332 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk ) 2333 { 2334 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2335 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2336 } 2337 2338 /*! 2339 See @ref __kmpc_dispatch_init_4 2340 */ 2341 void 2342 __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2343 kmp_int64 lb, kmp_int64 ub, 2344 kmp_int64 st, kmp_int64 chunk ) 2345 { 2346 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2347 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2348 } 2349 2350 /*! 2351 See @ref __kmpc_dispatch_init_4 2352 */ 2353 void 2354 __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2355 kmp_uint64 lb, kmp_uint64 ub, 2356 kmp_int64 st, kmp_int64 chunk ) 2357 { 2358 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2359 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2360 } 2361 2362 /*! 2363 See @ref __kmpc_dispatch_init_4 2364 2365 These functions differ from the __kmpc_dispatch_init set in that they 2366 are called for the composite distribute parallel for construct. Thus, before 2367 the regular iterations are dispatched, the per-team iteration space must be computed.
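    As an illustration (assuming __kmp_static == kmp_sch_static_balanced in
    __kmp_dist_get_bounds above): with a global trip count of 10 and 4 teams,
    chunk = 10/4 = 2 and extras = 10%4 = 2, so teams 0 and 1 each receive 3
    iterations and teams 2 and 3 receive 2; the regular schedule then operates
    only within each team's sub-range.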
2368 2369 These functions are all identical apart from the types of the arguments. 2370 */ 2371 void 2372 __kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2373 kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk ) 2374 { 2375 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2376 __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st ); 2377 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2378 } 2379 2380 void 2381 __kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2382 kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk ) 2383 { 2384 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2385 __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st ); 2386 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2387 } 2388 2389 void 2390 __kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2391 kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk ) 2392 { 2393 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2394 __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st ); 2395 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2396 } 2397 2398 void 2399 __kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2400 kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk ) 2401 { 2402 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2403 __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st ); 2404 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2405 } 2406 2407 /*! 2408 @param loc Source code location 2409 @param gtid Global thread id 2410 @param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise 2411 @param p_lb Pointer to the lower bound for the next chunk of work 2412 @param p_ub Pointer to the upper bound for the next chunk of work 2413 @param p_st Pointer to the stride for the next chunk of work 2414 @return one if there is work to be done, zero otherwise 2415 2416 Get the next dynamically allocated chunk of work for this thread. 2417 If there is no more work, then the lb,ub and stride need not be modified. 2418 */ 2419 int 2420 __kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2421 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st ) 2422 { 2423 return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st ); 2424 } 2425 2426 /*! 2427 See @ref __kmpc_dispatch_next_4 2428 */ 2429 int 2430 __kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2431 kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st ) 2432 { 2433 return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st ); 2434 } 2435 2436 /*! 2437 See @ref __kmpc_dispatch_next_4 2438 */ 2439 int 2440 __kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2441 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st ) 2442 { 2443 return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st ); 2444 } 2445 2446 /*! 2447 See @ref __kmpc_dispatch_next_4 2448 */ 2449 int 2450 __kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2451 kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st ) 2452 { 2453 return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st ); 2454 } 2455 2456 /*! 
2457 @param loc Source code location 2458 @param gtid Global thread id 2459 2460 Mark the end of a dynamic loop. 2461 */ 2462 void 2463 __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid ) 2464 { 2465 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc ); 2466 } 2467 2468 /*! 2469 See @ref __kmpc_dispatch_fini_4 2470 */ 2471 void 2472 __kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid ) 2473 { 2474 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc ); 2475 } 2476 2477 /*! 2478 See @ref __kmpc_dispatch_fini_4 2479 */ 2480 void 2481 __kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid ) 2482 { 2483 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc ); 2484 } 2485 2486 /*! 2487 See @ref __kmpc_dispatch_fini_4 2488 */ 2489 void 2490 __kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid ) 2491 { 2492 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc ); 2493 } 2494 /*! @} */ 2495 2496 //----------------------------------------------------------------------------------------- 2497 //Non-template routines from kmp_dispatch.c used in other sources 2498 2499 kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) { 2500 return value == checker; 2501 } 2502 2503 kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) { 2504 return value != checker; 2505 } 2506 2507 kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) { 2508 return value < checker; 2509 } 2510 2511 kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) { 2512 return value >= checker; 2513 } 2514 2515 kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) { 2516 return value <= checker; 2517 } 2518 kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) { 2519 return value == checker; 2520 } 2521 2522 kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) { 2523 return value != checker; 2524 } 2525 2526 kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) { 2527 return value < checker; 2528 } 2529 2530 kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) { 2531 return value >= checker; 2532 } 2533 2534 kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) { 2535 return value <= checker; 2536 } 2537 2538 kmp_uint32 2539 __kmp_wait_yield_4(volatile kmp_uint32 * spinner, 2540 kmp_uint32 checker, 2541 kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 ) 2542 , void * obj // Higher-level synchronization object, or NULL. 2543 ) 2544 { 2545 // note: we may not belong to a team at this point 2546 register volatile kmp_uint32 * spin = spinner; 2547 register kmp_uint32 check = checker; 2548 register kmp_uint32 spins; 2549 register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred; 2550 register kmp_uint32 r; 2551 2552 KMP_FSYNC_SPIN_INIT( obj, (void*) spin ); 2553 KMP_INIT_YIELD( spins ); 2554 // main wait spin loop 2555 while(!f(r = TCR_4(*spin), check)) { 2556 KMP_FSYNC_SPIN_PREPARE( obj ); 2557 /* GEH - remove this since it was accidentally introduced when kmp_wait was split. 
It causes problems with infinite recursion because of exit lock */ 2559 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 2560 __kmp_abort_thread(); */ 2561 2562 /* if we have waited a bit, or are oversubscribed, yield */ 2563 /* pause is in the following code */ 2564 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc ); 2565 KMP_YIELD_SPIN( spins ); 2566 } 2567 KMP_FSYNC_SPIN_ACQUIRED( obj ); 2568 return r; 2569 } 2570 2571 kmp_uint64 2572 __kmp_wait_yield_8( volatile kmp_uint64 * spinner, 2573 kmp_uint64 checker, 2574 kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 ) 2575 , void * obj // Higher-level synchronization object, or NULL. 2576 ) 2577 { 2578 // note: we may not belong to a team at this point 2579 register volatile kmp_uint64 * spin = spinner; 2580 register kmp_uint64 check = checker; 2581 register kmp_uint32 spins; 2582 register kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred; 2583 register kmp_uint64 r; 2584 2585 KMP_FSYNC_SPIN_INIT( obj, (void*) spin ); 2586 KMP_INIT_YIELD( spins ); 2587 // main wait spin loop 2588 while(!f(r = *spin, check)) 2589 { 2590 KMP_FSYNC_SPIN_PREPARE( obj ); 2591 /* GEH - remove this since it was accidentally introduced when kmp_wait was split. 2592 It causes problems with infinite recursion because of exit lock */ 2593 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 2594 __kmp_abort_thread(); */ 2595 2596 // if we are oversubscribed, 2597 // or have waited a bit (and KMP_LIBRARY=throughput), then yield 2598 // pause is in the following code 2599 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc ); 2600 KMP_YIELD_SPIN( spins ); 2601 } 2602 KMP_FSYNC_SPIN_ACQUIRED( obj ); 2603 return r; 2604 } 2605 2606 } // extern "C" 2607 2608 #ifdef KMP_GOMP_COMPAT 2609 2610 void 2611 __kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2612 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, 2613 kmp_int32 chunk, int push_ws ) 2614 { 2615 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, 2616 push_ws ); 2617 } 2618 2619 void 2620 __kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2621 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, 2622 kmp_int32 chunk, int push_ws ) 2623 { 2624 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, 2625 push_ws ); 2626 } 2627 2628 void 2629 __kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2630 kmp_int64 lb, kmp_int64 ub, kmp_int64 st, 2631 kmp_int64 chunk, int push_ws ) 2632 { 2633 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, 2634 push_ws ); 2635 } 2636 2637 void 2638 __kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2639 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, 2640 kmp_int64 chunk, int push_ws ) 2641 { 2642 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, 2643 push_ws ); 2644 } 2645 2646 void 2647 __kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid ) 2648 { 2649 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc ); 2650 } 2651 2652 void 2653 __kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid ) 2654 { 2655 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc ); 2656 } 2657 2658 void 2659 __kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid ) 2660 { 2661 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc ); 2662 } 2663 2664 void 2665 __kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid ) 2666 { 2667 __kmp_dispatch_finish_chunk< kmp_uint64 >(
gtid, loc ); 2668 } 2669 2670 #endif /* KMP_GOMP_COMPAT */ 2671 2672 /* ------------------------------------------------------------------------ */ 2673 /* ------------------------------------------------------------------------ */ 2674 2675
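/*
    Illustrative sketch only (not part of the runtime, hence guarded out): how
    compiler-generated code is expected to drive the dispatch entry points
    defined above for a loop such as
        #pragma omp for schedule(dynamic, 4)
    The function name example_dynamic_loop and the chunk size 4 are hypothetical;
    the entry points, kmp_sch_dynamic_chunked and __kmpc_global_thread_num are
    the ones declared in this file and in kmp.h.
*/
#if 0
static void
example_dynamic_loop( ident_t *loc, kmp_int32 n )
{
    kmp_int32 gtid = __kmpc_global_thread_num( loc );
    kmp_int32 lb, ub, st, last;

    // register the loop [0, n-1] with step 1 and chunk size 4
    __kmpc_dispatch_init_4( loc, gtid, kmp_sch_dynamic_chunked, 0, n - 1, 1, 4 );

    // repeatedly ask for the next chunk until the runtime reports no more work
    while ( __kmpc_dispatch_next_4( loc, gtid, &last, &lb, &ub, &st ) ) {
        for ( kmp_int32 i = lb; i <= ub; i += st ) {
            ; // loop body for iteration i
        }
    }
    // a loop with an ordered clause would also involve __kmpc_dispatch_fini_4
    // (see the __kmpc_dispatch_fini_* entry points above)
}
#endif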