1 /* 2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch. 3 */ 4 5 6 //===----------------------------------------------------------------------===// 7 // 8 // The LLVM Compiler Infrastructure 9 // 10 // This file is dual licensed under the MIT and the University of Illinois Open 11 // Source Licenses. See LICENSE.txt for details. 12 // 13 //===----------------------------------------------------------------------===// 14 15 16 /* 17 * Dynamic scheduling initialization and dispatch. 18 * 19 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however 20 * it may change values between parallel regions. __kmp_max_nth 21 * is the largest value __kmp_nth may take, 1 is the smallest. 22 * 23 */ 24 25 /* ------------------------------------------------------------------------ */ 26 /* ------------------------------------------------------------------------ */ 27 28 #include "kmp.h" 29 #include "kmp_i18n.h" 30 #include "kmp_itt.h" 31 #include "kmp_str.h" 32 #include "kmp_error.h" 33 #include "kmp_stats.h" 34 #if KMP_OS_WINDOWS && KMP_ARCH_X86 35 #include <float.h> 36 #endif 37 38 #if OMPT_SUPPORT 39 #include "ompt-internal.h" 40 #include "ompt-specific.h" 41 #endif 42 43 /* ------------------------------------------------------------------------ */ 44 /* ------------------------------------------------------------------------ */ 45 46 // template for type limits 47 template< typename T > 48 struct i_maxmin { 49 static const T mx; 50 static const T mn; 51 }; 52 template<> 53 struct i_maxmin< int > { 54 static const int mx = 0x7fffffff; 55 static const int mn = 0x80000000; 56 }; 57 template<> 58 struct i_maxmin< unsigned int > { 59 static const unsigned int mx = 0xffffffff; 60 static const unsigned int mn = 0x00000000; 61 }; 62 template<> 63 struct i_maxmin< long long > { 64 static const long long mx = 0x7fffffffffffffffLL; 65 static const long long mn = 0x8000000000000000LL; 66 }; 67 template<> 68 struct i_maxmin< unsigned long long > { 69 static const unsigned long long mx = 0xffffffffffffffffLL; 70 static const unsigned long long mn = 0x0000000000000000LL; 71 }; 72 //------------------------------------------------------------------------- 73 74 #ifdef KMP_STATIC_STEAL_ENABLED 75 76 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types 77 template< typename T > 78 struct dispatch_private_infoXX_template { 79 typedef typename traits_t< T >::unsigned_t UT; 80 typedef typename traits_t< T >::signed_t ST; 81 UT count; // unsigned 82 T ub; 83 /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */ 84 T lb; 85 ST st; // signed 86 UT tc; // unsigned 87 T static_steal_counter; // for static_steal only; maybe better to put after ub 88 89 /* parm[1-4] are used in different ways by different scheduling algorithms */ 90 91 // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on ) 92 // a) parm3 is properly aligned and 93 // b) all parm1-4 are in the same cache line. 94 // Because of parm1-4 are used together, performance seems to be better 95 // if they are in the same line (not measured though). 96 97 struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4 98 T parm1; 99 T parm2; 100 T parm3; 101 T parm4; 102 }; 103 104 UT ordered_lower; // unsigned 105 UT ordered_upper; // unsigned 106 #if KMP_OS_WINDOWS 107 T last_upper; 108 #endif /* KMP_OS_WINDOWS */ 109 }; 110 111 #else /* KMP_STATIC_STEAL_ENABLED */ 112 113 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types 114 template< typename T > 115 struct dispatch_private_infoXX_template { 116 typedef typename traits_t< T >::unsigned_t UT; 117 typedef typename traits_t< T >::signed_t ST; 118 T lb; 119 T ub; 120 ST st; // signed 121 UT tc; // unsigned 122 123 T parm1; 124 T parm2; 125 T parm3; 126 T parm4; 127 128 UT count; // unsigned 129 130 UT ordered_lower; // unsigned 131 UT ordered_upper; // unsigned 132 #if KMP_OS_WINDOWS 133 T last_upper; 134 #endif /* KMP_OS_WINDOWS */ 135 }; 136 137 #endif /* KMP_STATIC_STEAL_ENABLED */ 138 139 // replaces dispatch_private_info structure and dispatch_private_info_t type 140 template< typename T > 141 struct KMP_ALIGN_CACHE dispatch_private_info_template { 142 // duplicate alignment here, otherwise size of structure is not correct in our compiler 143 union KMP_ALIGN_CACHE private_info_tmpl { 144 dispatch_private_infoXX_template< T > p; 145 dispatch_private_info64_t p64; 146 } u; 147 enum sched_type schedule; /* scheduling algorithm */ 148 kmp_uint32 ordered; /* ordered clause specified */ 149 kmp_uint32 ordered_bumped; 150 kmp_int32 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making order 151 dispatch_private_info * next; /* stack of buffers for nest of serial regions */ 152 kmp_uint32 nomerge; /* don't merge iters if serialized */ 153 kmp_uint32 type_size; 154 enum cons_type pushed_ws; 155 }; 156 157 158 // replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types 159 template< typename UT > 160 struct dispatch_shared_infoXX_template { 161 /* chunk index under dynamic, number of idle threads under static-steal; 162 iteration index otherwise */ 163 volatile UT iteration; 164 volatile UT num_done; 165 volatile UT ordered_iteration; 166 UT ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size making ordered_iteration scalar 167 }; 168 169 // replaces dispatch_shared_info structure and dispatch_shared_info_t type 170 template< typename UT > 171 struct dispatch_shared_info_template { 172 // we need union here to keep the structure size 173 union shared_info_tmpl { 174 dispatch_shared_infoXX_template< UT > s; 175 dispatch_shared_info64_t s64; 176 } u; 177 volatile kmp_uint32 buffer_index; 178 #if OMP_41_ENABLED 179 volatile kmp_int32 doacross_buf_idx; // teamwise index 180 kmp_uint32 *doacross_flags; // array of iteration flags (0/1) 181 kmp_int32 doacross_num_done; // count finished threads 182 #endif 183 }; 184 185 /* ------------------------------------------------------------------------ */ 186 /* ------------------------------------------------------------------------ */ 187 188 #undef USE_TEST_LOCKS 189 190 // test_then_add template (general template should NOT be used) 191 template< typename T > 192 static __forceinline T 193 test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); }; 194 195 template<> 196 __forceinline kmp_int32 197 test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d ) 198 { 199 kmp_int32 r; 200 r = KMP_TEST_THEN_ADD32( p, d ); 201 return r; 202 } 203 204 template<> 205 __forceinline kmp_int64 206 test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d ) 207 { 208 kmp_int64 r; 209 r = KMP_TEST_THEN_ADD64( p, d ); 210 return r; 211 } 212 213 // test_then_inc_acq template (general template should NOT be used) 214 template< typename T > 215 static __forceinline T 216 test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); }; 217 218 template<> 219 __forceinline kmp_int32 220 test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p ) 221 { 222 kmp_int32 r; 223 r = KMP_TEST_THEN_INC_ACQ32( p ); 224 return r; 225 } 226 227 template<> 228 __forceinline kmp_int64 229 test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p ) 230 { 231 kmp_int64 r; 232 r = KMP_TEST_THEN_INC_ACQ64( p ); 233 return r; 234 } 235 236 // test_then_inc template (general template should NOT be used) 237 template< typename T > 238 static __forceinline T 239 test_then_inc( volatile T *p ) { KMP_ASSERT(0); }; 240 241 template<> 242 __forceinline kmp_int32 243 test_then_inc< kmp_int32 >( volatile kmp_int32 *p ) 244 { 245 kmp_int32 r; 246 r = KMP_TEST_THEN_INC32( p ); 247 return r; 248 } 249 250 template<> 251 __forceinline kmp_int64 252 test_then_inc< kmp_int64 >( volatile kmp_int64 *p ) 253 { 254 kmp_int64 r; 255 r = KMP_TEST_THEN_INC64( p ); 256 return r; 257 } 258 259 // compare_and_swap template (general template should NOT be used) 260 template< typename T > 261 static __forceinline kmp_int32 262 compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); }; 263 264 template<> 265 __forceinline kmp_int32 266 compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s ) 267 { 268 return KMP_COMPARE_AND_STORE_REL32( p, c, s ); 269 } 270 271 template<> 272 __forceinline kmp_int32 273 compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s ) 274 { 275 return KMP_COMPARE_AND_STORE_REL64( p, c, s ); 276 } 277 278 /* 279 Spin wait loop that first does pause, then yield. 280 Waits until function returns non-zero when called with *spinner and check. 281 Does NOT put threads to sleep. 282 #if USE_ITT_BUILD 283 Arguments: 284 obj -- is higher-level synchronization object to report to ittnotify. It is used to report 285 locks consistently. For example, if lock is acquired immediately, its address is 286 reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, it lock cannot be acquired 287 immediately and lock routine calls to KMP_WAIT_YIELD(), the later should report the same 288 address, not an address of low-level spinner. 289 #endif // USE_ITT_BUILD 290 */ 291 template< typename UT > 292 // ToDo: make inline function (move to header file for icl) 293 static UT // unsigned 4- or 8-byte type 294 __kmp_wait_yield( volatile UT * spinner, 295 UT checker, 296 kmp_uint32 (* pred)( UT, UT ) 297 USE_ITT_BUILD_ARG(void * obj) // Higher-level synchronization object, or NULL. 298 ) 299 { 300 // note: we may not belong to a team at this point 301 register volatile UT * spin = spinner; 302 register UT check = checker; 303 register kmp_uint32 spins; 304 register kmp_uint32 (*f) ( UT, UT ) = pred; 305 register UT r; 306 307 KMP_FSYNC_SPIN_INIT( obj, (void*) spin ); 308 KMP_INIT_YIELD( spins ); 309 // main wait spin loop 310 while(!f(r = *spin, check)) 311 { 312 KMP_FSYNC_SPIN_PREPARE( obj ); 313 /* GEH - remove this since it was accidentally introduced when kmp_wait was split. 314 It causes problems with infinite recursion because of exit lock */ 315 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 316 __kmp_abort_thread(); */ 317 318 // if we are oversubscribed, 319 // or have waited a bit (and KMP_LIBRARY=throughput, then yield 320 // pause is in the following code 321 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc ); 322 KMP_YIELD_SPIN( spins ); 323 } 324 KMP_FSYNC_SPIN_ACQUIRED( obj ); 325 return r; 326 } 327 328 template< typename UT > 329 static kmp_uint32 __kmp_eq( UT value, UT checker) { 330 return value == checker; 331 } 332 333 template< typename UT > 334 static kmp_uint32 __kmp_neq( UT value, UT checker) { 335 return value != checker; 336 } 337 338 template< typename UT > 339 static kmp_uint32 __kmp_lt( UT value, UT checker) { 340 return value < checker; 341 } 342 343 template< typename UT > 344 static kmp_uint32 __kmp_ge( UT value, UT checker) { 345 return value >= checker; 346 } 347 348 template< typename UT > 349 static kmp_uint32 __kmp_le( UT value, UT checker) { 350 return value <= checker; 351 } 352 353 354 /* ------------------------------------------------------------------------ */ 355 /* ------------------------------------------------------------------------ */ 356 357 static void 358 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 359 { 360 kmp_info_t *th; 361 362 KMP_DEBUG_ASSERT( gtid_ref ); 363 364 if ( __kmp_env_consistency_check ) { 365 th = __kmp_threads[*gtid_ref]; 366 if ( th -> th.th_root -> r.r_active 367 && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) { 368 #if KMP_USE_DYNAMIC_LOCK 369 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 ); 370 #else 371 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL ); 372 #endif 373 } 374 } 375 } 376 377 template< typename UT > 378 static void 379 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 380 { 381 typedef typename traits_t< UT >::signed_t ST; 382 dispatch_private_info_template< UT > * pr; 383 384 int gtid = *gtid_ref; 385 // int cid = *cid_ref; 386 kmp_info_t *th = __kmp_threads[ gtid ]; 387 KMP_DEBUG_ASSERT( th -> th.th_dispatch ); 388 389 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) ); 390 if ( __kmp_env_consistency_check ) { 391 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 392 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 393 if ( pr -> pushed_ws != ct_none ) { 394 #if KMP_USE_DYNAMIC_LOCK 395 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 ); 396 #else 397 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL ); 398 #endif 399 } 400 } 401 402 if ( ! th -> th.th_team -> t.t_serialized ) { 403 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 404 ( th -> th.th_dispatch -> th_dispatch_sh_current ); 405 UT lower; 406 407 if ( ! __kmp_env_consistency_check ) { 408 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 409 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 410 } 411 lower = pr->u.p.ordered_lower; 412 413 #if ! defined( KMP_GOMP_COMPAT ) 414 if ( __kmp_env_consistency_check ) { 415 if ( pr->ordered_bumped ) { 416 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons; 417 __kmp_error_construct2( 418 kmp_i18n_msg_CnsMultipleNesting, 419 ct_ordered_in_pdo, loc_ref, 420 & p->stack_data[ p->w_top ] 421 ); 422 } 423 } 424 #endif /* !defined(KMP_GOMP_COMPAT) */ 425 426 KMP_MB(); 427 #ifdef KMP_DEBUG 428 { 429 const char * buff; 430 // create format specifiers before the debug output 431 buff = __kmp_str_format( 432 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n", 433 traits_t< UT >::spec, traits_t< UT >::spec ); 434 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 435 __kmp_str_free( &buff ); 436 } 437 #endif 438 439 __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 440 USE_ITT_BUILD_ARG( NULL ) 441 ); 442 KMP_MB(); /* is this necessary? */ 443 #ifdef KMP_DEBUG 444 { 445 const char * buff; 446 // create format specifiers before the debug output 447 buff = __kmp_str_format( 448 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n", 449 traits_t< UT >::spec, traits_t< UT >::spec ); 450 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 451 __kmp_str_free( &buff ); 452 } 453 #endif 454 } 455 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) ); 456 } 457 458 static void 459 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 460 { 461 kmp_info_t *th; 462 463 if ( __kmp_env_consistency_check ) { 464 th = __kmp_threads[*gtid_ref]; 465 if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) { 466 __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref ); 467 } 468 } 469 } 470 471 template< typename UT > 472 static void 473 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 474 { 475 typedef typename traits_t< UT >::signed_t ST; 476 dispatch_private_info_template< UT > * pr; 477 478 int gtid = *gtid_ref; 479 // int cid = *cid_ref; 480 kmp_info_t *th = __kmp_threads[ gtid ]; 481 KMP_DEBUG_ASSERT( th -> th.th_dispatch ); 482 483 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) ); 484 if ( __kmp_env_consistency_check ) { 485 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 486 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 487 if ( pr -> pushed_ws != ct_none ) { 488 __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref ); 489 } 490 } 491 492 if ( ! th -> th.th_team -> t.t_serialized ) { 493 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 494 ( th -> th.th_dispatch -> th_dispatch_sh_current ); 495 496 if ( ! __kmp_env_consistency_check ) { 497 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 498 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 499 } 500 501 KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration ); 502 #if ! defined( KMP_GOMP_COMPAT ) 503 if ( __kmp_env_consistency_check ) { 504 if ( pr->ordered_bumped != 0 ) { 505 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons; 506 /* How to test it? - OM */ 507 __kmp_error_construct2( 508 kmp_i18n_msg_CnsMultipleNesting, 509 ct_ordered_in_pdo, loc_ref, 510 & p->stack_data[ p->w_top ] 511 ); 512 } 513 } 514 #endif /* !defined(KMP_GOMP_COMPAT) */ 515 516 KMP_MB(); /* Flush all pending memory write invalidates. */ 517 518 pr->ordered_bumped += 1; 519 520 KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n", 521 gtid, pr->ordered_bumped ) ); 522 523 KMP_MB(); /* Flush all pending memory write invalidates. */ 524 525 /* TODO use general release procedure? */ 526 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration ); 527 528 KMP_MB(); /* Flush all pending memory write invalidates. */ 529 } 530 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) ); 531 } 532 533 /* Computes and returns x to the power of y, where y must a non-negative integer */ 534 template< typename UT > 535 static __forceinline long double 536 __kmp_pow(long double x, UT y) { 537 long double s=1.0L; 538 539 KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0); 540 //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned 541 while(y) { 542 if ( y & 1 ) 543 s *= x; 544 x *= x; 545 y >>= 1; 546 } 547 return s; 548 } 549 550 /* Computes and returns the number of unassigned iterations after idx chunks have been assigned 551 (the total number of unassigned iterations in chunks with index greater than or equal to idx). 552 __forceinline seems to be broken so that if we __forceinline this function, the behavior is wrong 553 (one of the unit tests, sch_guided_analytical_basic.cpp, fails) 554 */ 555 template< typename T > 556 static __inline typename traits_t< T >::unsigned_t 557 __kmp_dispatch_guided_remaining( 558 T tc, 559 typename traits_t< T >::floating_t base, 560 typename traits_t< T >::unsigned_t idx 561 ) { 562 /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at 563 least for ICL 8.1, long double arithmetic may not really have 564 long double precision, even with /Qlong_double. Currently, we 565 workaround that in the caller code, by manipulating the FPCW for 566 Windows* OS on IA-32 architecture. The lack of precision is not 567 expected to be a correctness issue, though. 568 */ 569 typedef typename traits_t< T >::unsigned_t UT; 570 571 long double x = tc * __kmp_pow< UT >(base, idx); 572 UT r = (UT) x; 573 if ( x == r ) 574 return r; 575 return r + 1; 576 } 577 578 // Parameters of the guided-iterative algorithm: 579 // p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic 580 // p3 = 1 / ( n * nproc ) // remaining iterations multiplier 581 // by default n = 2. For example with n = 3 the chunks distribution will be more flat. 582 // With n = 1 first chunk is the same as for static schedule, e.g. trip / nproc. 583 static int guided_int_param = 2; 584 static double guided_flt_param = 0.5;// = 1.0 / guided_int_param; 585 586 // UT - unsigned flavor of T, ST - signed flavor of T, 587 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8 588 template< typename T > 589 static void 590 __kmp_dispatch_init( 591 ident_t * loc, 592 int gtid, 593 enum sched_type schedule, 594 T lb, 595 T ub, 596 typename traits_t< T >::signed_t st, 597 typename traits_t< T >::signed_t chunk, 598 int push_ws 599 ) { 600 typedef typename traits_t< T >::unsigned_t UT; 601 typedef typename traits_t< T >::signed_t ST; 602 typedef typename traits_t< T >::floating_t DBL; 603 static const int ___kmp_size_type = sizeof( UT ); 604 605 int active; 606 T tc; 607 kmp_info_t * th; 608 kmp_team_t * team; 609 kmp_uint32 my_buffer_index; 610 dispatch_private_info_template< T > * pr; 611 dispatch_shared_info_template< UT > volatile * sh; 612 613 KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) ); 614 KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) ); 615 616 if ( ! TCR_4( __kmp_init_parallel ) ) 617 __kmp_parallel_initialize(); 618 619 #if INCLUDE_SSC_MARKS 620 SSC_MARK_DISPATCH_INIT(); 621 #endif 622 #ifdef KMP_DEBUG 623 { 624 const char * buff; 625 // create format specifiers before the debug output 626 buff = __kmp_str_format( 627 "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n", 628 traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 629 KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) ); 630 __kmp_str_free( &buff ); 631 } 632 #endif 633 /* setup data */ 634 th = __kmp_threads[ gtid ]; 635 team = th -> th.th_team; 636 active = ! team -> t.t_serialized; 637 th->th.th_ident = loc; 638 639 #if USE_ITT_BUILD 640 kmp_uint64 cur_chunk = chunk; 641 int itt_need_metadata_reporting = __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 642 KMP_MASTER_GTID(gtid) && 643 #if OMP_40_ENABLED 644 th->th.th_teams_microtask == NULL && 645 #endif 646 team->t.t_active_level == 1; 647 #endif 648 if ( ! active ) { 649 pr = reinterpret_cast< dispatch_private_info_template< T >* > 650 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */ 651 } else { 652 KMP_DEBUG_ASSERT( th->th.th_dispatch == 653 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 654 655 my_buffer_index = th->th.th_dispatch->th_disp_index ++; 656 657 /* What happens when number of threads changes, need to resize buffer? */ 658 pr = reinterpret_cast< dispatch_private_info_template< T > * > 659 ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] ); 660 sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * > 661 ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] ); 662 } 663 664 /* Currently just ignore the monotonic and non-monotonic modifiers (the compiler isn't producing them 665 * yet anyway). 666 * When it is we'll want to look at them somewhere here and use that information to add to our 667 * schedule choice. We shouldn't need to pass them on, they merely affect which schedule we can 668 * legally choose for various dynamic cases. (In paritcular, whether or not a stealing scheme is legal). 669 */ 670 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); 671 672 /* Pick up the nomerge/ordered bits from the scheduling type */ 673 if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) { 674 pr->nomerge = TRUE; 675 schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); 676 } else { 677 pr->nomerge = FALSE; 678 } 679 pr->type_size = ___kmp_size_type; // remember the size of variables 680 if ( kmp_ord_lower & schedule ) { 681 pr->ordered = TRUE; 682 schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); 683 } else { 684 pr->ordered = FALSE; 685 } 686 687 if ( schedule == kmp_sch_static ) { 688 schedule = __kmp_static; 689 } else { 690 if ( schedule == kmp_sch_runtime ) { 691 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified) 692 schedule = team -> t.t_sched.r_sched_type; 693 // Detail the schedule if needed (global controls are differentiated appropriately) 694 if ( schedule == kmp_sch_guided_chunked ) { 695 schedule = __kmp_guided; 696 } else if ( schedule == kmp_sch_static ) { 697 schedule = __kmp_static; 698 } 699 // Use the chunk size specified by OMP_SCHEDULE (or default if not specified) 700 chunk = team -> t.t_sched.chunk; 701 #if USE_ITT_BUILD 702 cur_chunk = chunk; 703 #endif 704 #ifdef KMP_DEBUG 705 { 706 const char * buff; 707 // create format specifiers before the debug output 708 buff = __kmp_str_format( 709 "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n", 710 traits_t< ST >::spec ); 711 KD_TRACE(10, ( buff, gtid, schedule, chunk ) ); 712 __kmp_str_free( &buff ); 713 } 714 #endif 715 } else { 716 if ( schedule == kmp_sch_guided_chunked ) { 717 schedule = __kmp_guided; 718 } 719 if ( chunk <= 0 ) { 720 chunk = KMP_DEFAULT_CHUNK; 721 } 722 } 723 724 if ( schedule == kmp_sch_auto ) { 725 // mapping and differentiation: in the __kmp_do_serial_initialize() 726 schedule = __kmp_auto; 727 #ifdef KMP_DEBUG 728 { 729 const char * buff; 730 // create format specifiers before the debug output 731 buff = __kmp_str_format( 732 "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n", 733 traits_t< ST >::spec ); 734 KD_TRACE(10, ( buff, gtid, schedule, chunk ) ); 735 __kmp_str_free( &buff ); 736 } 737 #endif 738 } 739 740 /* guided analytical not safe for too many threads */ 741 if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) { 742 schedule = kmp_sch_guided_iterative_chunked; 743 KMP_WARNING( DispatchManyThreads ); 744 } 745 pr->u.p.parm1 = chunk; 746 } 747 KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper), 748 "unknown scheduling type" ); 749 750 pr->u.p.count = 0; 751 752 if ( __kmp_env_consistency_check ) { 753 if ( st == 0 ) { 754 __kmp_error_construct( 755 kmp_i18n_msg_CnsLoopIncrZeroProhibited, 756 ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc 757 ); 758 } 759 } 760 761 tc = ( ub - lb + st ); 762 if ( st != 1 ) { 763 if ( st < 0 ) { 764 if ( lb < ub ) { 765 tc = 0; // zero-trip 766 } else { // lb >= ub 767 tc = (ST)tc / st; // convert to signed division 768 } 769 } else { // st > 0 770 if ( ub < lb ) { 771 tc = 0; // zero-trip 772 } else { // lb >= ub 773 tc /= st; 774 } 775 } 776 } else if ( ub < lb ) { // st == 1 777 tc = 0; // zero-trip 778 } 779 780 // Any half-decent optimizer will remove this test when the blocks are empty since the macros expand to nothing 781 // when statistics are disabled. 782 if (schedule == __kmp_static) 783 { 784 KMP_COUNT_BLOCK(OMP_FOR_static); 785 KMP_COUNT_VALUE(FOR_static_iterations, tc); 786 } 787 else 788 { 789 KMP_COUNT_BLOCK(OMP_FOR_dynamic); 790 KMP_COUNT_VALUE(FOR_dynamic_iterations, tc); 791 } 792 793 pr->u.p.lb = lb; 794 pr->u.p.ub = ub; 795 pr->u.p.st = st; 796 pr->u.p.tc = tc; 797 798 #if KMP_OS_WINDOWS 799 pr->u.p.last_upper = ub + st; 800 #endif /* KMP_OS_WINDOWS */ 801 802 /* NOTE: only the active parallel region(s) has active ordered sections */ 803 804 if ( active ) { 805 if ( pr->ordered == 0 ) { 806 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error; 807 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error; 808 } else { 809 pr->ordered_bumped = 0; 810 811 pr->u.p.ordered_lower = 1; 812 pr->u.p.ordered_upper = 0; 813 814 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >; 815 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >; 816 } 817 } 818 819 if ( __kmp_env_consistency_check ) { 820 enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo; 821 if ( push_ws ) { 822 __kmp_push_workshare( gtid, ws, loc ); 823 pr->pushed_ws = ws; 824 } else { 825 __kmp_check_workshare( gtid, ws, loc ); 826 pr->pushed_ws = ct_none; 827 } 828 } 829 830 switch ( schedule ) { 831 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 832 case kmp_sch_static_steal: 833 { 834 T nproc = team->t.t_nproc; 835 T ntc, init; 836 837 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) ); 838 839 ntc = (tc % chunk ? 1 : 0) + tc / chunk; 840 if ( nproc > 1 && ntc >= nproc ) { 841 T id = __kmp_tid_from_gtid(gtid); 842 T small_chunk, extras; 843 844 small_chunk = ntc / nproc; 845 extras = ntc % nproc; 846 847 init = id * small_chunk + ( id < extras ? id : extras ); 848 pr->u.p.count = init; 849 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 ); 850 851 pr->u.p.parm2 = lb; 852 //pr->pfields.parm3 = 0; // it's not used in static_steal 853 pr->u.p.parm4 = id; 854 pr->u.p.st = st; 855 break; 856 } else { 857 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n", 858 gtid ) ); 859 schedule = kmp_sch_static_balanced; 860 /* too few iterations: fall-through to kmp_sch_static_balanced */ 861 } // if 862 /* FALL-THROUGH to static balanced */ 863 } // case 864 #endif 865 case kmp_sch_static_balanced: 866 { 867 T nproc = team->t.t_nproc; 868 T init, limit; 869 870 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n", 871 gtid ) ); 872 873 if ( nproc > 1 ) { 874 T id = __kmp_tid_from_gtid(gtid); 875 876 if ( tc < nproc ) { 877 if ( id < tc ) { 878 init = id; 879 limit = id; 880 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ 881 } else { 882 pr->u.p.count = 1; /* means no more chunks to execute */ 883 pr->u.p.parm1 = FALSE; 884 break; 885 } 886 } else { 887 T small_chunk = tc / nproc; 888 T extras = tc % nproc; 889 init = id * small_chunk + (id < extras ? id : extras); 890 limit = init + small_chunk - (id < extras ? 0 : 1); 891 pr->u.p.parm1 = (id == nproc - 1); 892 } 893 } else { 894 if ( tc > 0 ) { 895 init = 0; 896 limit = tc - 1; 897 pr->u.p.parm1 = TRUE; 898 } else { 899 // zero trip count 900 pr->u.p.count = 1; /* means no more chunks to execute */ 901 pr->u.p.parm1 = FALSE; 902 break; 903 } 904 } 905 #if USE_ITT_BUILD 906 // Calculate chunk for metadata report 907 if ( itt_need_metadata_reporting ) 908 cur_chunk = limit - init + 1; 909 #endif 910 if ( st == 1 ) { 911 pr->u.p.lb = lb + init; 912 pr->u.p.ub = lb + limit; 913 } else { 914 T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound 915 pr->u.p.lb = lb + init * st; 916 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly 917 if ( st > 0 ) { 918 pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp ); 919 } else { 920 pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp ); 921 } 922 } 923 if ( pr->ordered ) { 924 pr->u.p.ordered_lower = init; 925 pr->u.p.ordered_upper = limit; 926 } 927 break; 928 } // case 929 case kmp_sch_guided_iterative_chunked : 930 { 931 T nproc = team->t.t_nproc; 932 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid)); 933 934 if ( nproc > 1 ) { 935 if ( (2L * chunk + 1 ) * nproc >= tc ) { 936 /* chunk size too large, switch to dynamic */ 937 schedule = kmp_sch_dynamic_chunked; 938 } else { 939 // when remaining iters become less than parm2 - switch to dynamic 940 pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 ); 941 *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4 942 } 943 } else { 944 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid)); 945 schedule = kmp_sch_static_greedy; 946 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 947 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid)); 948 pr->u.p.parm1 = tc; 949 } // if 950 } // case 951 break; 952 case kmp_sch_guided_analytical_chunked: 953 { 954 T nproc = team->t.t_nproc; 955 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid)); 956 957 if ( nproc > 1 ) { 958 if ( (2L * chunk + 1 ) * nproc >= tc ) { 959 /* chunk size too large, switch to dynamic */ 960 schedule = kmp_sch_dynamic_chunked; 961 } else { 962 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 963 DBL x; 964 965 #if KMP_OS_WINDOWS && KMP_ARCH_X86 966 /* Linux* OS already has 64-bit computation by default for 967 long double, and on Windows* OS on Intel(R) 64, 968 /Qlong_double doesn't work. On Windows* OS 969 on IA-32 architecture, we need to set precision to 970 64-bit instead of the default 53-bit. Even though long 971 double doesn't work on Windows* OS on Intel(R) 64, the 972 resulting lack of precision is not expected to impact 973 the correctness of the algorithm, but this has not been 974 mathematically proven. 975 */ 976 // save original FPCW and set precision to 64-bit, as 977 // Windows* OS on IA-32 architecture defaults to 53-bit 978 unsigned int oldFpcw = _control87(0,0); 979 _control87(_PC_64,_MCW_PC); // 0,0x30000 980 #endif 981 /* value used for comparison in solver for cross-over point */ 982 long double target = ((long double)chunk * 2 + 1) * nproc / tc; 983 984 /* crossover point--chunk indexes equal to or greater than 985 this point switch to dynamic-style scheduling */ 986 UT cross; 987 988 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 989 x = (long double)1.0 - (long double)0.5 / nproc; 990 991 #ifdef KMP_DEBUG 992 { // test natural alignment 993 struct _test_a { 994 char a; 995 union { 996 char b; 997 DBL d; 998 }; 999 } t; 1000 ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; 1001 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment ); 1002 KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 ); 1003 } 1004 #endif // KMP_DEBUG 1005 1006 /* save the term in thread private dispatch structure */ 1007 *(DBL*)&pr->u.p.parm3 = x; 1008 1009 /* solve for the crossover point to the nearest integer i for which C_i <= chunk */ 1010 { 1011 UT left, right, mid; 1012 long double p; 1013 1014 /* estimate initial upper and lower bound */ 1015 1016 /* doesn't matter what value right is as long as it is positive, but 1017 it affects performance of the solver 1018 */ 1019 right = 229; 1020 p = __kmp_pow< UT >(x,right); 1021 if ( p > target ) { 1022 do{ 1023 p *= p; 1024 right <<= 1; 1025 } while(p>target && right < (1<<27)); 1026 left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */ 1027 } else { 1028 left = 0; 1029 } 1030 1031 /* bisection root-finding method */ 1032 while ( left + 1 < right ) { 1033 mid = (left + right) / 2; 1034 if ( __kmp_pow< UT >(x,mid) > target ) { 1035 left = mid; 1036 } else { 1037 right = mid; 1038 } 1039 } // while 1040 cross = right; 1041 } 1042 /* assert sanity of computed crossover point */ 1043 KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target); 1044 1045 /* save the crossover point in thread private dispatch structure */ 1046 pr->u.p.parm2 = cross; 1047 1048 // C75803 1049 #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) ) 1050 #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3) 1051 #else 1052 #define GUIDED_ANALYTICAL_WORKAROUND (x) 1053 #endif 1054 /* dynamic-style scheduling offset */ 1055 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk; 1056 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1057 // restore FPCW 1058 _control87(oldFpcw,_MCW_PC); 1059 #endif 1060 } // if 1061 } else { 1062 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n", 1063 gtid ) ); 1064 schedule = kmp_sch_static_greedy; 1065 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 1066 pr->u.p.parm1 = tc; 1067 } // if 1068 } // case 1069 break; 1070 case kmp_sch_static_greedy: 1071 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid)); 1072 pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ? 1073 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc : 1074 tc; 1075 break; 1076 case kmp_sch_static_chunked : 1077 case kmp_sch_dynamic_chunked : 1078 if ( pr->u.p.parm1 <= 0 ) { 1079 pr->u.p.parm1 = KMP_DEFAULT_CHUNK; 1080 } 1081 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid)); 1082 break; 1083 case kmp_sch_trapezoidal : 1084 { 1085 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ 1086 1087 T parm1, parm2, parm3, parm4; 1088 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) ); 1089 1090 parm1 = chunk; 1091 1092 /* F : size of the first cycle */ 1093 parm2 = ( tc / (2 * team->t.t_nproc) ); 1094 1095 if ( parm2 < 1 ) { 1096 parm2 = 1; 1097 } 1098 1099 /* L : size of the last cycle. Make sure the last cycle 1100 * is not larger than the first cycle. 1101 */ 1102 if ( parm1 < 1 ) { 1103 parm1 = 1; 1104 } else if ( parm1 > parm2 ) { 1105 parm1 = parm2; 1106 } 1107 1108 /* N : number of cycles */ 1109 parm3 = ( parm2 + parm1 ); 1110 parm3 = ( 2 * tc + parm3 - 1) / parm3; 1111 1112 if ( parm3 < 2 ) { 1113 parm3 = 2; 1114 } 1115 1116 /* sigma : decreasing incr of the trapezoid */ 1117 parm4 = ( parm3 - 1 ); 1118 parm4 = ( parm2 - parm1 ) / parm4; 1119 1120 // pointless check, because parm4 >= 0 always 1121 //if ( parm4 < 0 ) { 1122 // parm4 = 0; 1123 //} 1124 1125 pr->u.p.parm1 = parm1; 1126 pr->u.p.parm2 = parm2; 1127 pr->u.p.parm3 = parm3; 1128 pr->u.p.parm4 = parm4; 1129 } // case 1130 break; 1131 1132 default: 1133 { 1134 __kmp_msg( 1135 kmp_ms_fatal, // Severity 1136 KMP_MSG( UnknownSchedTypeDetected ), // Primary message 1137 KMP_HNT( GetNewerLibrary ), // Hint 1138 __kmp_msg_null // Variadic argument list terminator 1139 ); 1140 } 1141 break; 1142 } // switch 1143 pr->schedule = schedule; 1144 if ( active ) { 1145 /* The name of this buffer should be my_buffer_index when it's free to use it */ 1146 1147 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n", 1148 gtid, my_buffer_index, sh->buffer_index) ); 1149 __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 > 1150 USE_ITT_BUILD_ARG( NULL ) 1151 ); 1152 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are 1153 // *always* 32-bit integers. 1154 KMP_MB(); /* is this necessary? */ 1155 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n", 1156 gtid, my_buffer_index, sh->buffer_index) ); 1157 1158 th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr; 1159 th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh; 1160 #if USE_ITT_BUILD 1161 if ( pr->ordered ) { 1162 __kmp_itt_ordered_init( gtid ); 1163 }; // if 1164 // Report loop metadata 1165 if ( itt_need_metadata_reporting ) { 1166 // Only report metadata by master of active team at level 1 1167 kmp_uint64 schedtype = 0; 1168 switch ( schedule ) { 1169 case kmp_sch_static_chunked: 1170 case kmp_sch_static_balanced:// Chunk is calculated in the switch above 1171 break; 1172 case kmp_sch_static_greedy: 1173 cur_chunk = pr->u.p.parm1; 1174 break; 1175 case kmp_sch_dynamic_chunked: 1176 schedtype = 1; 1177 break; 1178 case kmp_sch_guided_iterative_chunked: 1179 case kmp_sch_guided_analytical_chunked: 1180 schedtype = 2; 1181 break; 1182 default: 1183 // Should we put this case under "static"? 1184 // case kmp_sch_static_steal: 1185 schedtype = 3; 1186 break; 1187 } 1188 __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk); 1189 } 1190 #endif /* USE_ITT_BUILD */ 1191 }; // if 1192 1193 #ifdef KMP_DEBUG 1194 { 1195 const char * buff; 1196 // create format specifiers before the debug output 1197 buff = __kmp_str_format( 1198 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \ 1199 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \ 1200 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n", 1201 traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec, 1202 traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec, 1203 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec, 1204 traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec ); 1205 KD_TRACE(10, ( buff, 1206 gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub, 1207 pr->u.p.st, pr->u.p.tc, pr->u.p.count, 1208 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1, 1209 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) ); 1210 __kmp_str_free( &buff ); 1211 } 1212 #endif 1213 #if ( KMP_STATIC_STEAL_ENABLED ) 1214 if ( ___kmp_size_type < 8 ) { 1215 // It cannot be guaranteed that after execution of a loop with some other schedule kind 1216 // all the parm3 variables will contain the same value. 1217 // Even if all parm3 will be the same, it still exists a bad case like using 0 and 1 1218 // rather than program life-time increment. 1219 // So the dedicated variable is required. The 'static_steal_counter' is used. 1220 if( schedule == kmp_sch_static_steal ) { 1221 // Other threads will inspect this variable when searching for a victim. 1222 // This is a flag showing that other threads may steal from this thread since then. 1223 volatile T * p = &pr->u.p.static_steal_counter; 1224 *p = *p + 1; 1225 } 1226 } 1227 #endif // ( KMP_STATIC_STEAL_ENABLED && USE_STEALING ) 1228 1229 #if OMPT_SUPPORT && OMPT_TRACE 1230 if (ompt_enabled && 1231 ompt_callbacks.ompt_callback(ompt_event_loop_begin)) { 1232 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 1233 ompt_task_info_t *task_info = __ompt_get_taskinfo(0); 1234 ompt_callbacks.ompt_callback(ompt_event_loop_begin)( 1235 team_info->parallel_id, task_info->task_id, team_info->microtask); 1236 } 1237 #endif 1238 } 1239 1240 /* 1241 * For ordered loops, either __kmp_dispatch_finish() should be called after 1242 * every iteration, or __kmp_dispatch_finish_chunk() should be called after 1243 * every chunk of iterations. If the ordered section(s) were not executed 1244 * for this iteration (or every iteration in this chunk), we need to set the 1245 * ordered iteration counters so that the next thread can proceed. 1246 */ 1247 template< typename UT > 1248 static void 1249 __kmp_dispatch_finish( int gtid, ident_t *loc ) 1250 { 1251 typedef typename traits_t< UT >::signed_t ST; 1252 kmp_info_t *th = __kmp_threads[ gtid ]; 1253 1254 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) ); 1255 if ( ! th -> th.th_team -> t.t_serialized ) { 1256 1257 dispatch_private_info_template< UT > * pr = 1258 reinterpret_cast< dispatch_private_info_template< UT >* > 1259 ( th->th.th_dispatch->th_dispatch_pr_current ); 1260 dispatch_shared_info_template< UT > volatile * sh = 1261 reinterpret_cast< dispatch_shared_info_template< UT >volatile* > 1262 ( th->th.th_dispatch->th_dispatch_sh_current ); 1263 KMP_DEBUG_ASSERT( pr ); 1264 KMP_DEBUG_ASSERT( sh ); 1265 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1266 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1267 1268 if ( pr->ordered_bumped ) { 1269 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1270 gtid ) ); 1271 pr->ordered_bumped = 0; 1272 } else { 1273 UT lower = pr->u.p.ordered_lower; 1274 1275 #ifdef KMP_DEBUG 1276 { 1277 const char * buff; 1278 // create format specifiers before the debug output 1279 buff = __kmp_str_format( 1280 "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n", 1281 traits_t< UT >::spec, traits_t< UT >::spec ); 1282 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 1283 __kmp_str_free( &buff ); 1284 } 1285 #endif 1286 1287 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 1288 USE_ITT_BUILD_ARG(NULL) 1289 ); 1290 KMP_MB(); /* is this necessary? */ 1291 #ifdef KMP_DEBUG 1292 { 1293 const char * buff; 1294 // create format specifiers before the debug output 1295 buff = __kmp_str_format( 1296 "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n", 1297 traits_t< UT >::spec, traits_t< UT >::spec ); 1298 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 1299 __kmp_str_free( &buff ); 1300 } 1301 #endif 1302 1303 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration ); 1304 } // if 1305 } // if 1306 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) ); 1307 } 1308 1309 #ifdef KMP_GOMP_COMPAT 1310 1311 template< typename UT > 1312 static void 1313 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc ) 1314 { 1315 typedef typename traits_t< UT >::signed_t ST; 1316 kmp_info_t *th = __kmp_threads[ gtid ]; 1317 1318 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) ); 1319 if ( ! th -> th.th_team -> t.t_serialized ) { 1320 // int cid; 1321 dispatch_private_info_template< UT > * pr = 1322 reinterpret_cast< dispatch_private_info_template< UT >* > 1323 ( th->th.th_dispatch->th_dispatch_pr_current ); 1324 dispatch_shared_info_template< UT > volatile * sh = 1325 reinterpret_cast< dispatch_shared_info_template< UT >volatile* > 1326 ( th->th.th_dispatch->th_dispatch_sh_current ); 1327 KMP_DEBUG_ASSERT( pr ); 1328 KMP_DEBUG_ASSERT( sh ); 1329 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1330 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1331 1332 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) { 1333 UT lower = pr->u.p.ordered_lower; 1334 UT upper = pr->u.p.ordered_upper; 1335 UT inc = upper - lower + 1; 1336 1337 if ( pr->ordered_bumped == inc ) { 1338 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1339 gtid ) ); 1340 pr->ordered_bumped = 0; 1341 } else { 1342 inc -= pr->ordered_bumped; 1343 1344 #ifdef KMP_DEBUG 1345 { 1346 const char * buff; 1347 // create format specifiers before the debug output 1348 buff = __kmp_str_format( 1349 "__kmp_dispatch_finish_chunk: T#%%d before wait: " \ 1350 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n", 1351 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec ); 1352 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) ); 1353 __kmp_str_free( &buff ); 1354 } 1355 #endif 1356 1357 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 1358 USE_ITT_BUILD_ARG(NULL) 1359 ); 1360 1361 KMP_MB(); /* is this necessary? */ 1362 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n", 1363 gtid ) ); 1364 pr->ordered_bumped = 0; 1365 //!!!!! TODO check if the inc should be unsigned, or signed??? 1366 #ifdef KMP_DEBUG 1367 { 1368 const char * buff; 1369 // create format specifiers before the debug output 1370 buff = __kmp_str_format( 1371 "__kmp_dispatch_finish_chunk: T#%%d after wait: " \ 1372 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n", 1373 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec ); 1374 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) ); 1375 __kmp_str_free( &buff ); 1376 } 1377 #endif 1378 1379 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc); 1380 } 1381 // } 1382 } 1383 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) ); 1384 } 1385 1386 #endif /* KMP_GOMP_COMPAT */ 1387 1388 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 1389 * (no more work), then tell OMPT the loop is over. In some cases 1390 * kmp_dispatch_fini() is not called. */ 1391 #if OMPT_SUPPORT && OMPT_TRACE 1392 #define OMPT_LOOP_END \ 1393 if (status == 0) { \ 1394 if (ompt_enabled && \ 1395 ompt_callbacks.ompt_callback(ompt_event_loop_end)) { \ 1396 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \ 1397 ompt_task_info_t *task_info = __ompt_get_taskinfo(0); \ 1398 ompt_callbacks.ompt_callback(ompt_event_loop_end)( \ 1399 team_info->parallel_id, task_info->task_id); \ 1400 } \ 1401 } 1402 #else 1403 #define OMPT_LOOP_END // no-op 1404 #endif 1405 1406 template< typename T > 1407 static int 1408 __kmp_dispatch_next( 1409 ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st 1410 ) { 1411 1412 typedef typename traits_t< T >::unsigned_t UT; 1413 typedef typename traits_t< T >::signed_t ST; 1414 typedef typename traits_t< T >::floating_t DBL; 1415 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 1416 static const int ___kmp_size_type = sizeof( UT ); 1417 #endif 1418 1419 // This is potentially slightly misleading, schedule(runtime) will appear here even if the actual runtme schedule 1420 // is static. (Which points out a disadavantage of schedule(runtime): even when static scheduling is used it costs 1421 // more than a compile time choice to use static scheduling would.) 1422 KMP_TIME_BLOCK(FOR_dynamic_scheduling); 1423 1424 int status; 1425 dispatch_private_info_template< T > * pr; 1426 kmp_info_t * th = __kmp_threads[ gtid ]; 1427 kmp_team_t * team = th -> th.th_team; 1428 1429 KMP_DEBUG_ASSERT( p_lb && p_ub && p_st ); // AC: these cannot be NULL 1430 #ifdef KMP_DEBUG 1431 { 1432 const char * buff; 1433 // create format specifiers before the debug output 1434 buff = __kmp_str_format( 1435 "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n", 1436 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 1437 KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) ); 1438 __kmp_str_free( &buff ); 1439 } 1440 #endif 1441 1442 if ( team -> t.t_serialized ) { 1443 /* NOTE: serialize this dispatch becase we are not at the active level */ 1444 pr = reinterpret_cast< dispatch_private_info_template< T >* > 1445 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */ 1446 KMP_DEBUG_ASSERT( pr ); 1447 1448 if ( (status = (pr->u.p.tc != 0)) == 0 ) { 1449 *p_lb = 0; 1450 *p_ub = 0; 1451 // if ( p_last != NULL ) 1452 // *p_last = 0; 1453 if ( p_st != NULL ) 1454 *p_st = 0; 1455 if ( __kmp_env_consistency_check ) { 1456 if ( pr->pushed_ws != ct_none ) { 1457 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 1458 } 1459 } 1460 } else if ( pr->nomerge ) { 1461 kmp_int32 last; 1462 T start; 1463 UT limit, trip, init; 1464 ST incr; 1465 T chunk = pr->u.p.parm1; 1466 1467 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) ); 1468 1469 init = chunk * pr->u.p.count++; 1470 trip = pr->u.p.tc - 1; 1471 1472 if ( (status = (init <= trip)) == 0 ) { 1473 *p_lb = 0; 1474 *p_ub = 0; 1475 // if ( p_last != NULL ) 1476 // *p_last = 0; 1477 if ( p_st != NULL ) 1478 *p_st = 0; 1479 if ( __kmp_env_consistency_check ) { 1480 if ( pr->pushed_ws != ct_none ) { 1481 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 1482 } 1483 } 1484 } else { 1485 start = pr->u.p.lb; 1486 limit = chunk + init - 1; 1487 incr = pr->u.p.st; 1488 1489 if ( (last = (limit >= trip)) != 0 ) { 1490 limit = trip; 1491 #if KMP_OS_WINDOWS 1492 pr->u.p.last_upper = pr->u.p.ub; 1493 #endif /* KMP_OS_WINDOWS */ 1494 } 1495 if ( p_last != NULL ) 1496 *p_last = last; 1497 if ( p_st != NULL ) 1498 *p_st = incr; 1499 if ( incr == 1 ) { 1500 *p_lb = start + init; 1501 *p_ub = start + limit; 1502 } else { 1503 *p_lb = start + init * incr; 1504 *p_ub = start + limit * incr; 1505 } 1506 1507 if ( pr->ordered ) { 1508 pr->u.p.ordered_lower = init; 1509 pr->u.p.ordered_upper = limit; 1510 #ifdef KMP_DEBUG 1511 { 1512 const char * buff; 1513 // create format specifiers before the debug output 1514 buff = __kmp_str_format( 1515 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1516 traits_t< UT >::spec, traits_t< UT >::spec ); 1517 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1518 __kmp_str_free( &buff ); 1519 } 1520 #endif 1521 } // if 1522 } // if 1523 } else { 1524 pr->u.p.tc = 0; 1525 *p_lb = pr->u.p.lb; 1526 *p_ub = pr->u.p.ub; 1527 #if KMP_OS_WINDOWS 1528 pr->u.p.last_upper = *p_ub; 1529 #endif /* KMP_OS_WINDOWS */ 1530 if ( p_last != NULL ) 1531 *p_last = TRUE; 1532 if ( p_st != NULL ) 1533 *p_st = pr->u.p.st; 1534 } // if 1535 #ifdef KMP_DEBUG 1536 { 1537 const char * buff; 1538 // create format specifiers before the debug output 1539 buff = __kmp_str_format( 1540 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \ 1541 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", 1542 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 1543 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) ); 1544 __kmp_str_free( &buff ); 1545 } 1546 #endif 1547 #if INCLUDE_SSC_MARKS 1548 SSC_MARK_DISPATCH_NEXT(); 1549 #endif 1550 OMPT_LOOP_END; 1551 return status; 1552 } else { 1553 kmp_int32 last = 0; 1554 dispatch_shared_info_template< UT > *sh; 1555 T start; 1556 ST incr; 1557 UT limit, trip, init; 1558 1559 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1560 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1561 1562 pr = reinterpret_cast< dispatch_private_info_template< T >* > 1563 ( th->th.th_dispatch->th_dispatch_pr_current ); 1564 KMP_DEBUG_ASSERT( pr ); 1565 sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 1566 ( th->th.th_dispatch->th_dispatch_sh_current ); 1567 KMP_DEBUG_ASSERT( sh ); 1568 1569 if ( pr->u.p.tc == 0 ) { 1570 // zero trip count 1571 status = 0; 1572 } else { 1573 switch (pr->schedule) { 1574 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 1575 case kmp_sch_static_steal: 1576 { 1577 T chunk = pr->u.p.parm1; 1578 1579 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) ); 1580 1581 trip = pr->u.p.tc - 1; 1582 1583 if ( ___kmp_size_type > 4 ) { 1584 // Other threads do not look into the data of this thread, 1585 // so it's not necessary to make volatile casting. 1586 init = ( pr->u.p.count )++; 1587 status = ( init < (UT)pr->u.p.ub ); 1588 } else { 1589 typedef union { 1590 struct { 1591 UT count; 1592 T ub; 1593 } p; 1594 kmp_int64 b; 1595 } union_i4; 1596 // All operations on 'count' or 'ub' must be combined atomically together. 1597 // stealing implemented only for 4-byte indexes 1598 { 1599 union_i4 vold, vnew; 1600 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count); 1601 vnew = vold; 1602 vnew.p.count++; 1603 while( ! KMP_COMPARE_AND_STORE_ACQ64( 1604 ( volatile kmp_int64* )&pr->u.p.count, 1605 *VOLATILE_CAST(kmp_int64 *)&vold.b, 1606 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) { 1607 KMP_CPU_PAUSE(); 1608 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count); 1609 vnew = vold; 1610 vnew.p.count++; 1611 } 1612 vnew = vold; 1613 init = vnew.p.count; 1614 status = ( init < (UT)vnew.p.ub ) ; 1615 } 1616 1617 if( !status ) { 1618 kmp_info_t **other_threads = team->t.t_threads; 1619 int while_limit = 10; 1620 int while_index = 0; 1621 1622 // TODO: algorithm of searching for a victim 1623 // should be cleaned up and measured 1624 while ( ( !status ) && ( while_limit != ++while_index ) ) { 1625 union_i4 vold, vnew; 1626 kmp_int32 remaining; // kmp_int32 because KMP_I4 only 1627 T victimIdx = pr->u.p.parm4; 1628 T oldVictimIdx = victimIdx; 1629 dispatch_private_info_template< T > * victim; 1630 1631 do { 1632 if( !victimIdx ) { 1633 victimIdx = team->t.t_nproc - 1; 1634 } else { 1635 --victimIdx; 1636 } 1637 victim = reinterpret_cast< dispatch_private_info_template< T >* > 1638 ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current ); 1639 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx ); 1640 // TODO: think about a proper place of this test 1641 if ( ( !victim ) || 1642 ( (*( volatile T * )&victim->u.p.static_steal_counter) != 1643 (*( volatile T * )&pr->u.p.static_steal_counter) ) ) { 1644 // TODO: delay would be nice 1645 continue; 1646 // the victim is not ready yet to participate in stealing 1647 // because the victim is still in kmp_init_dispatch 1648 } 1649 if ( oldVictimIdx == victimIdx ) { 1650 break; 1651 } 1652 pr->u.p.parm4 = victimIdx; 1653 1654 while( 1 ) { 1655 vold.b = *( volatile kmp_int64 * )( &victim->u.p.count ); 1656 vnew = vold; 1657 1658 KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip ); 1659 if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) { 1660 break; 1661 } 1662 vnew.p.ub -= (remaining >> 2); 1663 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1664 #pragma warning( push ) 1665 // disable warning on pointless comparison of unsigned with 0 1666 #pragma warning( disable: 186 ) 1667 KMP_DEBUG_ASSERT(vnew.p.ub >= 0); 1668 #pragma warning( pop ) 1669 // TODO: Should this be acquire or release? 1670 if ( KMP_COMPARE_AND_STORE_ACQ64( 1671 ( volatile kmp_int64 * )&victim->u.p.count, 1672 *VOLATILE_CAST(kmp_int64 *)&vold.b, 1673 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) { 1674 status = 1; 1675 while_index = 0; 1676 // now update own count and ub 1677 #if KMP_ARCH_X86 1678 // stealing executed on non-KMP_ARCH_X86 only 1679 // Atomic 64-bit write on ia32 is 1680 // unavailable, so we do this in steps. 1681 // This code is not tested. 1682 init = vold.p.count; 1683 pr->u.p.ub = 0; 1684 pr->u.p.count = init + 1; 1685 pr->u.p.ub = vnew.p.count; 1686 #else 1687 init = vnew.p.ub; 1688 vold.p.count = init + 1; 1689 // TODO: is it safe and enough? 1690 *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b; 1691 #endif // KMP_ARCH_X86 1692 break; 1693 } // if 1694 KMP_CPU_PAUSE(); 1695 } // while (1) 1696 } // while 1697 } // if 1698 } // if 1699 if ( !status ) { 1700 *p_lb = 0; 1701 *p_ub = 0; 1702 if ( p_st != NULL ) *p_st = 0; 1703 } else { 1704 start = pr->u.p.parm2; 1705 init *= chunk; 1706 limit = chunk + init - 1; 1707 incr = pr->u.p.st; 1708 1709 KMP_DEBUG_ASSERT(init <= trip); 1710 if ( (last = (limit >= trip)) != 0 ) 1711 limit = trip; 1712 if ( p_st != NULL ) *p_st = incr; 1713 1714 if ( incr == 1 ) { 1715 *p_lb = start + init; 1716 *p_ub = start + limit; 1717 } else { 1718 *p_lb = start + init * incr; 1719 *p_ub = start + limit * incr; 1720 } 1721 1722 if ( pr->ordered ) { 1723 pr->u.p.ordered_lower = init; 1724 pr->u.p.ordered_upper = limit; 1725 #ifdef KMP_DEBUG 1726 { 1727 const char * buff; 1728 // create format specifiers before the debug output 1729 buff = __kmp_str_format( 1730 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1731 traits_t< UT >::spec, traits_t< UT >::spec ); 1732 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1733 __kmp_str_free( &buff ); 1734 } 1735 #endif 1736 } // if 1737 } // if 1738 break; 1739 } // case 1740 #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 1741 case kmp_sch_static_balanced: 1742 { 1743 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) ); 1744 if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */ 1745 pr->u.p.count = 1; 1746 *p_lb = pr->u.p.lb; 1747 *p_ub = pr->u.p.ub; 1748 last = pr->u.p.parm1; 1749 if ( p_st != NULL ) 1750 *p_st = pr->u.p.st; 1751 } else { /* no iterations to do */ 1752 pr->u.p.lb = pr->u.p.ub + pr->u.p.st; 1753 } 1754 if ( pr->ordered ) { 1755 #ifdef KMP_DEBUG 1756 { 1757 const char * buff; 1758 // create format specifiers before the debug output 1759 buff = __kmp_str_format( 1760 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1761 traits_t< UT >::spec, traits_t< UT >::spec ); 1762 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1763 __kmp_str_free( &buff ); 1764 } 1765 #endif 1766 } // if 1767 } // case 1768 break; 1769 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */ 1770 case kmp_sch_static_chunked: 1771 { 1772 T parm1; 1773 1774 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n", 1775 gtid ) ); 1776 parm1 = pr->u.p.parm1; 1777 1778 trip = pr->u.p.tc - 1; 1779 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid)); 1780 1781 if ( (status = (init <= trip)) != 0 ) { 1782 start = pr->u.p.lb; 1783 incr = pr->u.p.st; 1784 limit = parm1 + init - 1; 1785 1786 if ( (last = (limit >= trip)) != 0 ) 1787 limit = trip; 1788 1789 if ( p_st != NULL ) *p_st = incr; 1790 1791 pr->u.p.count += team->t.t_nproc; 1792 1793 if ( incr == 1 ) { 1794 *p_lb = start + init; 1795 *p_ub = start + limit; 1796 } 1797 else { 1798 *p_lb = start + init * incr; 1799 *p_ub = start + limit * incr; 1800 } 1801 1802 if ( pr->ordered ) { 1803 pr->u.p.ordered_lower = init; 1804 pr->u.p.ordered_upper = limit; 1805 #ifdef KMP_DEBUG 1806 { 1807 const char * buff; 1808 // create format specifiers before the debug output 1809 buff = __kmp_str_format( 1810 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1811 traits_t< UT >::spec, traits_t< UT >::spec ); 1812 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1813 __kmp_str_free( &buff ); 1814 } 1815 #endif 1816 } // if 1817 } // if 1818 } // case 1819 break; 1820 1821 case kmp_sch_dynamic_chunked: 1822 { 1823 T chunk = pr->u.p.parm1; 1824 1825 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", 1826 gtid ) ); 1827 1828 init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration ); 1829 trip = pr->u.p.tc - 1; 1830 1831 if ( (status = (init <= trip)) == 0 ) { 1832 *p_lb = 0; 1833 *p_ub = 0; 1834 if ( p_st != NULL ) *p_st = 0; 1835 } else { 1836 start = pr->u.p.lb; 1837 limit = chunk + init - 1; 1838 incr = pr->u.p.st; 1839 1840 if ( (last = (limit >= trip)) != 0 ) 1841 limit = trip; 1842 1843 if ( p_st != NULL ) *p_st = incr; 1844 1845 if ( incr == 1 ) { 1846 *p_lb = start + init; 1847 *p_ub = start + limit; 1848 } else { 1849 *p_lb = start + init * incr; 1850 *p_ub = start + limit * incr; 1851 } 1852 1853 if ( pr->ordered ) { 1854 pr->u.p.ordered_lower = init; 1855 pr->u.p.ordered_upper = limit; 1856 #ifdef KMP_DEBUG 1857 { 1858 const char * buff; 1859 // create format specifiers before the debug output 1860 buff = __kmp_str_format( 1861 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1862 traits_t< UT >::spec, traits_t< UT >::spec ); 1863 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1864 __kmp_str_free( &buff ); 1865 } 1866 #endif 1867 } // if 1868 } // if 1869 } // case 1870 break; 1871 1872 case kmp_sch_guided_iterative_chunked: 1873 { 1874 T chunkspec = pr->u.p.parm1; 1875 KD_TRACE(100, 1876 ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid)); 1877 trip = pr->u.p.tc; 1878 // Start atomic part of calculations 1879 while(1) { 1880 ST remaining; // signed, because can be < 0 1881 init = sh->u.s.iteration; // shared value 1882 remaining = trip - init; 1883 if ( remaining <= 0 ) { // AC: need to compare with 0 first 1884 // nothing to do, don't try atomic op 1885 status = 0; 1886 break; 1887 } 1888 if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default 1889 // use dynamic-style shcedule 1890 // atomically inrement iterations, get old value 1891 init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec ); 1892 remaining = trip - init; 1893 if (remaining <= 0) { 1894 status = 0; // all iterations got by other threads 1895 } else { 1896 // got some iterations to work on 1897 status = 1; 1898 if ( (T)remaining > chunkspec ) { 1899 limit = init + chunkspec - 1; 1900 } else { 1901 last = 1; // the last chunk 1902 limit = init + remaining - 1; 1903 } // if 1904 } // if 1905 break; 1906 } // if 1907 limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc 1908 if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) { 1909 // CAS was successful, chunk obtained 1910 status = 1; 1911 --limit; 1912 break; 1913 } // if 1914 } // while 1915 if ( status != 0 ) { 1916 start = pr->u.p.lb; 1917 incr = pr->u.p.st; 1918 if ( p_st != NULL ) 1919 *p_st = incr; 1920 *p_lb = start + init * incr; 1921 *p_ub = start + limit * incr; 1922 if ( pr->ordered ) { 1923 pr->u.p.ordered_lower = init; 1924 pr->u.p.ordered_upper = limit; 1925 #ifdef KMP_DEBUG 1926 { 1927 const char * buff; 1928 // create format specifiers before the debug output 1929 buff = __kmp_str_format( 1930 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1931 traits_t< UT >::spec, traits_t< UT >::spec ); 1932 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1933 __kmp_str_free( &buff ); 1934 } 1935 #endif 1936 } // if 1937 } else { 1938 *p_lb = 0; 1939 *p_ub = 0; 1940 if ( p_st != NULL ) 1941 *p_st = 0; 1942 } // if 1943 } // case 1944 break; 1945 1946 case kmp_sch_guided_analytical_chunked: 1947 { 1948 T chunkspec = pr->u.p.parm1; 1949 UT chunkIdx; 1950 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1951 /* for storing original FPCW value for Windows* OS on 1952 IA-32 architecture 8-byte version */ 1953 unsigned int oldFpcw; 1954 unsigned int fpcwSet = 0; 1955 #endif 1956 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n", 1957 gtid ) ); 1958 1959 trip = pr->u.p.tc; 1960 1961 KMP_DEBUG_ASSERT(team->t.t_nproc > 1); 1962 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip); 1963 1964 while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */ 1965 chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration ); 1966 if ( chunkIdx >= (UT)pr->u.p.parm2 ) { 1967 --trip; 1968 /* use dynamic-style scheduling */ 1969 init = chunkIdx * chunkspec + pr->u.p.count; 1970 /* need to verify init > 0 in case of overflow in the above calculation */ 1971 if ( (status = (init > 0 && init <= trip)) != 0 ) { 1972 limit = init + chunkspec -1; 1973 1974 if ( (last = (limit >= trip)) != 0 ) 1975 limit = trip; 1976 } 1977 break; 1978 } else { 1979 /* use exponential-style scheduling */ 1980 /* The following check is to workaround the lack of long double precision on Windows* OS. 1981 This check works around the possible effect that init != 0 for chunkIdx == 0. 1982 */ 1983 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1984 /* If we haven't already done so, save original 1985 FPCW and set precision to 64-bit, as Windows* OS 1986 on IA-32 architecture defaults to 53-bit */ 1987 if ( !fpcwSet ) { 1988 oldFpcw = _control87(0,0); 1989 _control87(_PC_64,_MCW_PC); 1990 fpcwSet = 0x30000; 1991 } 1992 #endif 1993 if ( chunkIdx ) { 1994 init = __kmp_dispatch_guided_remaining< T >( 1995 trip, *( DBL * )&pr->u.p.parm3, chunkIdx ); 1996 KMP_DEBUG_ASSERT(init); 1997 init = trip - init; 1998 } else 1999 init = 0; 2000 limit = trip - __kmp_dispatch_guided_remaining< T >( 2001 trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 ); 2002 KMP_ASSERT(init <= limit); 2003 if ( init < limit ) { 2004 KMP_DEBUG_ASSERT(limit <= trip); 2005 --limit; 2006 status = 1; 2007 break; 2008 } // if 2009 } // if 2010 } // while (1) 2011 #if KMP_OS_WINDOWS && KMP_ARCH_X86 2012 /* restore FPCW if necessary 2013 AC: check fpcwSet flag first because oldFpcw can be uninitialized here 2014 */ 2015 if ( fpcwSet && ( oldFpcw & fpcwSet ) ) 2016 _control87(oldFpcw,_MCW_PC); 2017 #endif 2018 if ( status != 0 ) { 2019 start = pr->u.p.lb; 2020 incr = pr->u.p.st; 2021 if ( p_st != NULL ) 2022 *p_st = incr; 2023 *p_lb = start + init * incr; 2024 *p_ub = start + limit * incr; 2025 if ( pr->ordered ) { 2026 pr->u.p.ordered_lower = init; 2027 pr->u.p.ordered_upper = limit; 2028 #ifdef KMP_DEBUG 2029 { 2030 const char * buff; 2031 // create format specifiers before the debug output 2032 buff = __kmp_str_format( 2033 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 2034 traits_t< UT >::spec, traits_t< UT >::spec ); 2035 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 2036 __kmp_str_free( &buff ); 2037 } 2038 #endif 2039 } 2040 } else { 2041 *p_lb = 0; 2042 *p_ub = 0; 2043 if ( p_st != NULL ) 2044 *p_st = 0; 2045 } 2046 } // case 2047 break; 2048 2049 case kmp_sch_trapezoidal: 2050 { 2051 UT index; 2052 T parm2 = pr->u.p.parm2; 2053 T parm3 = pr->u.p.parm3; 2054 T parm4 = pr->u.p.parm4; 2055 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n", 2056 gtid ) ); 2057 2058 index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration ); 2059 2060 init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2; 2061 trip = pr->u.p.tc - 1; 2062 2063 if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) { 2064 *p_lb = 0; 2065 *p_ub = 0; 2066 if ( p_st != NULL ) *p_st = 0; 2067 } else { 2068 start = pr->u.p.lb; 2069 limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1; 2070 incr = pr->u.p.st; 2071 2072 if ( (last = (limit >= trip)) != 0 ) 2073 limit = trip; 2074 2075 if ( p_st != NULL ) *p_st = incr; 2076 2077 if ( incr == 1 ) { 2078 *p_lb = start + init; 2079 *p_ub = start + limit; 2080 } else { 2081 *p_lb = start + init * incr; 2082 *p_ub = start + limit * incr; 2083 } 2084 2085 if ( pr->ordered ) { 2086 pr->u.p.ordered_lower = init; 2087 pr->u.p.ordered_upper = limit; 2088 #ifdef KMP_DEBUG 2089 { 2090 const char * buff; 2091 // create format specifiers before the debug output 2092 buff = __kmp_str_format( 2093 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 2094 traits_t< UT >::spec, traits_t< UT >::spec ); 2095 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 2096 __kmp_str_free( &buff ); 2097 } 2098 #endif 2099 } // if 2100 } // if 2101 } // case 2102 break; 2103 default: 2104 { 2105 status = 0; // to avoid complaints on uninitialized variable use 2106 __kmp_msg( 2107 kmp_ms_fatal, // Severity 2108 KMP_MSG( UnknownSchedTypeDetected ), // Primary message 2109 KMP_HNT( GetNewerLibrary ), // Hint 2110 __kmp_msg_null // Variadic argument list terminator 2111 ); 2112 } 2113 break; 2114 } // switch 2115 } // if tc == 0; 2116 2117 if ( status == 0 ) { 2118 UT num_done; 2119 2120 num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done ); 2121 #ifdef KMP_DEBUG 2122 { 2123 const char * buff; 2124 // create format specifiers before the debug output 2125 buff = __kmp_str_format( 2126 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", 2127 traits_t< UT >::spec ); 2128 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) ); 2129 __kmp_str_free( &buff ); 2130 } 2131 #endif 2132 2133 if ( (ST)num_done == team->t.t_nproc-1 ) { 2134 /* NOTE: release this buffer to be reused */ 2135 2136 KMP_MB(); /* Flush all pending memory write invalidates. */ 2137 2138 sh->u.s.num_done = 0; 2139 sh->u.s.iteration = 0; 2140 2141 /* TODO replace with general release procedure? */ 2142 if ( pr->ordered ) { 2143 sh->u.s.ordered_iteration = 0; 2144 } 2145 2146 KMP_MB(); /* Flush all pending memory write invalidates. */ 2147 2148 sh -> buffer_index += KMP_MAX_DISP_BUF; 2149 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", 2150 gtid, sh->buffer_index) ); 2151 2152 KMP_MB(); /* Flush all pending memory write invalidates. */ 2153 2154 } // if 2155 if ( __kmp_env_consistency_check ) { 2156 if ( pr->pushed_ws != ct_none ) { 2157 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 2158 } 2159 } 2160 2161 th -> th.th_dispatch -> th_deo_fcn = NULL; 2162 th -> th.th_dispatch -> th_dxo_fcn = NULL; 2163 th -> th.th_dispatch -> th_dispatch_sh_current = NULL; 2164 th -> th.th_dispatch -> th_dispatch_pr_current = NULL; 2165 } // if (status == 0) 2166 #if KMP_OS_WINDOWS 2167 else if ( last ) { 2168 pr->u.p.last_upper = pr->u.p.ub; 2169 } 2170 #endif /* KMP_OS_WINDOWS */ 2171 if ( p_last != NULL && status != 0 ) 2172 *p_last = last; 2173 } // if 2174 2175 #ifdef KMP_DEBUG 2176 { 2177 const char * buff; 2178 // create format specifiers before the debug output 2179 buff = __kmp_str_format( 2180 "__kmp_dispatch_next: T#%%d normal case: " \ 2181 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n", 2182 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 2183 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) ); 2184 __kmp_str_free( &buff ); 2185 } 2186 #endif 2187 #if INCLUDE_SSC_MARKS 2188 SSC_MARK_DISPATCH_NEXT(); 2189 #endif 2190 OMPT_LOOP_END; 2191 return status; 2192 } 2193 2194 template< typename T > 2195 static void 2196 __kmp_dist_get_bounds( 2197 ident_t *loc, 2198 kmp_int32 gtid, 2199 kmp_int32 *plastiter, 2200 T *plower, 2201 T *pupper, 2202 typename traits_t< T >::signed_t incr 2203 ) { 2204 typedef typename traits_t< T >::unsigned_t UT; 2205 typedef typename traits_t< T >::signed_t ST; 2206 register kmp_uint32 team_id; 2207 register kmp_uint32 nteams; 2208 register UT trip_count; 2209 register kmp_team_t *team; 2210 kmp_info_t * th; 2211 2212 KMP_DEBUG_ASSERT( plastiter && plower && pupper ); 2213 KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid)); 2214 #ifdef KMP_DEBUG 2215 { 2216 const char * buff; 2217 // create format specifiers before the debug output 2218 buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\ 2219 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n", 2220 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec, 2221 traits_t< T >::spec ); 2222 KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) ); 2223 __kmp_str_free( &buff ); 2224 } 2225 #endif 2226 2227 if( __kmp_env_consistency_check ) { 2228 if( incr == 0 ) { 2229 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc ); 2230 } 2231 if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) { 2232 // The loop is illegal. 2233 // Some zero-trip loops maintained by compiler, e.g.: 2234 // for(i=10;i<0;++i) // lower >= upper - run-time check 2235 // for(i=0;i>10;--i) // lower <= upper - run-time check 2236 // for(i=0;i>10;++i) // incr > 0 - compile-time check 2237 // for(i=10;i<0;--i) // incr < 0 - compile-time check 2238 // Compiler does not check the following illegal loops: 2239 // for(i=0;i<10;i+=incr) // where incr<0 2240 // for(i=10;i>0;i-=incr) // where incr<0 2241 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc ); 2242 } 2243 } 2244 th = __kmp_threads[gtid]; 2245 team = th->th.th_team; 2246 #if OMP_40_ENABLED 2247 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct 2248 nteams = th->th.th_teams_size.nteams; 2249 #endif 2250 team_id = team->t.t_master_tid; 2251 KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc); 2252 2253 // compute global trip count 2254 if( incr == 1 ) { 2255 trip_count = *pupper - *plower + 1; 2256 } else if(incr == -1) { 2257 trip_count = *plower - *pupper + 1; 2258 } else { 2259 trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case 2260 } 2261 2262 if( trip_count <= nteams ) { 2263 KMP_DEBUG_ASSERT( 2264 __kmp_static == kmp_sch_static_greedy || \ 2265 __kmp_static == kmp_sch_static_balanced 2266 ); // Unknown static scheduling type. 2267 // only some teams get single iteration, others get nothing 2268 if( team_id < trip_count ) { 2269 *pupper = *plower = *plower + team_id * incr; 2270 } else { 2271 *plower = *pupper + incr; // zero-trip loop 2272 } 2273 if( plastiter != NULL ) 2274 *plastiter = ( team_id == trip_count - 1 ); 2275 } else { 2276 if( __kmp_static == kmp_sch_static_balanced ) { 2277 register UT chunk = trip_count / nteams; 2278 register UT extras = trip_count % nteams; 2279 *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) ); 2280 *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr ); 2281 if( plastiter != NULL ) 2282 *plastiter = ( team_id == nteams - 1 ); 2283 } else { 2284 register T chunk_inc_count = 2285 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr; 2286 register T upper = *pupper; 2287 KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy ); 2288 // Unknown static scheduling type. 2289 *plower += team_id * chunk_inc_count; 2290 *pupper = *plower + chunk_inc_count - incr; 2291 // Check/correct bounds if needed 2292 if( incr > 0 ) { 2293 if( *pupper < *plower ) 2294 *pupper = i_maxmin< T >::mx; 2295 if( plastiter != NULL ) 2296 *plastiter = *plower <= upper && *pupper > upper - incr; 2297 if( *pupper > upper ) 2298 *pupper = upper; // tracker C73258 2299 } else { 2300 if( *pupper > *plower ) 2301 *pupper = i_maxmin< T >::mn; 2302 if( plastiter != NULL ) 2303 *plastiter = *plower >= upper && *pupper < upper - incr; 2304 if( *pupper < upper ) 2305 *pupper = upper; // tracker C73258 2306 } 2307 } 2308 } 2309 } 2310 2311 //----------------------------------------------------------------------------------------- 2312 // Dispatch routines 2313 // Transfer call to template< type T > 2314 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule, 2315 // T lb, T ub, ST st, ST chunk ) 2316 extern "C" { 2317 2318 /*! 2319 @ingroup WORK_SHARING 2320 @{ 2321 @param loc Source location 2322 @param gtid Global thread id 2323 @param schedule Schedule type 2324 @param lb Lower bound 2325 @param ub Upper bound 2326 @param st Step (or increment if you prefer) 2327 @param chunk The chunk size to block with 2328 2329 This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments. 2330 These functions are all identical apart from the types of the arguments. 2331 */ 2332 2333 void 2334 __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2335 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk ) 2336 { 2337 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2338 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2339 } 2340 /*! 2341 See @ref __kmpc_dispatch_init_4 2342 */ 2343 void 2344 __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2345 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk ) 2346 { 2347 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2348 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2349 } 2350 2351 /*! 2352 See @ref __kmpc_dispatch_init_4 2353 */ 2354 void 2355 __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2356 kmp_int64 lb, kmp_int64 ub, 2357 kmp_int64 st, kmp_int64 chunk ) 2358 { 2359 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2360 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2361 } 2362 2363 /*! 2364 See @ref __kmpc_dispatch_init_4 2365 */ 2366 void 2367 __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2368 kmp_uint64 lb, kmp_uint64 ub, 2369 kmp_int64 st, kmp_int64 chunk ) 2370 { 2371 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2372 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2373 } 2374 2375 /*! 2376 See @ref __kmpc_dispatch_init_4 2377 2378 Difference from __kmpc_dispatch_init set of functions is these functions 2379 are called for composite distribute parallel for construct. Thus before 2380 regular iterations dispatching we need to calc per-team iteration space. 2381 2382 These functions are all identical apart from the types of the arguments. 2383 */ 2384 void 2385 __kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2386 kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk ) 2387 { 2388 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2389 __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st ); 2390 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2391 } 2392 2393 void 2394 __kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2395 kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk ) 2396 { 2397 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2398 __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st ); 2399 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2400 } 2401 2402 void 2403 __kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2404 kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk ) 2405 { 2406 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2407 __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st ); 2408 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2409 } 2410 2411 void 2412 __kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2413 kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk ) 2414 { 2415 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2416 __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st ); 2417 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2418 } 2419 2420 /*! 2421 @param loc Source code location 2422 @param gtid Global thread id 2423 @param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise 2424 @param p_lb Pointer to the lower bound for the next chunk of work 2425 @param p_ub Pointer to the upper bound for the next chunk of work 2426 @param p_st Pointer to the stride for the next chunk of work 2427 @return one if there is work to be done, zero otherwise 2428 2429 Get the next dynamically allocated chunk of work for this thread. 2430 If there is no more work, then the lb,ub and stride need not be modified. 2431 */ 2432 int 2433 __kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2434 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st ) 2435 { 2436 return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st ); 2437 } 2438 2439 /*! 2440 See @ref __kmpc_dispatch_next_4 2441 */ 2442 int 2443 __kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2444 kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st ) 2445 { 2446 return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st ); 2447 } 2448 2449 /*! 2450 See @ref __kmpc_dispatch_next_4 2451 */ 2452 int 2453 __kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2454 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st ) 2455 { 2456 return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st ); 2457 } 2458 2459 /*! 2460 See @ref __kmpc_dispatch_next_4 2461 */ 2462 int 2463 __kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2464 kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st ) 2465 { 2466 return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st ); 2467 } 2468 2469 /*! 2470 @param loc Source code location 2471 @param gtid Global thread id 2472 2473 Mark the end of a dynamic loop. 2474 */ 2475 void 2476 __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid ) 2477 { 2478 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc ); 2479 } 2480 2481 /*! 2482 See @ref __kmpc_dispatch_fini_4 2483 */ 2484 void 2485 __kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid ) 2486 { 2487 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc ); 2488 } 2489 2490 /*! 2491 See @ref __kmpc_dispatch_fini_4 2492 */ 2493 void 2494 __kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid ) 2495 { 2496 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc ); 2497 } 2498 2499 /*! 2500 See @ref __kmpc_dispatch_fini_4 2501 */ 2502 void 2503 __kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid ) 2504 { 2505 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc ); 2506 } 2507 /*! @} */ 2508 2509 //----------------------------------------------------------------------------------------- 2510 //Non-template routines from kmp_dispatch.c used in other sources 2511 2512 kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) { 2513 return value == checker; 2514 } 2515 2516 kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) { 2517 return value != checker; 2518 } 2519 2520 kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) { 2521 return value < checker; 2522 } 2523 2524 kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) { 2525 return value >= checker; 2526 } 2527 2528 kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) { 2529 return value <= checker; 2530 } 2531 2532 kmp_uint32 2533 __kmp_wait_yield_4(volatile kmp_uint32 * spinner, 2534 kmp_uint32 checker, 2535 kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 ) 2536 , void * obj // Higher-level synchronization object, or NULL. 2537 ) 2538 { 2539 // note: we may not belong to a team at this point 2540 register volatile kmp_uint32 * spin = spinner; 2541 register kmp_uint32 check = checker; 2542 register kmp_uint32 spins; 2543 register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred; 2544 register kmp_uint32 r; 2545 2546 KMP_FSYNC_SPIN_INIT( obj, (void*) spin ); 2547 KMP_INIT_YIELD( spins ); 2548 // main wait spin loop 2549 while(!f(r = TCR_4(*spin), check)) { 2550 KMP_FSYNC_SPIN_PREPARE( obj ); 2551 /* GEH - remove this since it was accidentally introduced when kmp_wait was split. 2552 It causes problems with infinite recursion because of exit lock */ 2553 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 2554 __kmp_abort_thread(); */ 2555 2556 /* if we have waited a bit, or are oversubscribed, yield */ 2557 /* pause is in the following code */ 2558 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc ); 2559 KMP_YIELD_SPIN( spins ); 2560 } 2561 KMP_FSYNC_SPIN_ACQUIRED( obj ); 2562 return r; 2563 } 2564 2565 } // extern "C" 2566 2567 #ifdef KMP_GOMP_COMPAT 2568 2569 void 2570 __kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2571 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, 2572 kmp_int32 chunk, int push_ws ) 2573 { 2574 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, 2575 push_ws ); 2576 } 2577 2578 void 2579 __kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2580 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, 2581 kmp_int32 chunk, int push_ws ) 2582 { 2583 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, 2584 push_ws ); 2585 } 2586 2587 void 2588 __kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2589 kmp_int64 lb, kmp_int64 ub, kmp_int64 st, 2590 kmp_int64 chunk, int push_ws ) 2591 { 2592 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, 2593 push_ws ); 2594 } 2595 2596 void 2597 __kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2598 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, 2599 kmp_int64 chunk, int push_ws ) 2600 { 2601 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, 2602 push_ws ); 2603 } 2604 2605 void 2606 __kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid ) 2607 { 2608 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc ); 2609 } 2610 2611 void 2612 __kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid ) 2613 { 2614 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc ); 2615 } 2616 2617 void 2618 __kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid ) 2619 { 2620 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc ); 2621 } 2622 2623 void 2624 __kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid ) 2625 { 2626 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc ); 2627 } 2628 2629 #endif /* KMP_GOMP_COMPAT */ 2630 2631 /* ------------------------------------------------------------------------ */ 2632 /* ------------------------------------------------------------------------ */ 2633 2634