1 /* 2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch. 3 */ 4 5 6 //===----------------------------------------------------------------------===// 7 // 8 // The LLVM Compiler Infrastructure 9 // 10 // This file is dual licensed under the MIT and the University of Illinois Open 11 // Source Licenses. See LICENSE.txt for details. 12 // 13 //===----------------------------------------------------------------------===// 14 15 16 /* 17 * Dynamic scheduling initialization and dispatch. 18 * 19 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however 20 * it may change values between parallel regions. __kmp_max_nth 21 * is the largest value __kmp_nth may take, 1 is the smallest. 22 * 23 */ 24 25 /* ------------------------------------------------------------------------ */ 26 /* ------------------------------------------------------------------------ */ 27 28 #include "kmp.h" 29 #include "kmp_i18n.h" 30 #include "kmp_itt.h" 31 #include "kmp_str.h" 32 #include "kmp_error.h" 33 #include "kmp_stats.h" 34 #if KMP_OS_WINDOWS && KMP_ARCH_X86 35 #include <float.h> 36 #endif 37 38 #if OMPT_SUPPORT 39 #include "ompt-internal.h" 40 #include "ompt-specific.h" 41 #endif 42 43 /* ------------------------------------------------------------------------ */ 44 /* ------------------------------------------------------------------------ */ 45 46 // template for type limits 47 template< typename T > 48 struct i_maxmin { 49 static const T mx; 50 static const T mn; 51 }; 52 template<> 53 struct i_maxmin< int > { 54 static const int mx = 0x7fffffff; 55 static const int mn = 0x80000000; 56 }; 57 template<> 58 struct i_maxmin< unsigned int > { 59 static const unsigned int mx = 0xffffffff; 60 static const unsigned int mn = 0x00000000; 61 }; 62 template<> 63 struct i_maxmin< long long > { 64 static const long long mx = 0x7fffffffffffffffLL; 65 static const long long mn = 0x8000000000000000LL; 66 }; 67 template<> 68 struct i_maxmin< unsigned long long > { 
69 static const unsigned long long mx = 0xffffffffffffffffLL; 70 static const unsigned long long mn = 0x0000000000000000LL; 71 }; 72 //------------------------------------------------------------------------- 73 74 #ifdef KMP_STATIC_STEAL_ENABLED 75 76 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types 77 template< typename T > 78 struct dispatch_private_infoXX_template { 79 typedef typename traits_t< T >::unsigned_t UT; 80 typedef typename traits_t< T >::signed_t ST; 81 UT count; // unsigned 82 T ub; 83 /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */ 84 T lb; 85 ST st; // signed 86 UT tc; // unsigned 87 T static_steal_counter; // for static_steal only; maybe better to put after ub 88 89 /* parm[1-4] are used in different ways by different scheduling algorithms */ 90 91 // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on ) 92 // a) parm3 is properly aligned and 93 // b) all parm1-4 are in the same cache line. 94 // Because of parm1-4 are used together, performance seems to be better 95 // if they are in the same line (not measured though). 
96 97 struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4 98 T parm1; 99 T parm2; 100 T parm3; 101 T parm4; 102 }; 103 104 UT ordered_lower; // unsigned 105 UT ordered_upper; // unsigned 106 #if KMP_OS_WINDOWS 107 T last_upper; 108 #endif /* KMP_OS_WINDOWS */ 109 }; 110 111 #else /* KMP_STATIC_STEAL_ENABLED */ 112 113 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types 114 template< typename T > 115 struct dispatch_private_infoXX_template { 116 typedef typename traits_t< T >::unsigned_t UT; 117 typedef typename traits_t< T >::signed_t ST; 118 T lb; 119 T ub; 120 ST st; // signed 121 UT tc; // unsigned 122 123 T parm1; 124 T parm2; 125 T parm3; 126 T parm4; 127 128 UT count; // unsigned 129 130 UT ordered_lower; // unsigned 131 UT ordered_upper; // unsigned 132 #if KMP_OS_WINDOWS 133 T last_upper; 134 #endif /* KMP_OS_WINDOWS */ 135 }; 136 137 #endif /* KMP_STATIC_STEAL_ENABLED */ 138 139 // replaces dispatch_private_info structure and dispatch_private_info_t type 140 template< typename T > 141 struct KMP_ALIGN_CACHE dispatch_private_info_template { 142 // duplicate alignment here, otherwise size of structure is not correct in our compiler 143 union KMP_ALIGN_CACHE private_info_tmpl { 144 dispatch_private_infoXX_template< T > p; 145 dispatch_private_info64_t p64; 146 } u; 147 enum sched_type schedule; /* scheduling algorithm */ 148 kmp_uint32 ordered; /* ordered clause specified */ 149 kmp_uint32 ordered_bumped; 150 kmp_int32 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making order 151 dispatch_private_info * next; /* stack of buffers for nest of serial regions */ 152 kmp_uint32 nomerge; /* don't merge iters if serialized */ 153 kmp_uint32 type_size; 154 enum cons_type pushed_ws; 155 }; 156 157 158 // replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types 159 template< typename UT > 160 struct dispatch_shared_infoXX_template { 161 /* chunk index under 
dynamic, number of idle threads under static-steal; 162 iteration index otherwise */ 163 volatile UT iteration; 164 volatile UT num_done; 165 volatile UT ordered_iteration; 166 UT ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar 167 }; 168 169 // replaces dispatch_shared_info structure and dispatch_shared_info_t type 170 template< typename UT > 171 struct dispatch_shared_info_template { 172 // we need union here to keep the structure size 173 union shared_info_tmpl { 174 dispatch_shared_infoXX_template< UT > s; 175 dispatch_shared_info64_t s64; 176 } u; 177 volatile kmp_uint32 buffer_index; 178 }; 179 180 /* ------------------------------------------------------------------------ */ 181 /* ------------------------------------------------------------------------ */ 182 183 #undef USE_TEST_LOCKS 184 185 // test_then_add template (general template should NOT be used) 186 template< typename T > 187 static __forceinline T 188 test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); }; 189 190 template<> 191 __forceinline kmp_int32 192 test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d ) 193 { 194 kmp_int32 r; 195 r = KMP_TEST_THEN_ADD32( p, d ); 196 return r; 197 } 198 199 template<> 200 __forceinline kmp_int64 201 test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d ) 202 { 203 kmp_int64 r; 204 r = KMP_TEST_THEN_ADD64( p, d ); 205 return r; 206 } 207 208 // test_then_inc_acq template (general template should NOT be used) 209 template< typename T > 210 static __forceinline T 211 test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); }; 212 213 template<> 214 __forceinline kmp_int32 215 test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p ) 216 { 217 kmp_int32 r; 218 r = KMP_TEST_THEN_INC_ACQ32( p ); 219 return r; 220 } 221 222 template<> 223 __forceinline kmp_int64 224 test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p ) 225 { 226 kmp_int64 r; 227 r = KMP_TEST_THEN_INC_ACQ64( p ); 228 return r; 229 } 
230 231 // test_then_inc template (general template should NOT be used) 232 template< typename T > 233 static __forceinline T 234 test_then_inc( volatile T *p ) { KMP_ASSERT(0); }; 235 236 template<> 237 __forceinline kmp_int32 238 test_then_inc< kmp_int32 >( volatile kmp_int32 *p ) 239 { 240 kmp_int32 r; 241 r = KMP_TEST_THEN_INC32( p ); 242 return r; 243 } 244 245 template<> 246 __forceinline kmp_int64 247 test_then_inc< kmp_int64 >( volatile kmp_int64 *p ) 248 { 249 kmp_int64 r; 250 r = KMP_TEST_THEN_INC64( p ); 251 return r; 252 } 253 254 // compare_and_swap template (general template should NOT be used) 255 template< typename T > 256 static __forceinline kmp_int32 257 compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); }; 258 259 template<> 260 __forceinline kmp_int32 261 compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s ) 262 { 263 return KMP_COMPARE_AND_STORE_REL32( p, c, s ); 264 } 265 266 template<> 267 __forceinline kmp_int32 268 compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s ) 269 { 270 return KMP_COMPARE_AND_STORE_REL64( p, c, s ); 271 } 272 273 /* 274 Spin wait loop that first does pause, then yield. 275 Waits until function returns non-zero when called with *spinner and check. 276 Does NOT put threads to sleep. 277 #if USE_ITT_BUILD 278 Arguments: 279 obj -- is higher-level synchronization object to report to ittnotify. It is used to report 280 locks consistently. For example, if lock is acquired immediately, its address is 281 reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, it lock cannot be acquired 282 immediately and lock routine calls to KMP_WAIT_YIELD(), the later should report the same 283 address, not an address of low-level spinner. 
284 #endif // USE_ITT_BUILD 285 */ 286 template< typename UT > 287 // ToDo: make inline function (move to header file for icl) 288 static UT // unsigned 4- or 8-byte type 289 __kmp_wait_yield( volatile UT * spinner, 290 UT checker, 291 kmp_uint32 (* pred)( UT, UT ) 292 USE_ITT_BUILD_ARG(void * obj) // Higher-level synchronization object, or NULL. 293 ) 294 { 295 // note: we may not belong to a team at this point 296 register volatile UT * spin = spinner; 297 register UT check = checker; 298 register kmp_uint32 spins; 299 register kmp_uint32 (*f) ( UT, UT ) = pred; 300 register UT r; 301 302 KMP_FSYNC_SPIN_INIT( obj, (void*) spin ); 303 KMP_INIT_YIELD( spins ); 304 // main wait spin loop 305 while(!f(r = *spin, check)) 306 { 307 KMP_FSYNC_SPIN_PREPARE( obj ); 308 /* GEH - remove this since it was accidentally introduced when kmp_wait was split. 309 It causes problems with infinite recursion because of exit lock */ 310 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 311 __kmp_abort_thread(); */ 312 313 // if we are oversubscribed, 314 // or have waited a bit (and KMP_LIBRARY=throughput, then yield 315 // pause is in the following code 316 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc ); 317 KMP_YIELD_SPIN( spins ); 318 } 319 KMP_FSYNC_SPIN_ACQUIRED( obj ); 320 return r; 321 } 322 323 template< typename UT > 324 static kmp_uint32 __kmp_eq( UT value, UT checker) { 325 return value == checker; 326 } 327 328 template< typename UT > 329 static kmp_uint32 __kmp_neq( UT value, UT checker) { 330 return value != checker; 331 } 332 333 template< typename UT > 334 static kmp_uint32 __kmp_lt( UT value, UT checker) { 335 return value < checker; 336 } 337 338 template< typename UT > 339 static kmp_uint32 __kmp_ge( UT value, UT checker) { 340 return value >= checker; 341 } 342 343 template< typename UT > 344 static kmp_uint32 __kmp_le( UT value, UT checker) { 345 return value <= checker; 346 } 347 348 349 /* 
------------------------------------------------------------------------ */ 350 /* ------------------------------------------------------------------------ */ 351 352 static void 353 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 354 { 355 kmp_info_t *th; 356 357 KMP_DEBUG_ASSERT( gtid_ref ); 358 359 if ( __kmp_env_consistency_check ) { 360 th = __kmp_threads[*gtid_ref]; 361 if ( th -> th.th_root -> r.r_active 362 && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) { 363 #if KMP_USE_DYNAMIC_LOCK 364 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 ); 365 #else 366 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL ); 367 #endif 368 } 369 } 370 } 371 372 template< typename UT > 373 static void 374 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 375 { 376 typedef typename traits_t< UT >::signed_t ST; 377 dispatch_private_info_template< UT > * pr; 378 379 int gtid = *gtid_ref; 380 // int cid = *cid_ref; 381 kmp_info_t *th = __kmp_threads[ gtid ]; 382 KMP_DEBUG_ASSERT( th -> th.th_dispatch ); 383 384 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) ); 385 if ( __kmp_env_consistency_check ) { 386 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 387 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 388 if ( pr -> pushed_ws != ct_none ) { 389 #if KMP_USE_DYNAMIC_LOCK 390 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 ); 391 #else 392 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL ); 393 #endif 394 } 395 } 396 397 if ( ! th -> th.th_team -> t.t_serialized ) { 398 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 399 ( th -> th.th_dispatch -> th_dispatch_sh_current ); 400 UT lower; 401 402 if ( ! 
__kmp_env_consistency_check ) { 403 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 404 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 405 } 406 lower = pr->u.p.ordered_lower; 407 408 #if ! defined( KMP_GOMP_COMPAT ) 409 if ( __kmp_env_consistency_check ) { 410 if ( pr->ordered_bumped ) { 411 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons; 412 __kmp_error_construct2( 413 kmp_i18n_msg_CnsMultipleNesting, 414 ct_ordered_in_pdo, loc_ref, 415 & p->stack_data[ p->w_top ] 416 ); 417 } 418 } 419 #endif /* !defined(KMP_GOMP_COMPAT) */ 420 421 KMP_MB(); 422 #ifdef KMP_DEBUG 423 { 424 const char * buff; 425 // create format specifiers before the debug output 426 buff = __kmp_str_format( 427 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n", 428 traits_t< UT >::spec, traits_t< UT >::spec ); 429 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 430 __kmp_str_free( &buff ); 431 } 432 #endif 433 434 __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 435 USE_ITT_BUILD_ARG( NULL ) 436 ); 437 KMP_MB(); /* is this necessary? 
*/ 438 #ifdef KMP_DEBUG 439 { 440 const char * buff; 441 // create format specifiers before the debug output 442 buff = __kmp_str_format( 443 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n", 444 traits_t< UT >::spec, traits_t< UT >::spec ); 445 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 446 __kmp_str_free( &buff ); 447 } 448 #endif 449 } 450 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) ); 451 } 452 453 static void 454 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 455 { 456 kmp_info_t *th; 457 458 if ( __kmp_env_consistency_check ) { 459 th = __kmp_threads[*gtid_ref]; 460 if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) { 461 __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref ); 462 } 463 } 464 } 465 466 template< typename UT > 467 static void 468 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 469 { 470 typedef typename traits_t< UT >::signed_t ST; 471 dispatch_private_info_template< UT > * pr; 472 473 int gtid = *gtid_ref; 474 // int cid = *cid_ref; 475 kmp_info_t *th = __kmp_threads[ gtid ]; 476 KMP_DEBUG_ASSERT( th -> th.th_dispatch ); 477 478 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) ); 479 if ( __kmp_env_consistency_check ) { 480 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 481 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 482 if ( pr -> pushed_ws != ct_none ) { 483 __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref ); 484 } 485 } 486 487 if ( ! th -> th.th_team -> t.t_serialized ) { 488 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 489 ( th -> th.th_dispatch -> th_dispatch_sh_current ); 490 491 if ( ! __kmp_env_consistency_check ) { 492 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 493 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 494 } 495 496 KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration ); 497 #if ! 
defined( KMP_GOMP_COMPAT ) 498 if ( __kmp_env_consistency_check ) { 499 if ( pr->ordered_bumped != 0 ) { 500 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons; 501 /* How to test it? - OM */ 502 __kmp_error_construct2( 503 kmp_i18n_msg_CnsMultipleNesting, 504 ct_ordered_in_pdo, loc_ref, 505 & p->stack_data[ p->w_top ] 506 ); 507 } 508 } 509 #endif /* !defined(KMP_GOMP_COMPAT) */ 510 511 KMP_MB(); /* Flush all pending memory write invalidates. */ 512 513 pr->ordered_bumped += 1; 514 515 KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n", 516 gtid, pr->ordered_bumped ) ); 517 518 KMP_MB(); /* Flush all pending memory write invalidates. */ 519 520 /* TODO use general release procedure? */ 521 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration ); 522 523 KMP_MB(); /* Flush all pending memory write invalidates. */ 524 } 525 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) ); 526 } 527 528 /* Computes and returns x to the power of y, where y must a non-negative integer */ 529 template< typename UT > 530 static __forceinline long double 531 __kmp_pow(long double x, UT y) { 532 long double s=1.0L; 533 534 KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0); 535 //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned 536 while(y) { 537 if ( y & 1 ) 538 s *= x; 539 x *= x; 540 y >>= 1; 541 } 542 return s; 543 } 544 545 /* Computes and returns the number of unassigned iterations after idx chunks have been assigned 546 (the total number of unassigned iterations in chunks with index greater than or equal to idx). 
547 __forceinline seems to be broken so that if we __forceinline this function, the behavior is wrong 548 (one of the unit tests, sch_guided_analytical_basic.cpp, fails) 549 */ 550 template< typename T > 551 static __inline typename traits_t< T >::unsigned_t 552 __kmp_dispatch_guided_remaining( 553 T tc, 554 typename traits_t< T >::floating_t base, 555 typename traits_t< T >::unsigned_t idx 556 ) { 557 /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at 558 least for ICL 8.1, long double arithmetic may not really have 559 long double precision, even with /Qlong_double. Currently, we 560 workaround that in the caller code, by manipulating the FPCW for 561 Windows* OS on IA-32 architecture. The lack of precision is not 562 expected to be a correctness issue, though. 563 */ 564 typedef typename traits_t< T >::unsigned_t UT; 565 566 long double x = tc * __kmp_pow< UT >(base, idx); 567 UT r = (UT) x; 568 if ( x == r ) 569 return r; 570 return r + 1; 571 } 572 573 // Parameters of the guided-iterative algorithm: 574 // p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic 575 // p3 = 1 / ( n * nproc ) // remaining iterations multiplier 576 // by default n = 2. For example with n = 3 the chunks distribution will be more flat. 577 // With n = 1 first chunk is the same as for static schedule, e.g. trip / nproc. 
578 static int guided_int_param = 2; 579 static double guided_flt_param = 0.5;// = 1.0 / guided_int_param; 580 581 // UT - unsigned flavor of T, ST - signed flavor of T, 582 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8 583 template< typename T > 584 static void 585 __kmp_dispatch_init( 586 ident_t * loc, 587 int gtid, 588 enum sched_type schedule, 589 T lb, 590 T ub, 591 typename traits_t< T >::signed_t st, 592 typename traits_t< T >::signed_t chunk, 593 int push_ws 594 ) { 595 typedef typename traits_t< T >::unsigned_t UT; 596 typedef typename traits_t< T >::signed_t ST; 597 typedef typename traits_t< T >::floating_t DBL; 598 static const int ___kmp_size_type = sizeof( UT ); 599 600 int active; 601 T tc; 602 kmp_info_t * th; 603 kmp_team_t * team; 604 kmp_uint32 my_buffer_index; 605 dispatch_private_info_template< T > * pr; 606 dispatch_shared_info_template< UT > volatile * sh; 607 608 KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) ); 609 KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) ); 610 611 if ( ! TCR_4( __kmp_init_parallel ) ) 612 __kmp_parallel_initialize(); 613 614 #if INCLUDE_SSC_MARKS 615 SSC_MARK_DISPATCH_INIT(); 616 #endif 617 #ifdef KMP_DEBUG 618 { 619 const char * buff; 620 // create format specifiers before the debug output 621 buff = __kmp_str_format( 622 "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n", 623 traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 624 KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) ); 625 __kmp_str_free( &buff ); 626 } 627 #endif 628 /* setup data */ 629 th = __kmp_threads[ gtid ]; 630 team = th -> th.th_team; 631 active = ! team -> t.t_serialized; 632 th->th.th_ident = loc; 633 634 #if USE_ITT_BUILD 635 kmp_uint64 cur_chunk = chunk; 636 #endif 637 if ( ! 
active ) { 638 pr = reinterpret_cast< dispatch_private_info_template< T >* > 639 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */ 640 } else { 641 KMP_DEBUG_ASSERT( th->th.th_dispatch == 642 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 643 644 my_buffer_index = th->th.th_dispatch->th_disp_index ++; 645 646 /* What happens when number of threads changes, need to resize buffer? */ 647 pr = reinterpret_cast< dispatch_private_info_template< T > * > 648 ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] ); 649 sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * > 650 ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] ); 651 } 652 653 /* Pick up the nomerge/ordered bits from the scheduling type */ 654 if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) { 655 pr->nomerge = TRUE; 656 schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); 657 } else { 658 pr->nomerge = FALSE; 659 } 660 pr->type_size = ___kmp_size_type; // remember the size of variables 661 if ( kmp_ord_lower & schedule ) { 662 pr->ordered = TRUE; 663 schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); 664 } else { 665 pr->ordered = FALSE; 666 } 667 if ( schedule == kmp_sch_static ) { 668 schedule = __kmp_static; 669 } else { 670 if ( schedule == kmp_sch_runtime ) { 671 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified) 672 schedule = team -> t.t_sched.r_sched_type; 673 // Detail the schedule if needed (global controls are differentiated appropriately) 674 if ( schedule == kmp_sch_guided_chunked ) { 675 schedule = __kmp_guided; 676 } else if ( schedule == kmp_sch_static ) { 677 schedule = __kmp_static; 678 } 679 // Use the chunk size specified by OMP_SCHEDULE (or default if not specified) 680 chunk = team -> t.t_sched.chunk; 681 682 #ifdef KMP_DEBUG 683 { 684 const char * buff; 685 // create format specifiers before 
the debug output 686 buff = __kmp_str_format( 687 "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n", 688 traits_t< ST >::spec ); 689 KD_TRACE(10, ( buff, gtid, schedule, chunk ) ); 690 __kmp_str_free( &buff ); 691 } 692 #endif 693 } else { 694 if ( schedule == kmp_sch_guided_chunked ) { 695 schedule = __kmp_guided; 696 } 697 if ( chunk <= 0 ) { 698 chunk = KMP_DEFAULT_CHUNK; 699 } 700 } 701 702 if ( schedule == kmp_sch_auto ) { 703 // mapping and differentiation: in the __kmp_do_serial_initialize() 704 schedule = __kmp_auto; 705 #ifdef KMP_DEBUG 706 { 707 const char * buff; 708 // create format specifiers before the debug output 709 buff = __kmp_str_format( 710 "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n", 711 traits_t< ST >::spec ); 712 KD_TRACE(10, ( buff, gtid, schedule, chunk ) ); 713 __kmp_str_free( &buff ); 714 } 715 #endif 716 } 717 718 /* guided analytical not safe for too many threads */ 719 if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) { 720 schedule = kmp_sch_guided_iterative_chunked; 721 KMP_WARNING( DispatchManyThreads ); 722 } 723 pr->u.p.parm1 = chunk; 724 } 725 KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper), 726 "unknown scheduling type" ); 727 728 pr->u.p.count = 0; 729 730 if ( __kmp_env_consistency_check ) { 731 if ( st == 0 ) { 732 __kmp_error_construct( 733 kmp_i18n_msg_CnsLoopIncrZeroProhibited, 734 ( pr->ordered ? 
ct_pdo_ordered : ct_pdo ), loc 735 ); 736 } 737 } 738 739 tc = ( ub - lb + st ); 740 if ( st != 1 ) { 741 if ( st < 0 ) { 742 if ( lb < ub ) { 743 tc = 0; // zero-trip 744 } else { // lb >= ub 745 tc = (ST)tc / st; // convert to signed division 746 } 747 } else { // st > 0 748 if ( ub < lb ) { 749 tc = 0; // zero-trip 750 } else { // lb >= ub 751 tc /= st; 752 } 753 } 754 } else if ( ub < lb ) { // st == 1 755 tc = 0; // zero-trip 756 } 757 758 pr->u.p.lb = lb; 759 pr->u.p.ub = ub; 760 pr->u.p.st = st; 761 pr->u.p.tc = tc; 762 763 #if KMP_OS_WINDOWS 764 pr->u.p.last_upper = ub + st; 765 #endif /* KMP_OS_WINDOWS */ 766 767 /* NOTE: only the active parallel region(s) has active ordered sections */ 768 769 if ( active ) { 770 if ( pr->ordered == 0 ) { 771 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error; 772 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error; 773 } else { 774 pr->ordered_bumped = 0; 775 776 pr->u.p.ordered_lower = 1; 777 pr->u.p.ordered_upper = 0; 778 779 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >; 780 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >; 781 } 782 } 783 784 if ( __kmp_env_consistency_check ) { 785 enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo; 786 if ( push_ws ) { 787 __kmp_push_workshare( gtid, ws, loc ); 788 pr->pushed_ws = ws; 789 } else { 790 __kmp_check_workshare( gtid, ws, loc ); 791 pr->pushed_ws = ct_none; 792 } 793 } 794 795 switch ( schedule ) { 796 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 797 case kmp_sch_static_steal: 798 { 799 T nproc = team->t.t_nproc; 800 T ntc, init; 801 802 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) ); 803 804 ntc = (tc % chunk ? 1 : 0) + tc / chunk; 805 if ( nproc > 1 && ntc >= nproc ) { 806 T id = __kmp_tid_from_gtid(gtid); 807 T small_chunk, extras; 808 809 small_chunk = ntc / nproc; 810 extras = ntc % nproc; 811 812 init = id * small_chunk + ( id < extras ? 
id : extras ); 813 pr->u.p.count = init; 814 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 ); 815 816 pr->u.p.parm2 = lb; 817 //pr->pfields.parm3 = 0; // it's not used in static_steal 818 pr->u.p.parm4 = id; 819 pr->u.p.st = st; 820 break; 821 } else { 822 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n", 823 gtid ) ); 824 schedule = kmp_sch_static_balanced; 825 /* too few iterations: fall-through to kmp_sch_static_balanced */ 826 } // if 827 /* FALL-THROUGH to static balanced */ 828 } // case 829 #endif 830 case kmp_sch_static_balanced: 831 { 832 T nproc = team->t.t_nproc; 833 T init, limit; 834 835 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n", 836 gtid ) ); 837 838 if ( nproc > 1 ) { 839 T id = __kmp_tid_from_gtid(gtid); 840 841 if ( tc < nproc ) { 842 if ( id < tc ) { 843 init = id; 844 limit = id; 845 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ 846 } else { 847 pr->u.p.count = 1; /* means no more chunks to execute */ 848 pr->u.p.parm1 = FALSE; 849 break; 850 } 851 } else { 852 T small_chunk = tc / nproc; 853 T extras = tc % nproc; 854 init = id * small_chunk + (id < extras ? id : extras); 855 limit = init + small_chunk - (id < extras ? 
0 : 1); 856 pr->u.p.parm1 = (id == nproc - 1); 857 } 858 } else { 859 if ( tc > 0 ) { 860 init = 0; 861 limit = tc - 1; 862 pr->u.p.parm1 = TRUE; 863 } else { 864 // zero trip count 865 pr->u.p.count = 1; /* means no more chunks to execute */ 866 pr->u.p.parm1 = FALSE; 867 break; 868 } 869 } 870 #if USE_ITT_BUILD 871 // Calculate chunk for metadata report 872 if( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 ) { 873 cur_chunk = limit - init + 1; 874 } 875 #endif 876 if ( st == 1 ) { 877 pr->u.p.lb = lb + init; 878 pr->u.p.ub = lb + limit; 879 } else { 880 T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound 881 pr->u.p.lb = lb + init * st; 882 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly 883 if ( st > 0 ) { 884 pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp ); 885 } else { 886 pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp ); 887 } 888 } 889 if ( pr->ordered ) { 890 pr->u.p.ordered_lower = init; 891 pr->u.p.ordered_upper = limit; 892 } 893 break; 894 } // case 895 case kmp_sch_guided_iterative_chunked : 896 { 897 T nproc = team->t.t_nproc; 898 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid)); 899 900 if ( nproc > 1 ) { 901 if ( (2L * chunk + 1 ) * nproc >= tc ) { 902 /* chunk size too large, switch to dynamic */ 903 schedule = kmp_sch_dynamic_chunked; 904 } else { 905 // when remaining iters become less than parm2 - switch to dynamic 906 pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 ); 907 *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4 908 } 909 } else { 910 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid)); 911 schedule = kmp_sch_static_greedy; 912 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 913 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid)); 914 pr->u.p.parm1 = tc; 915 } // if 916 } // case 917 break; 
918 case kmp_sch_guided_analytical_chunked: 919 { 920 T nproc = team->t.t_nproc; 921 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid)); 922 923 if ( nproc > 1 ) { 924 if ( (2L * chunk + 1 ) * nproc >= tc ) { 925 /* chunk size too large, switch to dynamic */ 926 schedule = kmp_sch_dynamic_chunked; 927 } else { 928 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 929 DBL x; 930 931 #if KMP_OS_WINDOWS && KMP_ARCH_X86 932 /* Linux* OS already has 64-bit computation by default for 933 long double, and on Windows* OS on Intel(R) 64, 934 /Qlong_double doesn't work. On Windows* OS 935 on IA-32 architecture, we need to set precision to 936 64-bit instead of the default 53-bit. Even though long 937 double doesn't work on Windows* OS on Intel(R) 64, the 938 resulting lack of precision is not expected to impact 939 the correctness of the algorithm, but this has not been 940 mathematically proven. 941 */ 942 // save original FPCW and set precision to 64-bit, as 943 // Windows* OS on IA-32 architecture defaults to 53-bit 944 unsigned int oldFpcw = _control87(0,0); 945 _control87(_PC_64,_MCW_PC); // 0,0x30000 946 #endif 947 /* value used for comparison in solver for cross-over point */ 948 long double target = ((long double)chunk * 2 + 1) * nproc / tc; 949 950 /* crossover point--chunk indexes equal to or greater than 951 this point switch to dynamic-style scheduling */ 952 UT cross; 953 954 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 955 x = (long double)1.0 - (long double)0.5 / nproc; 956 957 #ifdef KMP_DEBUG 958 { // test natural alignment 959 struct _test_a { 960 char a; 961 union { 962 char b; 963 DBL d; 964 }; 965 } t; 966 ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; 967 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment ); 968 KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 ); 969 } 970 #endif // KMP_DEBUG 971 972 /* 
save the term in thread private dispatch structure */ 973 *(DBL*)&pr->u.p.parm3 = x; 974 975 /* solve for the crossover point to the nearest integer i for which C_i <= chunk */ 976 { 977 UT left, right, mid; 978 long double p; 979 980 /* estimate initial upper and lower bound */ 981 982 /* doesn't matter what value right is as long as it is positive, but 983 it affects performance of the solver 984 */ 985 right = 229; 986 p = __kmp_pow< UT >(x,right); 987 if ( p > target ) { 988 do{ 989 p *= p; 990 right <<= 1; 991 } while(p>target && right < (1<<27)); 992 left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */ 993 } else { 994 left = 0; 995 } 996 997 /* bisection root-finding method */ 998 while ( left + 1 < right ) { 999 mid = (left + right) / 2; 1000 if ( __kmp_pow< UT >(x,mid) > target ) { 1001 left = mid; 1002 } else { 1003 right = mid; 1004 } 1005 } // while 1006 cross = right; 1007 } 1008 /* assert sanity of computed crossover point */ 1009 KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target); 1010 1011 /* save the crossover point in thread private dispatch structure */ 1012 pr->u.p.parm2 = cross; 1013 1014 // C75803 1015 #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! 
defined( KMP_I8 ) ) 1016 #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3) 1017 #else 1018 #define GUIDED_ANALYTICAL_WORKAROUND (x) 1019 #endif 1020 /* dynamic-style scheduling offset */ 1021 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk; 1022 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1023 // restore FPCW 1024 _control87(oldFpcw,_MCW_PC); 1025 #endif 1026 } // if 1027 } else { 1028 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n", 1029 gtid ) ); 1030 schedule = kmp_sch_static_greedy; 1031 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 1032 pr->u.p.parm1 = tc; 1033 } // if 1034 } // case 1035 break; 1036 case kmp_sch_static_greedy: 1037 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid)); 1038 pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ? 1039 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc : 1040 tc; 1041 break; 1042 case kmp_sch_static_chunked : 1043 case kmp_sch_dynamic_chunked : 1044 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid)); 1045 break; 1046 case kmp_sch_trapezoidal : 1047 { 1048 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ 1049 1050 T parm1, parm2, parm3, parm4; 1051 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) ); 1052 1053 parm1 = chunk; 1054 1055 /* F : size of the first cycle */ 1056 parm2 = ( tc / (2 * team->t.t_nproc) ); 1057 1058 if ( parm2 < 1 ) { 1059 parm2 = 1; 1060 } 1061 1062 /* L : size of the last cycle. Make sure the last cycle 1063 * is not larger than the first cycle. 
1064 */ 1065 if ( parm1 < 1 ) { 1066 parm1 = 1; 1067 } else if ( parm1 > parm2 ) { 1068 parm1 = parm2; 1069 } 1070 1071 /* N : number of cycles */ 1072 parm3 = ( parm2 + parm1 ); 1073 parm3 = ( 2 * tc + parm3 - 1) / parm3; 1074 1075 if ( parm3 < 2 ) { 1076 parm3 = 2; 1077 } 1078 1079 /* sigma : decreasing incr of the trapezoid */ 1080 parm4 = ( parm3 - 1 ); 1081 parm4 = ( parm2 - parm1 ) / parm4; 1082 1083 // pointless check, because parm4 >= 0 always 1084 //if ( parm4 < 0 ) { 1085 // parm4 = 0; 1086 //} 1087 1088 pr->u.p.parm1 = parm1; 1089 pr->u.p.parm2 = parm2; 1090 pr->u.p.parm3 = parm3; 1091 pr->u.p.parm4 = parm4; 1092 } // case 1093 break; 1094 1095 default: 1096 { 1097 __kmp_msg( 1098 kmp_ms_fatal, // Severity 1099 KMP_MSG( UnknownSchedTypeDetected ), // Primary message 1100 KMP_HNT( GetNewerLibrary ), // Hint 1101 __kmp_msg_null // Variadic argument list terminator 1102 ); 1103 } 1104 break; 1105 } // switch 1106 pr->schedule = schedule; 1107 if ( active ) { 1108 /* The name of this buffer should be my_buffer_index when it's free to use it */ 1109 1110 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n", 1111 gtid, my_buffer_index, sh->buffer_index) ); 1112 __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 > 1113 USE_ITT_BUILD_ARG( NULL ) 1114 ); 1115 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are 1116 // *always* 32-bit integers. 1117 KMP_MB(); /* is this necessary? 
*/ 1118 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n", 1119 gtid, my_buffer_index, sh->buffer_index) ); 1120 1121 th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr; 1122 th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh; 1123 #if USE_ITT_BUILD 1124 if ( pr->ordered ) { 1125 __kmp_itt_ordered_init( gtid ); 1126 }; // if 1127 #endif /* USE_ITT_BUILD */ 1128 }; // if 1129 1130 #if USE_ITT_BUILD 1131 // Report loop metadata 1132 if( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 ) { 1133 kmp_uint32 tid = __kmp_tid_from_gtid( gtid ); 1134 if (KMP_MASTER_TID(tid)) { 1135 kmp_uint64 schedtype = 0; 1136 1137 switch ( schedule ) { 1138 case kmp_sch_static_chunked: 1139 case kmp_sch_static_balanced:// Chunk is calculated in the switch above 1140 break; 1141 case kmp_sch_static_greedy: 1142 cur_chunk = pr->u.p.parm1; 1143 break; 1144 case kmp_sch_dynamic_chunked: 1145 schedtype = 1; 1146 break; 1147 case kmp_sch_guided_iterative_chunked: 1148 case kmp_sch_guided_analytical_chunked: 1149 schedtype = 2; 1150 break; 1151 default: 1152 // Should we put this case under "static"? 
1153 // case kmp_sch_static_steal: 1154 schedtype = 3; 1155 break; 1156 } 1157 __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk); 1158 } 1159 } 1160 #endif /* USE_ITT_BUILD */ 1161 1162 #ifdef KMP_DEBUG 1163 { 1164 const char * buff; 1165 // create format specifiers before the debug output 1166 buff = __kmp_str_format( 1167 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \ 1168 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \ 1169 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n", 1170 traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec, 1171 traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec, 1172 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec, 1173 traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec ); 1174 KD_TRACE(10, ( buff, 1175 gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub, 1176 pr->u.p.st, pr->u.p.tc, pr->u.p.count, 1177 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1, 1178 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) ); 1179 __kmp_str_free( &buff ); 1180 } 1181 #endif 1182 #if ( KMP_STATIC_STEAL_ENABLED ) 1183 if ( ___kmp_size_type < 8 ) { 1184 // It cannot be guaranteed that after execution of a loop with some other schedule kind 1185 // all the parm3 variables will contain the same value. 1186 // Even if all parm3 will be the same, it still exists a bad case like using 0 and 1 1187 // rather than program life-time increment. 1188 // So the dedicated variable is required. The 'static_steal_counter' is used. 1189 if( schedule == kmp_sch_static_steal ) { 1190 // Other threads will inspect this variable when searching for a victim. 1191 // This is a flag showing that other threads may steal from this thread since then. 
1192 volatile T * p = &pr->u.p.static_steal_counter; 1193 *p = *p + 1; 1194 } 1195 } 1196 #endif // ( KMP_STATIC_STEAL_ENABLED && USE_STEALING ) 1197 1198 #if OMPT_SUPPORT && OMPT_TRACE 1199 if ((ompt_status == ompt_status_track_callback) && 1200 ompt_callbacks.ompt_callback(ompt_event_loop_begin)) { 1201 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 1202 ompt_task_info_t *task_info = __ompt_get_taskinfo(0); 1203 ompt_callbacks.ompt_callback(ompt_event_loop_begin)( 1204 team_info->parallel_id, task_info->task_id, team_info->microtask); 1205 } 1206 #endif 1207 } 1208 1209 /* 1210 * For ordered loops, either __kmp_dispatch_finish() should be called after 1211 * every iteration, or __kmp_dispatch_finish_chunk() should be called after 1212 * every chunk of iterations. If the ordered section(s) were not executed 1213 * for this iteration (or every iteration in this chunk), we need to set the 1214 * ordered iteration counters so that the next thread can proceed. 1215 */ 1216 template< typename UT > 1217 static void 1218 __kmp_dispatch_finish( int gtid, ident_t *loc ) 1219 { 1220 typedef typename traits_t< UT >::signed_t ST; 1221 kmp_info_t *th = __kmp_threads[ gtid ]; 1222 1223 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) ); 1224 if ( ! 
th -> th.th_team -> t.t_serialized ) { 1225 1226 dispatch_private_info_template< UT > * pr = 1227 reinterpret_cast< dispatch_private_info_template< UT >* > 1228 ( th->th.th_dispatch->th_dispatch_pr_current ); 1229 dispatch_shared_info_template< UT > volatile * sh = 1230 reinterpret_cast< dispatch_shared_info_template< UT >volatile* > 1231 ( th->th.th_dispatch->th_dispatch_sh_current ); 1232 KMP_DEBUG_ASSERT( pr ); 1233 KMP_DEBUG_ASSERT( sh ); 1234 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1235 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1236 1237 if ( pr->ordered_bumped ) { 1238 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1239 gtid ) ); 1240 pr->ordered_bumped = 0; 1241 } else { 1242 UT lower = pr->u.p.ordered_lower; 1243 1244 #ifdef KMP_DEBUG 1245 { 1246 const char * buff; 1247 // create format specifiers before the debug output 1248 buff = __kmp_str_format( 1249 "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n", 1250 traits_t< UT >::spec, traits_t< UT >::spec ); 1251 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 1252 __kmp_str_free( &buff ); 1253 } 1254 #endif 1255 1256 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 1257 USE_ITT_BUILD_ARG(NULL) 1258 ); 1259 KMP_MB(); /* is this necessary? 
*/ 1260 #ifdef KMP_DEBUG 1261 { 1262 const char * buff; 1263 // create format specifiers before the debug output 1264 buff = __kmp_str_format( 1265 "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n", 1266 traits_t< UT >::spec, traits_t< UT >::spec ); 1267 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 1268 __kmp_str_free( &buff ); 1269 } 1270 #endif 1271 1272 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration ); 1273 } // if 1274 } // if 1275 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) ); 1276 } 1277 1278 #ifdef KMP_GOMP_COMPAT 1279 1280 template< typename UT > 1281 static void 1282 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc ) 1283 { 1284 typedef typename traits_t< UT >::signed_t ST; 1285 kmp_info_t *th = __kmp_threads[ gtid ]; 1286 1287 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) ); 1288 if ( ! th -> th.th_team -> t.t_serialized ) { 1289 // int cid; 1290 dispatch_private_info_template< UT > * pr = 1291 reinterpret_cast< dispatch_private_info_template< UT >* > 1292 ( th->th.th_dispatch->th_dispatch_pr_current ); 1293 dispatch_shared_info_template< UT > volatile * sh = 1294 reinterpret_cast< dispatch_shared_info_template< UT >volatile* > 1295 ( th->th.th_dispatch->th_dispatch_sh_current ); 1296 KMP_DEBUG_ASSERT( pr ); 1297 KMP_DEBUG_ASSERT( sh ); 1298 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1299 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1300 1301 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) { 1302 UT lower = pr->u.p.ordered_lower; 1303 UT upper = pr->u.p.ordered_upper; 1304 UT inc = upper - lower + 1; 1305 1306 if ( pr->ordered_bumped == inc ) { 1307 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1308 gtid ) ); 1309 pr->ordered_bumped = 0; 1310 } else { 1311 inc -= pr->ordered_bumped; 1312 1313 #ifdef KMP_DEBUG 1314 { 1315 const char * buff; 1316 // create format specifiers before the debug output 
1317 buff = __kmp_str_format( 1318 "__kmp_dispatch_finish_chunk: T#%%d before wait: " \ 1319 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n", 1320 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec ); 1321 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) ); 1322 __kmp_str_free( &buff ); 1323 } 1324 #endif 1325 1326 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 1327 USE_ITT_BUILD_ARG(NULL) 1328 ); 1329 1330 KMP_MB(); /* is this necessary? */ 1331 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n", 1332 gtid ) ); 1333 pr->ordered_bumped = 0; 1334 //!!!!! TODO check if the inc should be unsigned, or signed??? 1335 #ifdef KMP_DEBUG 1336 { 1337 const char * buff; 1338 // create format specifiers before the debug output 1339 buff = __kmp_str_format( 1340 "__kmp_dispatch_finish_chunk: T#%%d after wait: " \ 1341 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n", 1342 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec ); 1343 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) ); 1344 __kmp_str_free( &buff ); 1345 } 1346 #endif 1347 1348 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc); 1349 } 1350 // } 1351 } 1352 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) ); 1353 } 1354 1355 #endif /* KMP_GOMP_COMPAT */ 1356 1357 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 1358 * (no more work), then tell OMPT the loop is over. In some cases 1359 * kmp_dispatch_fini() is not called. 
*/ 1360 #if OMPT_SUPPORT && OMPT_TRACE 1361 #define OMPT_LOOP_END \ 1362 if (status == 0) { \ 1363 if ((ompt_status == ompt_status_track_callback) && \ 1364 ompt_callbacks.ompt_callback(ompt_event_loop_end)) { \ 1365 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \ 1366 ompt_task_info_t *task_info = __ompt_get_taskinfo(0); \ 1367 ompt_callbacks.ompt_callback(ompt_event_loop_end)( \ 1368 team_info->parallel_id, task_info->task_id); \ 1369 } \ 1370 } 1371 #else 1372 #define OMPT_LOOP_END // no-op 1373 #endif 1374 1375 template< typename T > 1376 static int 1377 __kmp_dispatch_next( 1378 ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st 1379 ) { 1380 1381 typedef typename traits_t< T >::unsigned_t UT; 1382 typedef typename traits_t< T >::signed_t ST; 1383 typedef typename traits_t< T >::floating_t DBL; 1384 static const int ___kmp_size_type = sizeof( UT ); 1385 1386 int status; 1387 dispatch_private_info_template< T > * pr; 1388 kmp_info_t * th = __kmp_threads[ gtid ]; 1389 kmp_team_t * team = th -> th.th_team; 1390 1391 KMP_DEBUG_ASSERT( p_last && p_lb && p_ub && p_st ); // AC: these cannot be NULL 1392 #ifdef KMP_DEBUG 1393 { 1394 const char * buff; 1395 // create format specifiers before the debug output 1396 buff = __kmp_str_format( 1397 "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n", 1398 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 1399 KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? 
*p_st : 0, p_last ) ); 1400 __kmp_str_free( &buff ); 1401 } 1402 #endif 1403 1404 if ( team -> t.t_serialized ) { 1405 /* NOTE: serialize this dispatch becase we are not at the active level */ 1406 pr = reinterpret_cast< dispatch_private_info_template< T >* > 1407 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */ 1408 KMP_DEBUG_ASSERT( pr ); 1409 1410 if ( (status = (pr->u.p.tc != 0)) == 0 ) { 1411 *p_lb = 0; 1412 *p_ub = 0; 1413 // if ( p_last != NULL ) 1414 // *p_last = 0; 1415 if ( p_st != NULL ) 1416 *p_st = 0; 1417 if ( __kmp_env_consistency_check ) { 1418 if ( pr->pushed_ws != ct_none ) { 1419 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 1420 } 1421 } 1422 } else if ( pr->nomerge ) { 1423 kmp_int32 last; 1424 T start; 1425 UT limit, trip, init; 1426 ST incr; 1427 T chunk = pr->u.p.parm1; 1428 1429 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) ); 1430 1431 init = chunk * pr->u.p.count++; 1432 trip = pr->u.p.tc - 1; 1433 1434 if ( (status = (init <= trip)) == 0 ) { 1435 *p_lb = 0; 1436 *p_ub = 0; 1437 // if ( p_last != NULL ) 1438 // *p_last = 0; 1439 if ( p_st != NULL ) 1440 *p_st = 0; 1441 if ( __kmp_env_consistency_check ) { 1442 if ( pr->pushed_ws != ct_none ) { 1443 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 1444 } 1445 } 1446 } else { 1447 start = pr->u.p.lb; 1448 limit = chunk + init - 1; 1449 incr = pr->u.p.st; 1450 1451 if ( (last = (limit >= trip)) != 0 ) { 1452 limit = trip; 1453 #if KMP_OS_WINDOWS 1454 pr->u.p.last_upper = pr->u.p.ub; 1455 #endif /* KMP_OS_WINDOWS */ 1456 } 1457 if ( p_last != NULL ) 1458 *p_last = last; 1459 if ( p_st != NULL ) 1460 *p_st = incr; 1461 if ( incr == 1 ) { 1462 *p_lb = start + init; 1463 *p_ub = start + limit; 1464 } else { 1465 *p_lb = start + init * incr; 1466 *p_ub = start + limit * incr; 1467 } 1468 1469 if ( pr->ordered ) { 1470 pr->u.p.ordered_lower = init; 1471 pr->u.p.ordered_upper = limit; 1472 #ifdef KMP_DEBUG 1473 
{ 1474 const char * buff; 1475 // create format specifiers before the debug output 1476 buff = __kmp_str_format( 1477 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1478 traits_t< UT >::spec, traits_t< UT >::spec ); 1479 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1480 __kmp_str_free( &buff ); 1481 } 1482 #endif 1483 } // if 1484 } // if 1485 } else { 1486 pr->u.p.tc = 0; 1487 *p_lb = pr->u.p.lb; 1488 *p_ub = pr->u.p.ub; 1489 #if KMP_OS_WINDOWS 1490 pr->u.p.last_upper = *p_ub; 1491 #endif /* KMP_OS_WINDOWS */ 1492 if ( p_last != NULL ) 1493 *p_last = TRUE; 1494 if ( p_st != NULL ) 1495 *p_st = pr->u.p.st; 1496 } // if 1497 #ifdef KMP_DEBUG 1498 { 1499 const char * buff; 1500 // create format specifiers before the debug output 1501 buff = __kmp_str_format( 1502 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \ 1503 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", 1504 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 1505 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) ); 1506 __kmp_str_free( &buff ); 1507 } 1508 #endif 1509 #if INCLUDE_SSC_MARKS 1510 SSC_MARK_DISPATCH_NEXT(); 1511 #endif 1512 OMPT_LOOP_END; 1513 return status; 1514 } else { 1515 kmp_int32 last = 0; 1516 dispatch_shared_info_template< UT > *sh; 1517 T start; 1518 ST incr; 1519 UT limit, trip, init; 1520 1521 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1522 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1523 1524 pr = reinterpret_cast< dispatch_private_info_template< T >* > 1525 ( th->th.th_dispatch->th_dispatch_pr_current ); 1526 KMP_DEBUG_ASSERT( pr ); 1527 sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 1528 ( th->th.th_dispatch->th_dispatch_sh_current ); 1529 KMP_DEBUG_ASSERT( sh ); 1530 1531 if ( pr->u.p.tc == 0 ) { 1532 // zero trip count 1533 status = 0; 1534 } else { 1535 switch (pr->schedule) { 1536 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 1537 
case kmp_sch_static_steal: 1538 { 1539 T chunk = pr->u.p.parm1; 1540 1541 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) ); 1542 1543 trip = pr->u.p.tc - 1; 1544 1545 if ( ___kmp_size_type > 4 ) { 1546 // Other threads do not look into the data of this thread, 1547 // so it's not necessary to make volatile casting. 1548 init = ( pr->u.p.count )++; 1549 status = ( init < (UT)pr->u.p.ub ); 1550 } else { 1551 typedef union { 1552 struct { 1553 UT count; 1554 T ub; 1555 } p; 1556 kmp_int64 b; 1557 } union_i4; 1558 // All operations on 'count' or 'ub' must be combined atomically together. 1559 // stealing implemented only for 4-byte indexes 1560 { 1561 union_i4 vold, vnew; 1562 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count); 1563 vnew = vold; 1564 vnew.p.count++; 1565 while( ! KMP_COMPARE_AND_STORE_ACQ64( 1566 ( volatile kmp_int64* )&pr->u.p.count, 1567 *VOLATILE_CAST(kmp_int64 *)&vold.b, 1568 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) { 1569 KMP_CPU_PAUSE(); 1570 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count); 1571 vnew = vold; 1572 vnew.p.count++; 1573 } 1574 vnew = vold; 1575 init = vnew.p.count; 1576 status = ( init < (UT)vnew.p.ub ) ; 1577 } 1578 1579 if( !status ) { 1580 kmp_info_t **other_threads = team->t.t_threads; 1581 int while_limit = 10; 1582 int while_index = 0; 1583 1584 // TODO: algorithm of searching for a victim 1585 // should be cleaned up and measured 1586 while ( ( !status ) && ( while_limit != ++while_index ) ) { 1587 union_i4 vold, vnew; 1588 kmp_int32 remaining; // kmp_int32 because KMP_I4 only 1589 T victimIdx = pr->u.p.parm4; 1590 T oldVictimIdx = victimIdx; 1591 dispatch_private_info_template< T > * victim; 1592 1593 do { 1594 if( !victimIdx ) { 1595 victimIdx = team->t.t_nproc - 1; 1596 } else { 1597 --victimIdx; 1598 } 1599 victim = reinterpret_cast< dispatch_private_info_template< T >* > 1600 ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current ); 1601 } while ( (victim == NULL || victim == 
pr) && oldVictimIdx != victimIdx ); 1602 // TODO: think about a proper place of this test 1603 if ( ( !victim ) || 1604 ( (*( volatile T * )&victim->u.p.static_steal_counter) != 1605 (*( volatile T * )&pr->u.p.static_steal_counter) ) ) { 1606 // TODO: delay would be nice 1607 continue; 1608 // the victim is not ready yet to participate in stealing 1609 // because the victim is still in kmp_init_dispatch 1610 } 1611 if ( oldVictimIdx == victimIdx ) { 1612 break; 1613 } 1614 pr->u.p.parm4 = victimIdx; 1615 1616 while( 1 ) { 1617 vold.b = *( volatile kmp_int64 * )( &victim->u.p.count ); 1618 vnew = vold; 1619 1620 KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip ); 1621 if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) { 1622 break; 1623 } 1624 vnew.p.ub -= (remaining >> 2); 1625 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1626 #pragma warning( push ) 1627 // disable warning on pointless comparison of unsigned with 0 1628 #pragma warning( disable: 186 ) 1629 KMP_DEBUG_ASSERT(vnew.p.ub >= 0); 1630 #pragma warning( pop ) 1631 // TODO: Should this be acquire or release? 1632 if ( KMP_COMPARE_AND_STORE_ACQ64( 1633 ( volatile kmp_int64 * )&victim->u.p.count, 1634 *VOLATILE_CAST(kmp_int64 *)&vold.b, 1635 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) { 1636 status = 1; 1637 while_index = 0; 1638 // now update own count and ub 1639 #if KMP_ARCH_X86 1640 // stealing executed on non-KMP_ARCH_X86 only 1641 // Atomic 64-bit write on ia32 is 1642 // unavailable, so we do this in steps. 1643 // This code is not tested. 1644 init = vold.p.count; 1645 pr->u.p.ub = 0; 1646 pr->u.p.count = init + 1; 1647 pr->u.p.ub = vnew.p.count; 1648 #else 1649 init = vnew.p.ub; 1650 vold.p.count = init + 1; 1651 // TODO: is it safe and enough? 
1652 *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b; 1653 #endif // KMP_ARCH_X86 1654 break; 1655 } // if 1656 KMP_CPU_PAUSE(); 1657 } // while (1) 1658 } // while 1659 } // if 1660 } // if 1661 if ( !status ) { 1662 *p_lb = 0; 1663 *p_ub = 0; 1664 if ( p_st != NULL ) *p_st = 0; 1665 } else { 1666 start = pr->u.p.parm2; 1667 init *= chunk; 1668 limit = chunk + init - 1; 1669 incr = pr->u.p.st; 1670 1671 KMP_DEBUG_ASSERT(init <= trip); 1672 if ( (last = (limit >= trip)) != 0 ) 1673 limit = trip; 1674 if ( p_st != NULL ) *p_st = incr; 1675 1676 if ( incr == 1 ) { 1677 *p_lb = start + init; 1678 *p_ub = start + limit; 1679 } else { 1680 *p_lb = start + init * incr; 1681 *p_ub = start + limit * incr; 1682 } 1683 1684 if ( pr->ordered ) { 1685 pr->u.p.ordered_lower = init; 1686 pr->u.p.ordered_upper = limit; 1687 #ifdef KMP_DEBUG 1688 { 1689 const char * buff; 1690 // create format specifiers before the debug output 1691 buff = __kmp_str_format( 1692 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1693 traits_t< UT >::spec, traits_t< UT >::spec ); 1694 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1695 __kmp_str_free( &buff ); 1696 } 1697 #endif 1698 } // if 1699 } // if 1700 break; 1701 } // case 1702 #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 1703 case kmp_sch_static_balanced: 1704 { 1705 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) ); 1706 if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */ 1707 pr->u.p.count = 1; 1708 *p_lb = pr->u.p.lb; 1709 *p_ub = pr->u.p.ub; 1710 last = pr->u.p.parm1; 1711 if ( p_st != NULL ) 1712 *p_st = pr->u.p.st; 1713 } else { /* no iterations to do */ 1714 pr->u.p.lb = pr->u.p.ub + pr->u.p.st; 1715 } 1716 if ( pr->ordered ) { 1717 #ifdef KMP_DEBUG 1718 { 1719 const char * buff; 1720 // create format specifiers before the debug output 1721 buff = __kmp_str_format( 1722 "__kmp_dispatch_next: T#%%d 
ordered_lower:%%%s ordered_upper:%%%s\n", 1723 traits_t< UT >::spec, traits_t< UT >::spec ); 1724 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1725 __kmp_str_free( &buff ); 1726 } 1727 #endif 1728 } // if 1729 } // case 1730 break; 1731 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */ 1732 case kmp_sch_static_chunked: 1733 { 1734 T parm1; 1735 1736 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n", 1737 gtid ) ); 1738 parm1 = pr->u.p.parm1; 1739 1740 trip = pr->u.p.tc - 1; 1741 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid)); 1742 1743 if ( (status = (init <= trip)) != 0 ) { 1744 start = pr->u.p.lb; 1745 incr = pr->u.p.st; 1746 limit = parm1 + init - 1; 1747 1748 if ( (last = (limit >= trip)) != 0 ) 1749 limit = trip; 1750 1751 if ( p_st != NULL ) *p_st = incr; 1752 1753 pr->u.p.count += team->t.t_nproc; 1754 1755 if ( incr == 1 ) { 1756 *p_lb = start + init; 1757 *p_ub = start + limit; 1758 } 1759 else { 1760 *p_lb = start + init * incr; 1761 *p_ub = start + limit * incr; 1762 } 1763 1764 if ( pr->ordered ) { 1765 pr->u.p.ordered_lower = init; 1766 pr->u.p.ordered_upper = limit; 1767 #ifdef KMP_DEBUG 1768 { 1769 const char * buff; 1770 // create format specifiers before the debug output 1771 buff = __kmp_str_format( 1772 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1773 traits_t< UT >::spec, traits_t< UT >::spec ); 1774 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1775 __kmp_str_free( &buff ); 1776 } 1777 #endif 1778 } // if 1779 } // if 1780 } // case 1781 break; 1782 1783 case kmp_sch_dynamic_chunked: 1784 { 1785 T chunk = pr->u.p.parm1; 1786 1787 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", 1788 gtid ) ); 1789 1790 init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration ); 1791 trip = pr->u.p.tc - 1; 1792 1793 if ( (status = (init <= 
trip)) == 0 ) { 1794 *p_lb = 0; 1795 *p_ub = 0; 1796 if ( p_st != NULL ) *p_st = 0; 1797 } else { 1798 start = pr->u.p.lb; 1799 limit = chunk + init - 1; 1800 incr = pr->u.p.st; 1801 1802 if ( (last = (limit >= trip)) != 0 ) 1803 limit = trip; 1804 1805 if ( p_st != NULL ) *p_st = incr; 1806 1807 if ( incr == 1 ) { 1808 *p_lb = start + init; 1809 *p_ub = start + limit; 1810 } else { 1811 *p_lb = start + init * incr; 1812 *p_ub = start + limit * incr; 1813 } 1814 1815 if ( pr->ordered ) { 1816 pr->u.p.ordered_lower = init; 1817 pr->u.p.ordered_upper = limit; 1818 #ifdef KMP_DEBUG 1819 { 1820 const char * buff; 1821 // create format specifiers before the debug output 1822 buff = __kmp_str_format( 1823 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1824 traits_t< UT >::spec, traits_t< UT >::spec ); 1825 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1826 __kmp_str_free( &buff ); 1827 } 1828 #endif 1829 } // if 1830 } // if 1831 } // case 1832 break; 1833 1834 case kmp_sch_guided_iterative_chunked: 1835 { 1836 T chunkspec = pr->u.p.parm1; 1837 KD_TRACE(100, 1838 ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid)); 1839 trip = pr->u.p.tc; 1840 // Start atomic part of calculations 1841 while(1) { 1842 ST remaining; // signed, because can be < 0 1843 init = sh->u.s.iteration; // shared value 1844 remaining = trip - init; 1845 if ( remaining <= 0 ) { // AC: need to compare with 0 first 1846 // nothing to do, don't try atomic op 1847 status = 0; 1848 break; 1849 } 1850 if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default 1851 // use dynamic-style shcedule 1852 // atomically inrement iterations, get old value 1853 init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec ); 1854 remaining = trip - init; 1855 if (remaining <= 0) { 1856 status = 0; // all iterations got by other threads 1857 } else { 1858 // got some iterations to work on 1859 status = 1; 
1860 if ( (T)remaining > chunkspec ) { 1861 limit = init + chunkspec - 1; 1862 } else { 1863 last = 1; // the last chunk 1864 limit = init + remaining - 1; 1865 } // if 1866 } // if 1867 break; 1868 } // if 1869 limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc 1870 if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) { 1871 // CAS was successful, chunk obtained 1872 status = 1; 1873 --limit; 1874 break; 1875 } // if 1876 } // while 1877 if ( status != 0 ) { 1878 start = pr->u.p.lb; 1879 incr = pr->u.p.st; 1880 if ( p_st != NULL ) 1881 *p_st = incr; 1882 *p_lb = start + init * incr; 1883 *p_ub = start + limit * incr; 1884 if ( pr->ordered ) { 1885 pr->u.p.ordered_lower = init; 1886 pr->u.p.ordered_upper = limit; 1887 #ifdef KMP_DEBUG 1888 { 1889 const char * buff; 1890 // create format specifiers before the debug output 1891 buff = __kmp_str_format( 1892 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1893 traits_t< UT >::spec, traits_t< UT >::spec ); 1894 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1895 __kmp_str_free( &buff ); 1896 } 1897 #endif 1898 } // if 1899 } else { 1900 *p_lb = 0; 1901 *p_ub = 0; 1902 if ( p_st != NULL ) 1903 *p_st = 0; 1904 } // if 1905 } // case 1906 break; 1907 1908 case kmp_sch_guided_analytical_chunked: 1909 { 1910 T chunkspec = pr->u.p.parm1; 1911 UT chunkIdx; 1912 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1913 /* for storing original FPCW value for Windows* OS on 1914 IA-32 architecture 8-byte version */ 1915 unsigned int oldFpcw; 1916 unsigned int fpcwSet = 0; 1917 #endif 1918 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n", 1919 gtid ) ); 1920 1921 trip = pr->u.p.tc; 1922 1923 KMP_DEBUG_ASSERT(team->t.t_nproc > 1); 1924 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip); 1925 1926 while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */ 1927 
chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration ); 1928 if ( chunkIdx >= (UT)pr->u.p.parm2 ) { 1929 --trip; 1930 /* use dynamic-style scheduling */ 1931 init = chunkIdx * chunkspec + pr->u.p.count; 1932 /* need to verify init > 0 in case of overflow in the above calculation */ 1933 if ( (status = (init > 0 && init <= trip)) != 0 ) { 1934 limit = init + chunkspec -1; 1935 1936 if ( (last = (limit >= trip)) != 0 ) 1937 limit = trip; 1938 } 1939 break; 1940 } else { 1941 /* use exponential-style scheduling */ 1942 /* The following check is to workaround the lack of long double precision on Windows* OS. 1943 This check works around the possible effect that init != 0 for chunkIdx == 0. 1944 */ 1945 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1946 /* If we haven't already done so, save original 1947 FPCW and set precision to 64-bit, as Windows* OS 1948 on IA-32 architecture defaults to 53-bit */ 1949 if ( !fpcwSet ) { 1950 oldFpcw = _control87(0,0); 1951 _control87(_PC_64,_MCW_PC); 1952 fpcwSet = 0x30000; 1953 } 1954 #endif 1955 if ( chunkIdx ) { 1956 init = __kmp_dispatch_guided_remaining< T >( 1957 trip, *( DBL * )&pr->u.p.parm3, chunkIdx ); 1958 KMP_DEBUG_ASSERT(init); 1959 init = trip - init; 1960 } else 1961 init = 0; 1962 limit = trip - __kmp_dispatch_guided_remaining< T >( 1963 trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 ); 1964 KMP_ASSERT(init <= limit); 1965 if ( init < limit ) { 1966 KMP_DEBUG_ASSERT(limit <= trip); 1967 --limit; 1968 status = 1; 1969 break; 1970 } // if 1971 } // if 1972 } // while (1) 1973 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1974 /* restore FPCW if necessary 1975 AC: check fpcwSet flag first because oldFpcw can be uninitialized here 1976 */ 1977 if ( fpcwSet && ( oldFpcw & fpcwSet ) ) 1978 _control87(oldFpcw,_MCW_PC); 1979 #endif 1980 if ( status != 0 ) { 1981 start = pr->u.p.lb; 1982 incr = pr->u.p.st; 1983 if ( p_st != NULL ) 1984 *p_st = incr; 1985 *p_lb = start + init * incr; 1986 *p_ub = start + limit * incr; 1987 if ( 
pr->ordered ) { 1988 pr->u.p.ordered_lower = init; 1989 pr->u.p.ordered_upper = limit; 1990 #ifdef KMP_DEBUG 1991 { 1992 const char * buff; 1993 // create format specifiers before the debug output 1994 buff = __kmp_str_format( 1995 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1996 traits_t< UT >::spec, traits_t< UT >::spec ); 1997 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1998 __kmp_str_free( &buff ); 1999 } 2000 #endif 2001 } 2002 } else { 2003 *p_lb = 0; 2004 *p_ub = 0; 2005 if ( p_st != NULL ) 2006 *p_st = 0; 2007 } 2008 } // case 2009 break; 2010 2011 case kmp_sch_trapezoidal: 2012 { 2013 UT index; 2014 T parm2 = pr->u.p.parm2; 2015 T parm3 = pr->u.p.parm3; 2016 T parm4 = pr->u.p.parm4; 2017 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n", 2018 gtid ) ); 2019 2020 index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration ); 2021 2022 init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2; 2023 trip = pr->u.p.tc - 1; 2024 2025 if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) { 2026 *p_lb = 0; 2027 *p_ub = 0; 2028 if ( p_st != NULL ) *p_st = 0; 2029 } else { 2030 start = pr->u.p.lb; 2031 limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1; 2032 incr = pr->u.p.st; 2033 2034 if ( (last = (limit >= trip)) != 0 ) 2035 limit = trip; 2036 2037 if ( p_st != NULL ) *p_st = incr; 2038 2039 if ( incr == 1 ) { 2040 *p_lb = start + init; 2041 *p_ub = start + limit; 2042 } else { 2043 *p_lb = start + init * incr; 2044 *p_ub = start + limit * incr; 2045 } 2046 2047 if ( pr->ordered ) { 2048 pr->u.p.ordered_lower = init; 2049 pr->u.p.ordered_upper = limit; 2050 #ifdef KMP_DEBUG 2051 { 2052 const char * buff; 2053 // create format specifiers before the debug output 2054 buff = __kmp_str_format( 2055 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 2056 traits_t< UT >::spec, traits_t< UT >::spec ); 2057 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, 
pr->u.p.ordered_upper ) ); 2058 __kmp_str_free( &buff ); 2059 } 2060 #endif 2061 } // if 2062 } // if 2063 } // case 2064 break; 2065 default: 2066 { 2067 status = 0; // to avoid complaints on uninitialized variable use 2068 __kmp_msg( 2069 kmp_ms_fatal, // Severity 2070 KMP_MSG( UnknownSchedTypeDetected ), // Primary message 2071 KMP_HNT( GetNewerLibrary ), // Hint 2072 __kmp_msg_null // Variadic argument list terminator 2073 ); 2074 } 2075 break; 2076 } // switch 2077 } // if tc == 0; 2078 2079 if ( status == 0 ) { 2080 UT num_done; 2081 2082 num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done ); 2083 #ifdef KMP_DEBUG 2084 { 2085 const char * buff; 2086 // create format specifiers before the debug output 2087 buff = __kmp_str_format( 2088 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", 2089 traits_t< UT >::spec ); 2090 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) ); 2091 __kmp_str_free( &buff ); 2092 } 2093 #endif 2094 2095 if ( (ST)num_done == team->t.t_nproc-1 ) { 2096 /* NOTE: release this buffer to be reused */ 2097 2098 KMP_MB(); /* Flush all pending memory write invalidates. */ 2099 2100 sh->u.s.num_done = 0; 2101 sh->u.s.iteration = 0; 2102 2103 /* TODO replace with general release procedure? */ 2104 if ( pr->ordered ) { 2105 sh->u.s.ordered_iteration = 0; 2106 } 2107 2108 KMP_MB(); /* Flush all pending memory write invalidates. */ 2109 2110 sh -> buffer_index += KMP_MAX_DISP_BUF; 2111 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", 2112 gtid, sh->buffer_index) ); 2113 2114 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 2115 2116 } // if 2117 if ( __kmp_env_consistency_check ) { 2118 if ( pr->pushed_ws != ct_none ) { 2119 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 2120 } 2121 } 2122 2123 th -> th.th_dispatch -> th_deo_fcn = NULL; 2124 th -> th.th_dispatch -> th_dxo_fcn = NULL; 2125 th -> th.th_dispatch -> th_dispatch_sh_current = NULL; 2126 th -> th.th_dispatch -> th_dispatch_pr_current = NULL; 2127 } // if (status == 0) 2128 #if KMP_OS_WINDOWS 2129 else if ( last ) { 2130 pr->u.p.last_upper = pr->u.p.ub; 2131 } 2132 #endif /* KMP_OS_WINDOWS */ 2133 if ( p_last != NULL && status != 0 ) 2134 *p_last = last; 2135 } // if 2136 2137 #ifdef KMP_DEBUG 2138 { 2139 const char * buff; 2140 // create format specifiers before the debug output 2141 buff = __kmp_str_format( 2142 "__kmp_dispatch_next: T#%%d normal case: " \ 2143 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n", 2144 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 2145 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? 
*p_st : 0, p_last, status ) ); 2146 __kmp_str_free( &buff ); 2147 } 2148 #endif 2149 #if INCLUDE_SSC_MARKS 2150 SSC_MARK_DISPATCH_NEXT(); 2151 #endif 2152 OMPT_LOOP_END; 2153 return status; 2154 } 2155 2156 template< typename T > 2157 static void 2158 __kmp_dist_get_bounds( 2159 ident_t *loc, 2160 kmp_int32 gtid, 2161 kmp_int32 *plastiter, 2162 T *plower, 2163 T *pupper, 2164 typename traits_t< T >::signed_t incr 2165 ) { 2166 KMP_COUNT_BLOCK(OMP_DISTR_FOR_dynamic); 2167 typedef typename traits_t< T >::unsigned_t UT; 2168 typedef typename traits_t< T >::signed_t ST; 2169 register kmp_uint32 team_id; 2170 register kmp_uint32 nteams; 2171 register UT trip_count; 2172 register kmp_team_t *team; 2173 kmp_info_t * th; 2174 2175 KMP_DEBUG_ASSERT( plastiter && plower && pupper ); 2176 KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid)); 2177 #ifdef KMP_DEBUG 2178 { 2179 const char * buff; 2180 // create format specifiers before the debug output 2181 buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\ 2182 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n", 2183 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec, 2184 traits_t< T >::spec ); 2185 KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) ); 2186 __kmp_str_free( &buff ); 2187 } 2188 #endif 2189 2190 if( __kmp_env_consistency_check ) { 2191 if( incr == 0 ) { 2192 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc ); 2193 } 2194 if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) { 2195 // The loop is illegal. 
2196 // Some zero-trip loops maintained by compiler, e.g.: 2197 // for(i=10;i<0;++i) // lower >= upper - run-time check 2198 // for(i=0;i>10;--i) // lower <= upper - run-time check 2199 // for(i=0;i>10;++i) // incr > 0 - compile-time check 2200 // for(i=10;i<0;--i) // incr < 0 - compile-time check 2201 // Compiler does not check the following illegal loops: 2202 // for(i=0;i<10;i+=incr) // where incr<0 2203 // for(i=10;i>0;i-=incr) // where incr<0 2204 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc ); 2205 } 2206 } 2207 th = __kmp_threads[gtid]; 2208 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct 2209 team = th->th.th_team; 2210 #if OMP_40_ENABLED 2211 nteams = th->th.th_teams_size.nteams; 2212 #endif 2213 team_id = team->t.t_master_tid; 2214 KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc); 2215 2216 // compute global trip count 2217 if( incr == 1 ) { 2218 trip_count = *pupper - *plower + 1; 2219 } else if(incr == -1) { 2220 trip_count = *plower - *pupper + 1; 2221 } else { 2222 trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case 2223 } 2224 if( trip_count <= nteams ) { 2225 KMP_DEBUG_ASSERT( 2226 __kmp_static == kmp_sch_static_greedy || \ 2227 __kmp_static == kmp_sch_static_balanced 2228 ); // Unknown static scheduling type. 2229 // only some teams get single iteration, others get nothing 2230 if( team_id < trip_count ) { 2231 *pupper = *plower = *plower + team_id * incr; 2232 } else { 2233 *plower = *pupper + incr; // zero-trip loop 2234 } 2235 if( plastiter != NULL ) 2236 *plastiter = ( team_id == trip_count - 1 ); 2237 } else { 2238 if( __kmp_static == kmp_sch_static_balanced ) { 2239 register UT chunk = trip_count / nteams; 2240 register UT extras = trip_count % nteams; 2241 *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) ); 2242 *pupper = *plower + chunk * incr - ( team_id < extras ? 
0 : incr ); 2243 if( plastiter != NULL ) 2244 *plastiter = ( team_id == nteams - 1 ); 2245 } else { 2246 register T chunk_inc_count = 2247 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr; 2248 register T upper = *pupper; 2249 KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy ); 2250 // Unknown static scheduling type. 2251 *plower += team_id * chunk_inc_count; 2252 *pupper = *plower + chunk_inc_count - incr; 2253 // Check/correct bounds if needed 2254 if( incr > 0 ) { 2255 if( *pupper < *plower ) 2256 *pupper = i_maxmin< T >::mx; 2257 if( plastiter != NULL ) 2258 *plastiter = *plower <= upper && *pupper > upper - incr; 2259 if( *pupper > upper ) 2260 *pupper = upper; // tracker C73258 2261 } else { 2262 if( *pupper > *plower ) 2263 *pupper = i_maxmin< T >::mn; 2264 if( plastiter != NULL ) 2265 *plastiter = *plower >= upper && *pupper < upper - incr; 2266 if( *pupper < upper ) 2267 *pupper = upper; // tracker C73258 2268 } 2269 } 2270 } 2271 } 2272 2273 //----------------------------------------------------------------------------------------- 2274 // Dispatch routines 2275 // Transfer call to template< type T > 2276 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule, 2277 // T lb, T ub, ST st, ST chunk ) 2278 extern "C" { 2279 2280 /*! 2281 @ingroup WORK_SHARING 2282 @{ 2283 @param loc Source location 2284 @param gtid Global thread id 2285 @param schedule Schedule type 2286 @param lb Lower bound 2287 @param ub Upper bound 2288 @param st Step (or increment if you prefer) 2289 @param chunk The chunk size to block with 2290 2291 This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments. 2292 These functions are all identical apart from the types of the arguments. 
*/

void
__kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                        kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
{
    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    // Forward to the type-generic implementation instantiated for kmp_int32. The trailing
    // 'true' occupies the position in which the __kmp_aux_dispatch_init_* wrappers pass
    // their push_ws argument.
    __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}
/*!
See @ref __kmpc_dispatch_init_4
*/
void
__kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                        kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
{
    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    // Bounds are kmp_uint32; stride and chunk stay signed (kmp_int32).
    __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void
__kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                        kmp_int64 lb, kmp_int64 ub,
                        kmp_int64 st, kmp_int64 chunk )
{
    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    // Same as the _4 variant, instantiated for kmp_int64.
    __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void
__kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                         kmp_uint64 lb, kmp_uint64 ub,
                         kmp_int64 st, kmp_int64 chunk )
{
    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    // Bounds are kmp_uint64; stride and chunk stay signed (kmp_int64).
    __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

/*!
See @ref __kmpc_dispatch_init_4

These functions differ from the __kmpc_dispatch_init set of functions in that
they are called for the composite distribute parallel for construct. Thus,
before the regular dispatching of iterations, the per-team iteration space
has to be calculated.

These functions are all identical apart from the types of the arguments.
*/
void
__kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
    kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
{
    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    // First narrow lb/ub (local copies, passed by address) to this team's part of the
    // range; p_last receives the flag written through the plastiter parameter.
    __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
    // Then initialize regular dispatching on the narrowed range.
    __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

/*!
See @ref __kmpc_dist_dispatch_init_4
*/
void
__kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
    kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
{
    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    // Per-team bounds first, then regular dispatch initialization (kmp_uint32 bounds).
    __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
    __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

/*!
See @ref __kmpc_dist_dispatch_init_4
*/
void
__kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
    kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
{
    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    // Per-team bounds first, then regular dispatch initialization (kmp_int64 bounds).
    __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
    __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

/*!
See @ref __kmpc_dist_dispatch_init_4
*/
void
__kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
    kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
{
    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    // Per-team bounds first, then regular dispatch initialization (kmp_uint64 bounds).
    __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
    __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

/*!
@param loc Source code location
@param gtid Global thread id
@param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
@param p_lb   Pointer to the lower bound for the next chunk of work
@param p_ub   Pointer to the upper bound for the next chunk of work
@param p_st   Pointer to the stride for the next chunk of work
@return one if there is work to be done, zero otherwise

Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then the lb,ub and stride need not be modified.
*/
int
__kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                        kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
{
    // Forward to the type-generic implementation instantiated for kmp_int32.
    return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int
__kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                        kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
{
    // Bounds are kmp_uint32; the stride pointer stays signed (kmp_int32).
    return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int
__kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                        kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
{
    // Same as the _4 variant, instantiated for kmp_int64.
    return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int
__kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                        kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
{
    // Bounds are kmp_uint64; the stride pointer stays signed (kmp_int64).
    return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

/*!
@param loc Source code location
@param gtid Global thread id

Mark the end of a dynamic loop.
2444 */ 2445 void 2446 __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid ) 2447 { 2448 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc ); 2449 } 2450 2451 /*! 2452 See @ref __kmpc_dispatch_fini_4 2453 */ 2454 void 2455 __kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid ) 2456 { 2457 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc ); 2458 } 2459 2460 /*! 2461 See @ref __kmpc_dispatch_fini_4 2462 */ 2463 void 2464 __kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid ) 2465 { 2466 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc ); 2467 } 2468 2469 /*! 2470 See @ref __kmpc_dispatch_fini_4 2471 */ 2472 void 2473 __kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid ) 2474 { 2475 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc ); 2476 } 2477 /*! @} */ 2478 2479 //----------------------------------------------------------------------------------------- 2480 //Non-template routines from kmp_dispatch.c used in other sources 2481 2482 kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) { 2483 return value == checker; 2484 } 2485 2486 kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) { 2487 return value != checker; 2488 } 2489 2490 kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) { 2491 return value < checker; 2492 } 2493 2494 kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) { 2495 return value >= checker; 2496 } 2497 2498 kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) { 2499 return value <= checker; 2500 } 2501 kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) { 2502 return value == checker; 2503 } 2504 2505 kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) { 2506 return value != checker; 2507 } 2508 2509 kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) { 2510 return value < checker; 2511 } 2512 2513 kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) { 2514 return value >= checker; 2515 } 2516 2517 kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) { 2518 
return value <= checker; 2519 } 2520 2521 kmp_uint32 2522 __kmp_wait_yield_4(volatile kmp_uint32 * spinner, 2523 kmp_uint32 checker, 2524 kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 ) 2525 , void * obj // Higher-level synchronization object, or NULL. 2526 ) 2527 { 2528 // note: we may not belong to a team at this point 2529 register volatile kmp_uint32 * spin = spinner; 2530 register kmp_uint32 check = checker; 2531 register kmp_uint32 spins; 2532 register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred; 2533 register kmp_uint32 r; 2534 2535 KMP_FSYNC_SPIN_INIT( obj, (void*) spin ); 2536 KMP_INIT_YIELD( spins ); 2537 // main wait spin loop 2538 while(!f(r = TCR_4(*spin), check)) { 2539 KMP_FSYNC_SPIN_PREPARE( obj ); 2540 /* GEH - remove this since it was accidentally introduced when kmp_wait was split. 2541 It causes problems with infinite recursion because of exit lock */ 2542 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 2543 __kmp_abort_thread(); */ 2544 2545 /* if we have waited a bit, or are oversubscribed, yield */ 2546 /* pause is in the following code */ 2547 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc ); 2548 KMP_YIELD_SPIN( spins ); 2549 } 2550 KMP_FSYNC_SPIN_ACQUIRED( obj ); 2551 return r; 2552 } 2553 2554 kmp_uint64 2555 __kmp_wait_yield_8( volatile kmp_uint64 * spinner, 2556 kmp_uint64 checker, 2557 kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 ) 2558 , void * obj // Higher-level synchronization object, or NULL. 
2559 ) 2560 { 2561 // note: we may not belong to a team at this point 2562 register volatile kmp_uint64 * spin = spinner; 2563 register kmp_uint64 check = checker; 2564 register kmp_uint32 spins; 2565 register kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred; 2566 register kmp_uint64 r; 2567 2568 KMP_FSYNC_SPIN_INIT( obj, (void*) spin ); 2569 KMP_INIT_YIELD( spins ); 2570 // main wait spin loop 2571 while(!f(r = *spin, check)) 2572 { 2573 KMP_FSYNC_SPIN_PREPARE( obj ); 2574 /* GEH - remove this since it was accidentally introduced when kmp_wait was split. 2575 It causes problems with infinite recursion because of exit lock */ 2576 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 2577 __kmp_abort_thread(); */ 2578 2579 // if we are oversubscribed, 2580 // or have waited a bit (and KMP_LIBARRY=throughput, then yield 2581 // pause is in the following code 2582 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc ); 2583 KMP_YIELD_SPIN( spins ); 2584 } 2585 KMP_FSYNC_SPIN_ACQUIRED( obj ); 2586 return r; 2587 } 2588 2589 } // extern "C" 2590 2591 #ifdef KMP_GOMP_COMPAT 2592 2593 void 2594 __kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2595 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, 2596 kmp_int32 chunk, int push_ws ) 2597 { 2598 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, 2599 push_ws ); 2600 } 2601 2602 void 2603 __kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2604 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, 2605 kmp_int32 chunk, int push_ws ) 2606 { 2607 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, 2608 push_ws ); 2609 } 2610 2611 void 2612 __kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2613 kmp_int64 lb, kmp_int64 ub, kmp_int64 st, 2614 kmp_int64 chunk, int push_ws ) 2615 { 2616 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, 2617 push_ws ); 2618 } 2619 2620 void 2621 
__kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2622 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, 2623 kmp_int64 chunk, int push_ws ) 2624 { 2625 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, 2626 push_ws ); 2627 } 2628 2629 void 2630 __kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid ) 2631 { 2632 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc ); 2633 } 2634 2635 void 2636 __kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid ) 2637 { 2638 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc ); 2639 } 2640 2641 void 2642 __kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid ) 2643 { 2644 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc ); 2645 } 2646 2647 void 2648 __kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid ) 2649 { 2650 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc ); 2651 } 2652 2653 #endif /* KMP_GOMP_COMPAT */ 2654 2655 /* ------------------------------------------------------------------------ */ 2656 /* ------------------------------------------------------------------------ */ 2657 2658