1 /* 2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch. 3 */ 4 5 6 //===----------------------------------------------------------------------===// 7 // 8 // The LLVM Compiler Infrastructure 9 // 10 // This file is dual licensed under the MIT and the University of Illinois Open 11 // Source Licenses. See LICENSE.txt for details. 12 // 13 //===----------------------------------------------------------------------===// 14 15 16 /* 17 * Dynamic scheduling initialization and dispatch. 18 * 19 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however 20 * it may change values between parallel regions. __kmp_max_nth 21 * is the largest value __kmp_nth may take, 1 is the smallest. 22 * 23 */ 24 25 /* ------------------------------------------------------------------------ */ 26 /* ------------------------------------------------------------------------ */ 27 28 #include "kmp.h" 29 #include "kmp_i18n.h" 30 #include "kmp_itt.h" 31 #include "kmp_str.h" 32 #include "kmp_error.h" 33 #include "kmp_stats.h" 34 #if KMP_OS_WINDOWS && KMP_ARCH_X86 35 #include <float.h> 36 #endif 37 38 #if OMPT_SUPPORT 39 #include "ompt-internal.h" 40 #include "ompt-specific.h" 41 #endif 42 43 /* ------------------------------------------------------------------------ */ 44 /* ------------------------------------------------------------------------ */ 45 46 // template for type limits 47 template< typename T > 48 struct i_maxmin { 49 static const T mx; 50 static const T mn; 51 }; 52 template<> 53 struct i_maxmin< int > { 54 static const int mx = 0x7fffffff; 55 static const int mn = 0x80000000; 56 }; 57 template<> 58 struct i_maxmin< unsigned int > { 59 static const unsigned int mx = 0xffffffff; 60 static const unsigned int mn = 0x00000000; 61 }; 62 template<> 63 struct i_maxmin< long long > { 64 static const long long mx = 0x7fffffffffffffffLL; 65 static const long long mn = 0x8000000000000000LL; 66 }; 67 template<> 68 struct i_maxmin< unsigned long long > { 69 static const unsigned long long mx = 0xffffffffffffffffLL; 70 static const unsigned long long mn = 0x0000000000000000LL; 71 }; 72 //------------------------------------------------------------------------- 73 74 #ifdef KMP_STATIC_STEAL_ENABLED 75 76 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types 77 template< typename T > 78 struct dispatch_private_infoXX_template { 79 typedef typename traits_t< T >::unsigned_t UT; 80 typedef typename traits_t< T >::signed_t ST; 81 UT count; // unsigned 82 T ub; 83 /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */ 84 T lb; 85 ST st; // signed 86 UT tc; // unsigned 87 T static_steal_counter; // for static_steal only; maybe better to put after ub 88 89 /* parm[1-4] are used in different ways by different scheduling algorithms */ 90 91 // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on ) 92 // a) parm3 is properly aligned and 93 // b) all parm1-4 are in the same cache line. 94 // Because parm1-4 are used together, performance seems to be better 95 // if they are in the same line (not measured though).
96 97 struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4 98 T parm1; 99 T parm2; 100 T parm3; 101 T parm4; 102 }; 103 104 UT ordered_lower; // unsigned 105 UT ordered_upper; // unsigned 106 #if KMP_OS_WINDOWS 107 T last_upper; 108 #endif /* KMP_OS_WINDOWS */ 109 }; 110 111 #else /* KMP_STATIC_STEAL_ENABLED */ 112 113 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types 114 template< typename T > 115 struct dispatch_private_infoXX_template { 116 typedef typename traits_t< T >::unsigned_t UT; 117 typedef typename traits_t< T >::signed_t ST; 118 T lb; 119 T ub; 120 ST st; // signed 121 UT tc; // unsigned 122 123 T parm1; 124 T parm2; 125 T parm3; 126 T parm4; 127 128 UT count; // unsigned 129 130 UT ordered_lower; // unsigned 131 UT ordered_upper; // unsigned 132 #if KMP_OS_WINDOWS 133 T last_upper; 134 #endif /* KMP_OS_WINDOWS */ 135 }; 136 137 #endif /* KMP_STATIC_STEAL_ENABLED */ 138 139 // replaces dispatch_private_info structure and dispatch_private_info_t type 140 template< typename T > 141 struct KMP_ALIGN_CACHE dispatch_private_info_template { 142 // duplicate alignment here, otherwise size of structure is not correct in our compiler 143 union KMP_ALIGN_CACHE private_info_tmpl { 144 dispatch_private_infoXX_template< T > p; 145 dispatch_private_info64_t p64; 146 } u; 147 enum sched_type schedule; /* scheduling algorithm */ 148 kmp_uint32 ordered; /* ordered clause specified */ 149 kmp_uint32 ordered_bumped; 150 kmp_int32 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making order 151 dispatch_private_info * next; /* stack of buffers for nest of serial regions */ 152 kmp_uint32 nomerge; /* don't merge iters if serialized */ 153 kmp_uint32 type_size; 154 enum cons_type pushed_ws; 155 }; 156 157 158 // replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types 159 template< typename UT > 160 struct dispatch_shared_infoXX_template { 161 /* chunk index under dynamic, number of idle threads under static-steal; 162 iteration index otherwise */ 163 volatile UT iteration; 164 volatile UT num_done; 165 volatile UT ordered_iteration; 166 UT ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar 167 }; 168 169 // replaces dispatch_shared_info structure and dispatch_shared_info_t type 170 template< typename UT > 171 struct dispatch_shared_info_template { 172 // we need union here to keep the structure size 173 union shared_info_tmpl { 174 dispatch_shared_infoXX_template< UT > s; 175 dispatch_shared_info64_t s64; 176 } u; 177 volatile kmp_uint32 buffer_index; 178 }; 179 180 /* ------------------------------------------------------------------------ */ 181 /* ------------------------------------------------------------------------ */ 182 183 #undef USE_TEST_LOCKS 184 185 // test_then_add template (general template should NOT be used) 186 template< typename T > 187 static __forceinline T 188 test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); }; 189 190 template<> 191 __forceinline kmp_int32 192 test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d ) 193 { 194 kmp_int32 r; 195 r = KMP_TEST_THEN_ADD32( p, d ); 196 return r; 197 } 198 199 template<> 200 __forceinline kmp_int64 201 test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d ) 202 { 203 kmp_int64 r; 204 r = KMP_TEST_THEN_ADD64( p, d ); 205 return r; 206 } 207 208 // test_then_inc_acq template (general template should NOT be used) 209 template< typename T > 210 static 
__forceinline T 211 test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); }; 212 213 template<> 214 __forceinline kmp_int32 215 test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p ) 216 { 217 kmp_int32 r; 218 r = KMP_TEST_THEN_INC_ACQ32( p ); 219 return r; 220 } 221 222 template<> 223 __forceinline kmp_int64 224 test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p ) 225 { 226 kmp_int64 r; 227 r = KMP_TEST_THEN_INC_ACQ64( p ); 228 return r; 229 } 230 231 // test_then_inc template (general template should NOT be used) 232 template< typename T > 233 static __forceinline T 234 test_then_inc( volatile T *p ) { KMP_ASSERT(0); }; 235 236 template<> 237 __forceinline kmp_int32 238 test_then_inc< kmp_int32 >( volatile kmp_int32 *p ) 239 { 240 kmp_int32 r; 241 r = KMP_TEST_THEN_INC32( p ); 242 return r; 243 } 244 245 template<> 246 __forceinline kmp_int64 247 test_then_inc< kmp_int64 >( volatile kmp_int64 *p ) 248 { 249 kmp_int64 r; 250 r = KMP_TEST_THEN_INC64( p ); 251 return r; 252 } 253 254 // compare_and_swap template (general template should NOT be used) 255 template< typename T > 256 static __forceinline kmp_int32 257 compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); }; 258 259 template<> 260 __forceinline kmp_int32 261 compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s ) 262 { 263 return KMP_COMPARE_AND_STORE_REL32( p, c, s ); 264 } 265 266 template<> 267 __forceinline kmp_int32 268 compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s ) 269 { 270 return KMP_COMPARE_AND_STORE_REL64( p, c, s ); 271 } 272 273 /* 274 Spin wait loop that first does pause, then yield. 275 Waits until function returns non-zero when called with *spinner and check. 276 Does NOT put threads to sleep. 277 #if USE_ITT_BUILD 278 Arguments: 279 obj -- is a higher-level synchronization object to report to ittnotify. It is used to report 280 locks consistently. For example, if the lock is acquired immediately, its address is 281 reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired 282 immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the same 283 address, not the address of the low-level spinner. 284 #endif // USE_ITT_BUILD 285 */ 286 template< typename UT > 287 // ToDo: make inline function (move to header file for icl) 288 static UT // unsigned 4- or 8-byte type 289 __kmp_wait_yield( volatile UT * spinner, 290 UT checker, 291 kmp_uint32 (* pred)( UT, UT ) 292 USE_ITT_BUILD_ARG(void * obj) // Higher-level synchronization object, or NULL. 293 ) 294 { 295 // note: we may not belong to a team at this point 296 register volatile UT * spin = spinner; 297 register UT check = checker; 298 register kmp_uint32 spins; 299 register kmp_uint32 (*f) ( UT, UT ) = pred; 300 register UT r; 301 302 KMP_FSYNC_SPIN_INIT( obj, (void*) spin ); 303 KMP_INIT_YIELD( spins ); 304 // main wait spin loop 305 while(!f(r = *spin, check)) 306 { 307 KMP_FSYNC_SPIN_PREPARE( obj ); 308 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
309 It causes problems with infinite recursion because of exit lock */ 310 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 311 __kmp_abort_thread(); */ 312 313 // if we are oversubscribed, 314 // or have waited a bit (and KMP_LIBRARY=throughput, then yield 315 // pause is in the following code 316 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc ); 317 KMP_YIELD_SPIN( spins ); 318 } 319 KMP_FSYNC_SPIN_ACQUIRED( obj ); 320 return r; 321 } 322 323 template< typename UT > 324 static kmp_uint32 __kmp_eq( UT value, UT checker) { 325 return value == checker; 326 } 327 328 template< typename UT > 329 static kmp_uint32 __kmp_neq( UT value, UT checker) { 330 return value != checker; 331 } 332 333 template< typename UT > 334 static kmp_uint32 __kmp_lt( UT value, UT checker) { 335 return value < checker; 336 } 337 338 template< typename UT > 339 static kmp_uint32 __kmp_ge( UT value, UT checker) { 340 return value >= checker; 341 } 342 343 template< typename UT > 344 static kmp_uint32 __kmp_le( UT value, UT checker) { 345 return value <= checker; 346 } 347 348 349 /* ------------------------------------------------------------------------ */ 350 /* ------------------------------------------------------------------------ */ 351 352 static void 353 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 354 { 355 kmp_info_t *th; 356 357 KMP_DEBUG_ASSERT( gtid_ref ); 358 359 if ( __kmp_env_consistency_check ) { 360 th = __kmp_threads[*gtid_ref]; 361 if ( th -> th.th_root -> r.r_active 362 && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) { 363 #if KMP_USE_DYNAMIC_LOCK 364 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 ); 365 #else 366 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL ); 367 #endif 368 } 369 } 370 } 371 372 template< typename UT > 373 static void 374 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 375 { 376 typedef typename traits_t< UT >::signed_t ST; 377 dispatch_private_info_template< UT > * pr; 378 379 int gtid = *gtid_ref; 380 // int cid = *cid_ref; 381 kmp_info_t *th = __kmp_threads[ gtid ]; 382 KMP_DEBUG_ASSERT( th -> th.th_dispatch ); 383 384 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) ); 385 if ( __kmp_env_consistency_check ) { 386 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 387 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 388 if ( pr -> pushed_ws != ct_none ) { 389 #if KMP_USE_DYNAMIC_LOCK 390 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 ); 391 #else 392 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL ); 393 #endif 394 } 395 } 396 397 if ( ! th -> th.th_team -> t.t_serialized ) { 398 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 399 ( th -> th.th_dispatch -> th_dispatch_sh_current ); 400 UT lower; 401 402 if ( ! __kmp_env_consistency_check ) { 403 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 404 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 405 } 406 lower = pr->u.p.ordered_lower; 407 408 #if ! 
defined( KMP_GOMP_COMPAT ) 409 if ( __kmp_env_consistency_check ) { 410 if ( pr->ordered_bumped ) { 411 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons; 412 __kmp_error_construct2( 413 kmp_i18n_msg_CnsMultipleNesting, 414 ct_ordered_in_pdo, loc_ref, 415 & p->stack_data[ p->w_top ] 416 ); 417 } 418 } 419 #endif /* !defined(KMP_GOMP_COMPAT) */ 420 421 KMP_MB(); 422 #ifdef KMP_DEBUG 423 { 424 const char * buff; 425 // create format specifiers before the debug output 426 buff = __kmp_str_format( 427 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n", 428 traits_t< UT >::spec, traits_t< UT >::spec ); 429 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 430 __kmp_str_free( &buff ); 431 } 432 #endif 433 434 __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 435 USE_ITT_BUILD_ARG( NULL ) 436 ); 437 KMP_MB(); /* is this necessary? */ 438 #ifdef KMP_DEBUG 439 { 440 const char * buff; 441 // create format specifiers before the debug output 442 buff = __kmp_str_format( 443 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n", 444 traits_t< UT >::spec, traits_t< UT >::spec ); 445 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 446 __kmp_str_free( &buff ); 447 } 448 #endif 449 } 450 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) ); 451 } 452 453 static void 454 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 455 { 456 kmp_info_t *th; 457 458 if ( __kmp_env_consistency_check ) { 459 th = __kmp_threads[*gtid_ref]; 460 if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) { 461 __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref ); 462 } 463 } 464 } 465 466 template< typename UT > 467 static void 468 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 469 { 470 typedef typename traits_t< UT >::signed_t ST; 471 dispatch_private_info_template< UT > * pr; 472 473 int gtid = *gtid_ref; 474 // int cid = *cid_ref; 475 kmp_info_t *th = __kmp_threads[ gtid ]; 476 KMP_DEBUG_ASSERT( th -> th.th_dispatch ); 477 478 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) ); 479 if ( __kmp_env_consistency_check ) { 480 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 481 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 482 if ( pr -> pushed_ws != ct_none ) { 483 __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref ); 484 } 485 } 486 487 if ( ! th -> th.th_team -> t.t_serialized ) { 488 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 489 ( th -> th.th_dispatch -> th_dispatch_sh_current ); 490 491 if ( ! __kmp_env_consistency_check ) { 492 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 493 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 494 } 495 496 KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration ); 497 #if ! defined( KMP_GOMP_COMPAT ) 498 if ( __kmp_env_consistency_check ) { 499 if ( pr->ordered_bumped != 0 ) { 500 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons; 501 /* How to test it? - OM */ 502 __kmp_error_construct2( 503 kmp_i18n_msg_CnsMultipleNesting, 504 ct_ordered_in_pdo, loc_ref, 505 & p->stack_data[ p->w_top ] 506 ); 507 } 508 } 509 #endif /* !defined(KMP_GOMP_COMPAT) */ 510 511 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 512 513 pr->ordered_bumped += 1; 514 515 KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n", 516 gtid, pr->ordered_bumped ) ); 517 518 KMP_MB(); /* Flush all pending memory write invalidates. */ 519 520 /* TODO use general release procedure? */ 521 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration ); 522 523 KMP_MB(); /* Flush all pending memory write invalidates. */ 524 } 525 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) ); 526 } 527 528 /* Computes and returns x to the power of y, where y must be a non-negative integer */ 529 template< typename UT > 530 static __forceinline long double 531 __kmp_pow(long double x, UT y) { 532 long double s=1.0L; 533 534 KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0); 535 //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned 536 while(y) { 537 if ( y & 1 ) 538 s *= x; 539 x *= x; 540 y >>= 1; 541 } 542 return s; 543 } 544 545 /* Computes and returns the number of unassigned iterations after idx chunks have been assigned 546 (the total number of unassigned iterations in chunks with index greater than or equal to idx). 547 __forceinline seems to be broken, so that if we __forceinline this function, the behavior is wrong 548 (one of the unit tests, sch_guided_analytical_basic.cpp, fails) 549 */ 550 template< typename T > 551 static __inline typename traits_t< T >::unsigned_t 552 __kmp_dispatch_guided_remaining( 553 T tc, 554 typename traits_t< T >::floating_t base, 555 typename traits_t< T >::unsigned_t idx 556 ) { 557 /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at 558 least for ICL 8.1, long double arithmetic may not really have 559 long double precision, even with /Qlong_double. Currently, we 560 work around that in the caller code by manipulating the FPCW for 561 Windows* OS on IA-32 architecture. The lack of precision is not 562 expected to be a correctness issue, though. 563 */ 564 typedef typename traits_t< T >::unsigned_t UT; 565 566 long double x = tc * __kmp_pow< UT >(base, idx); 567 UT r = (UT) x; 568 if ( x == r ) 569 return r; 570 return r + 1; 571 } 572 573 // Parameters of the guided-iterative algorithm: 574 // p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic 575 // p3 = 1 / ( n * nproc ) // remaining iterations multiplier 576 // By default n = 2. For example, with n = 3 the chunk distribution will be flatter. 577 // With n = 1 the first chunk is the same as for a static schedule, i.e. trip / nproc.
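// Illustrative sketch only (not part of the runtime; the function name, its
// parameters and local variables are hypothetical): it models, in a
// self-contained way, the sequence of chunk sizes the guided-iterative
// schedule produces from the two parameters described above. In the real
// code these values are stored in parm2/parm3 by __kmp_dispatch_init() and
// consumed in __kmp_dispatch_next() on a shared iteration counter with
// atomic operations; the literals 2 and 0.5 below correspond to
// guided_int_param and guided_flt_param defined just after this sketch.
static void
__kmp_guided_iterative_sketch( unsigned int trip, unsigned int nproc, unsigned int chunk )
{
    if ( nproc == 0 )
        nproc = 1;                                            // guards for the sketch only
    if ( chunk == 0 )
        chunk = 1;
    unsigned int switch_point = 2 * nproc * ( chunk + 1 );    // p2: below this, use dynamic chunks
    double shrink = 0.5 / nproc;                              // p3: fraction of remaining handed out
    unsigned int assigned = 0;
    while ( assigned < trip ) {
        unsigned int remaining = trip - assigned;
        unsigned int piece;
        if ( remaining < switch_point ) {
            piece = remaining < chunk ? remaining : chunk;    // dynamic-style tail chunks
        } else {
            piece = (unsigned int)( remaining * shrink );     // guided piece, shrinks geometrically
        }
        assigned += piece;
        // e.g. trip=1000, nproc=4, chunk=7: pieces of ~125, ~109, ~95, ...,
        // then 7-iteration chunks once fewer than 64 iterations remain.
    }
}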
578 static int guided_int_param = 2; 579 static double guided_flt_param = 0.5;// = 1.0 / guided_int_param; 580 581 // UT - unsigned flavor of T, ST - signed flavor of T, 582 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8 583 template< typename T > 584 static void 585 __kmp_dispatch_init( 586 ident_t * loc, 587 int gtid, 588 enum sched_type schedule, 589 T lb, 590 T ub, 591 typename traits_t< T >::signed_t st, 592 typename traits_t< T >::signed_t chunk, 593 int push_ws 594 ) { 595 typedef typename traits_t< T >::unsigned_t UT; 596 typedef typename traits_t< T >::signed_t ST; 597 typedef typename traits_t< T >::floating_t DBL; 598 static const int ___kmp_size_type = sizeof( UT ); 599 600 int active; 601 T tc; 602 kmp_info_t * th; 603 kmp_team_t * team; 604 kmp_uint32 my_buffer_index; 605 dispatch_private_info_template< T > * pr; 606 dispatch_shared_info_template< UT > volatile * sh; 607 608 KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) ); 609 KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) ); 610 611 if ( ! TCR_4( __kmp_init_parallel ) ) 612 __kmp_parallel_initialize(); 613 614 #if INCLUDE_SSC_MARKS 615 SSC_MARK_DISPATCH_INIT(); 616 #endif 617 #ifdef KMP_DEBUG 618 { 619 const char * buff; 620 // create format specifiers before the debug output 621 buff = __kmp_str_format( 622 "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n", 623 traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 624 KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) ); 625 __kmp_str_free( &buff ); 626 } 627 #endif 628 /* setup data */ 629 th = __kmp_threads[ gtid ]; 630 team = th -> th.th_team; 631 active = ! team -> t.t_serialized; 632 th->th.th_ident = loc; 633 634 #if USE_ITT_BUILD 635 kmp_uint64 cur_chunk = chunk; 636 int itt_need_metadata_reporting = __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 637 KMP_MASTER_GTID(gtid) && 638 #if OMP_40_ENABLED 639 th->th.th_teams_microtask == NULL && 640 #endif 641 team->t.t_active_level == 1; 642 #endif 643 if ( ! active ) { 644 pr = reinterpret_cast< dispatch_private_info_template< T >* > 645 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */ 646 } else { 647 KMP_DEBUG_ASSERT( th->th.th_dispatch == 648 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 649 650 my_buffer_index = th->th.th_dispatch->th_disp_index ++; 651 652 /* What happens when number of threads changes, need to resize buffer? 
*/ 653 pr = reinterpret_cast< dispatch_private_info_template< T > * > 654 ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] ); 655 sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * > 656 ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] ); 657 } 658 659 /* Pick up the nomerge/ordered bits from the scheduling type */ 660 if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) { 661 pr->nomerge = TRUE; 662 schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); 663 } else { 664 pr->nomerge = FALSE; 665 } 666 pr->type_size = ___kmp_size_type; // remember the size of variables 667 if ( kmp_ord_lower & schedule ) { 668 pr->ordered = TRUE; 669 schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); 670 } else { 671 pr->ordered = FALSE; 672 } 673 if ( schedule == kmp_sch_static ) { 674 schedule = __kmp_static; 675 } else { 676 if ( schedule == kmp_sch_runtime ) { 677 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified) 678 schedule = team -> t.t_sched.r_sched_type; 679 // Detail the schedule if needed (global controls are differentiated appropriately) 680 if ( schedule == kmp_sch_guided_chunked ) { 681 schedule = __kmp_guided; 682 } else if ( schedule == kmp_sch_static ) { 683 schedule = __kmp_static; 684 } 685 // Use the chunk size specified by OMP_SCHEDULE (or default if not specified) 686 chunk = team -> t.t_sched.chunk; 687 688 #ifdef KMP_DEBUG 689 { 690 const char * buff; 691 // create format specifiers before the debug output 692 buff = __kmp_str_format( 693 "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n", 694 traits_t< ST >::spec ); 695 KD_TRACE(10, ( buff, gtid, schedule, chunk ) ); 696 __kmp_str_free( &buff ); 697 } 698 #endif 699 } else { 700 if ( schedule == kmp_sch_guided_chunked ) { 701 schedule = __kmp_guided; 702 } 703 if ( chunk <= 0 ) { 704 chunk = KMP_DEFAULT_CHUNK; 705 } 706 } 707 708 if ( schedule == kmp_sch_auto ) { 709 // mapping and differentiation: in the __kmp_do_serial_initialize() 710 schedule = __kmp_auto; 711 #ifdef KMP_DEBUG 712 { 713 const char * buff; 714 // create format specifiers before the debug output 715 buff = __kmp_str_format( 716 "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n", 717 traits_t< ST >::spec ); 718 KD_TRACE(10, ( buff, gtid, schedule, chunk ) ); 719 __kmp_str_free( &buff ); 720 } 721 #endif 722 } 723 724 /* guided analytical not safe for too many threads */ 725 if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) { 726 schedule = kmp_sch_guided_iterative_chunked; 727 KMP_WARNING( DispatchManyThreads ); 728 } 729 pr->u.p.parm1 = chunk; 730 } 731 KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper), 732 "unknown scheduling type" ); 733 734 pr->u.p.count = 0; 735 736 if ( __kmp_env_consistency_check ) { 737 if ( st == 0 ) { 738 __kmp_error_construct( 739 kmp_i18n_msg_CnsLoopIncrZeroProhibited, 740 ( pr->ordered ? 
ct_pdo_ordered : ct_pdo ), loc 741 ); 742 } 743 } 744 745 tc = ( ub - lb + st ); 746 if ( st != 1 ) { 747 if ( st < 0 ) { 748 if ( lb < ub ) { 749 tc = 0; // zero-trip 750 } else { // lb >= ub 751 tc = (ST)tc / st; // convert to signed division 752 } 753 } else { // st > 0 754 if ( ub < lb ) { 755 tc = 0; // zero-trip 756 } else { // lb >= ub 757 tc /= st; 758 } 759 } 760 } else if ( ub < lb ) { // st == 1 761 tc = 0; // zero-trip 762 } 763 764 pr->u.p.lb = lb; 765 pr->u.p.ub = ub; 766 pr->u.p.st = st; 767 pr->u.p.tc = tc; 768 769 #if KMP_OS_WINDOWS 770 pr->u.p.last_upper = ub + st; 771 #endif /* KMP_OS_WINDOWS */ 772 773 /* NOTE: only the active parallel region(s) has active ordered sections */ 774 775 if ( active ) { 776 if ( pr->ordered == 0 ) { 777 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error; 778 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error; 779 } else { 780 pr->ordered_bumped = 0; 781 782 pr->u.p.ordered_lower = 1; 783 pr->u.p.ordered_upper = 0; 784 785 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >; 786 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >; 787 } 788 } 789 790 if ( __kmp_env_consistency_check ) { 791 enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo; 792 if ( push_ws ) { 793 __kmp_push_workshare( gtid, ws, loc ); 794 pr->pushed_ws = ws; 795 } else { 796 __kmp_check_workshare( gtid, ws, loc ); 797 pr->pushed_ws = ct_none; 798 } 799 } 800 801 switch ( schedule ) { 802 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 803 case kmp_sch_static_steal: 804 { 805 T nproc = team->t.t_nproc; 806 T ntc, init; 807 808 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) ); 809 810 ntc = (tc % chunk ? 1 : 0) + tc / chunk; 811 if ( nproc > 1 && ntc >= nproc ) { 812 T id = __kmp_tid_from_gtid(gtid); 813 T small_chunk, extras; 814 815 small_chunk = ntc / nproc; 816 extras = ntc % nproc; 817 818 init = id * small_chunk + ( id < extras ? id : extras ); 819 pr->u.p.count = init; 820 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 ); 821 822 pr->u.p.parm2 = lb; 823 //pr->pfields.parm3 = 0; // it's not used in static_steal 824 pr->u.p.parm4 = id; 825 pr->u.p.st = st; 826 break; 827 } else { 828 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n", 829 gtid ) ); 830 schedule = kmp_sch_static_balanced; 831 /* too few iterations: fall-through to kmp_sch_static_balanced */ 832 } // if 833 /* FALL-THROUGH to static balanced */ 834 } // case 835 #endif 836 case kmp_sch_static_balanced: 837 { 838 T nproc = team->t.t_nproc; 839 T init, limit; 840 841 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n", 842 gtid ) ); 843 844 if ( nproc > 1 ) { 845 T id = __kmp_tid_from_gtid(gtid); 846 847 if ( tc < nproc ) { 848 if ( id < tc ) { 849 init = id; 850 limit = id; 851 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ 852 } else { 853 pr->u.p.count = 1; /* means no more chunks to execute */ 854 pr->u.p.parm1 = FALSE; 855 break; 856 } 857 } else { 858 T small_chunk = tc / nproc; 859 T extras = tc % nproc; 860 init = id * small_chunk + (id < extras ? id : extras); 861 limit = init + small_chunk - (id < extras ? 
0 : 1); 862 pr->u.p.parm1 = (id == nproc - 1); 863 } 864 } else { 865 if ( tc > 0 ) { 866 init = 0; 867 limit = tc - 1; 868 pr->u.p.parm1 = TRUE; 869 } else { 870 // zero trip count 871 pr->u.p.count = 1; /* means no more chunks to execute */ 872 pr->u.p.parm1 = FALSE; 873 break; 874 } 875 } 876 #if USE_ITT_BUILD 877 // Calculate chunk for metadata report 878 if ( itt_need_metadata_reporting ) 879 cur_chunk = limit - init + 1; 880 #endif 881 if ( st == 1 ) { 882 pr->u.p.lb = lb + init; 883 pr->u.p.ub = lb + limit; 884 } else { 885 T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound 886 pr->u.p.lb = lb + init * st; 887 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly 888 if ( st > 0 ) { 889 pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp ); 890 } else { 891 pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp ); 892 } 893 } 894 if ( pr->ordered ) { 895 pr->u.p.ordered_lower = init; 896 pr->u.p.ordered_upper = limit; 897 } 898 break; 899 } // case 900 case kmp_sch_guided_iterative_chunked : 901 { 902 T nproc = team->t.t_nproc; 903 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid)); 904 905 if ( nproc > 1 ) { 906 if ( (2L * chunk + 1 ) * nproc >= tc ) { 907 /* chunk size too large, switch to dynamic */ 908 schedule = kmp_sch_dynamic_chunked; 909 } else { 910 // when remaining iters become less than parm2 - switch to dynamic 911 pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 ); 912 *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4 913 } 914 } else { 915 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid)); 916 schedule = kmp_sch_static_greedy; 917 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 918 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid)); 919 pr->u.p.parm1 = tc; 920 } // if 921 } // case 922 break; 923 case kmp_sch_guided_analytical_chunked: 924 { 925 T nproc = team->t.t_nproc; 926 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid)); 927 928 if ( nproc > 1 ) { 929 if ( (2L * chunk + 1 ) * nproc >= tc ) { 930 /* chunk size too large, switch to dynamic */ 931 schedule = kmp_sch_dynamic_chunked; 932 } else { 933 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 934 DBL x; 935 936 #if KMP_OS_WINDOWS && KMP_ARCH_X86 937 /* Linux* OS already has 64-bit computation by default for 938 long double, and on Windows* OS on Intel(R) 64, 939 /Qlong_double doesn't work. On Windows* OS 940 on IA-32 architecture, we need to set precision to 941 64-bit instead of the default 53-bit. Even though long 942 double doesn't work on Windows* OS on Intel(R) 64, the 943 resulting lack of precision is not expected to impact 944 the correctness of the algorithm, but this has not been 945 mathematically proven. 
946 */ 947 // save original FPCW and set precision to 64-bit, as 948 // Windows* OS on IA-32 architecture defaults to 53-bit 949 unsigned int oldFpcw = _control87(0,0); 950 _control87(_PC_64,_MCW_PC); // 0,0x30000 951 #endif 952 /* value used for comparison in solver for cross-over point */ 953 long double target = ((long double)chunk * 2 + 1) * nproc / tc; 954 955 /* crossover point--chunk indexes equal to or greater than 956 this point switch to dynamic-style scheduling */ 957 UT cross; 958 959 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 960 x = (long double)1.0 - (long double)0.5 / nproc; 961 962 #ifdef KMP_DEBUG 963 { // test natural alignment 964 struct _test_a { 965 char a; 966 union { 967 char b; 968 DBL d; 969 }; 970 } t; 971 ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; 972 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment ); 973 KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 ); 974 } 975 #endif // KMP_DEBUG 976 977 /* save the term in thread private dispatch structure */ 978 *(DBL*)&pr->u.p.parm3 = x; 979 980 /* solve for the crossover point to the nearest integer i for which C_i <= chunk */ 981 { 982 UT left, right, mid; 983 long double p; 984 985 /* estimate initial upper and lower bound */ 986 987 /* doesn't matter what value right is as long as it is positive, but 988 it affects performance of the solver 989 */ 990 right = 229; 991 p = __kmp_pow< UT >(x,right); 992 if ( p > target ) { 993 do{ 994 p *= p; 995 right <<= 1; 996 } while(p>target && right < (1<<27)); 997 left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */ 998 } else { 999 left = 0; 1000 } 1001 1002 /* bisection root-finding method */ 1003 while ( left + 1 < right ) { 1004 mid = (left + right) / 2; 1005 if ( __kmp_pow< UT >(x,mid) > target ) { 1006 left = mid; 1007 } else { 1008 right = mid; 1009 } 1010 } // while 1011 cross = right; 1012 } 1013 /* assert sanity of computed crossover point */ 1014 KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target); 1015 1016 /* save the crossover point in thread private dispatch structure */ 1017 pr->u.p.parm2 = cross; 1018 1019 // C75803 1020 #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) ) 1021 #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3) 1022 #else 1023 #define GUIDED_ANALYTICAL_WORKAROUND (x) 1024 #endif 1025 /* dynamic-style scheduling offset */ 1026 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk; 1027 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1028 // restore FPCW 1029 _control87(oldFpcw,_MCW_PC); 1030 #endif 1031 } // if 1032 } else { 1033 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n", 1034 gtid ) ); 1035 schedule = kmp_sch_static_greedy; 1036 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 1037 pr->u.p.parm1 = tc; 1038 } // if 1039 } // case 1040 break; 1041 case kmp_sch_static_greedy: 1042 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid)); 1043 pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ? 
1044 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc : 1045 tc; 1046 break; 1047 case kmp_sch_static_chunked : 1048 case kmp_sch_dynamic_chunked : 1049 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid)); 1050 break; 1051 case kmp_sch_trapezoidal : 1052 { 1053 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ 1054 1055 T parm1, parm2, parm3, parm4; 1056 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) ); 1057 1058 parm1 = chunk; 1059 1060 /* F : size of the first cycle */ 1061 parm2 = ( tc / (2 * team->t.t_nproc) ); 1062 1063 if ( parm2 < 1 ) { 1064 parm2 = 1; 1065 } 1066 1067 /* L : size of the last cycle. Make sure the last cycle 1068 * is not larger than the first cycle. 1069 */ 1070 if ( parm1 < 1 ) { 1071 parm1 = 1; 1072 } else if ( parm1 > parm2 ) { 1073 parm1 = parm2; 1074 } 1075 1076 /* N : number of cycles */ 1077 parm3 = ( parm2 + parm1 ); 1078 parm3 = ( 2 * tc + parm3 - 1) / parm3; 1079 1080 if ( parm3 < 2 ) { 1081 parm3 = 2; 1082 } 1083 1084 /* sigma : decreasing incr of the trapezoid */ 1085 parm4 = ( parm3 - 1 ); 1086 parm4 = ( parm2 - parm1 ) / parm4; 1087 1088 // pointless check, because parm4 >= 0 always 1089 //if ( parm4 < 0 ) { 1090 // parm4 = 0; 1091 //} 1092 1093 pr->u.p.parm1 = parm1; 1094 pr->u.p.parm2 = parm2; 1095 pr->u.p.parm3 = parm3; 1096 pr->u.p.parm4 = parm4; 1097 } // case 1098 break; 1099 1100 default: 1101 { 1102 __kmp_msg( 1103 kmp_ms_fatal, // Severity 1104 KMP_MSG( UnknownSchedTypeDetected ), // Primary message 1105 KMP_HNT( GetNewerLibrary ), // Hint 1106 __kmp_msg_null // Variadic argument list terminator 1107 ); 1108 } 1109 break; 1110 } // switch 1111 pr->schedule = schedule; 1112 if ( active ) { 1113 /* The name of this buffer should be my_buffer_index when it's free to use it */ 1114 1115 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n", 1116 gtid, my_buffer_index, sh->buffer_index) ); 1117 __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 > 1118 USE_ITT_BUILD_ARG( NULL ) 1119 ); 1120 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are 1121 // *always* 32-bit integers. 1122 KMP_MB(); /* is this necessary? */ 1123 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n", 1124 gtid, my_buffer_index, sh->buffer_index) ); 1125 1126 th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr; 1127 th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh; 1128 #if USE_ITT_BUILD 1129 if ( pr->ordered ) { 1130 __kmp_itt_ordered_init( gtid ); 1131 }; // if 1132 // Report loop metadata 1133 if ( itt_need_metadata_reporting ) { 1134 // Only report metadata by master of active team at level 1 1135 kmp_uint64 schedtype = 0; 1136 switch ( schedule ) { 1137 case kmp_sch_static_chunked: 1138 case kmp_sch_static_balanced:// Chunk is calculated in the switch above 1139 break; 1140 case kmp_sch_static_greedy: 1141 cur_chunk = pr->u.p.parm1; 1142 break; 1143 case kmp_sch_dynamic_chunked: 1144 schedtype = 1; 1145 break; 1146 case kmp_sch_guided_iterative_chunked: 1147 case kmp_sch_guided_analytical_chunked: 1148 schedtype = 2; 1149 break; 1150 default: 1151 // Should we put this case under "static"? 
1152 // case kmp_sch_static_steal: 1153 schedtype = 3; 1154 break; 1155 } 1156 __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk); 1157 } 1158 #endif /* USE_ITT_BUILD */ 1159 }; // if 1160 1161 #ifdef KMP_DEBUG 1162 { 1163 const char * buff; 1164 // create format specifiers before the debug output 1165 buff = __kmp_str_format( 1166 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \ 1167 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \ 1168 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n", 1169 traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec, 1170 traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec, 1171 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec, 1172 traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec ); 1173 KD_TRACE(10, ( buff, 1174 gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub, 1175 pr->u.p.st, pr->u.p.tc, pr->u.p.count, 1176 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1, 1177 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) ); 1178 __kmp_str_free( &buff ); 1179 } 1180 #endif 1181 #if ( KMP_STATIC_STEAL_ENABLED ) 1182 if ( ___kmp_size_type < 8 ) { 1183 // It cannot be guaranteed that after execution of a loop with some other schedule kind 1184 // all the parm3 variables will contain the same value. 1185 // Even if all parm3 values were the same, there would still exist a bad case, such as using 0 and 1 1186 // rather than a program life-time increment. 1187 // So a dedicated variable is required; 'static_steal_counter' is used. 1188 if( schedule == kmp_sch_static_steal ) { 1189 // Other threads will inspect this variable when searching for a victim. 1190 // This is a flag showing that other threads may steal from this thread from now on. 1191 volatile T * p = &pr->u.p.static_steal_counter; 1192 *p = *p + 1; 1193 } 1194 } 1195 #endif // ( KMP_STATIC_STEAL_ENABLED && USE_STEALING ) 1196 1197 #if OMPT_SUPPORT && OMPT_TRACE 1198 if ((ompt_status == ompt_status_track_callback) && 1199 ompt_callbacks.ompt_callback(ompt_event_loop_begin)) { 1200 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 1201 ompt_task_info_t *task_info = __ompt_get_taskinfo(0); 1202 ompt_callbacks.ompt_callback(ompt_event_loop_begin)( 1203 team_info->parallel_id, task_info->task_id, team_info->microtask); 1204 } 1205 #endif 1206 } 1207 1208 /* 1209 * For ordered loops, either __kmp_dispatch_finish() should be called after 1210 * every iteration, or __kmp_dispatch_finish_chunk() should be called after 1211 * every chunk of iterations. If the ordered section(s) were not executed 1212 * for this iteration (or every iteration in this chunk), we need to set the 1213 * ordered iteration counters so that the next thread can proceed. 1214 */ 1215 template< typename UT > 1216 static void 1217 __kmp_dispatch_finish( int gtid, ident_t *loc ) 1218 { 1219 typedef typename traits_t< UT >::signed_t ST; 1220 kmp_info_t *th = __kmp_threads[ gtid ]; 1221 1222 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) ); 1223 if ( !
th -> th.th_team -> t.t_serialized ) { 1224 1225 dispatch_private_info_template< UT > * pr = 1226 reinterpret_cast< dispatch_private_info_template< UT >* > 1227 ( th->th.th_dispatch->th_dispatch_pr_current ); 1228 dispatch_shared_info_template< UT > volatile * sh = 1229 reinterpret_cast< dispatch_shared_info_template< UT >volatile* > 1230 ( th->th.th_dispatch->th_dispatch_sh_current ); 1231 KMP_DEBUG_ASSERT( pr ); 1232 KMP_DEBUG_ASSERT( sh ); 1233 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1234 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1235 1236 if ( pr->ordered_bumped ) { 1237 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1238 gtid ) ); 1239 pr->ordered_bumped = 0; 1240 } else { 1241 UT lower = pr->u.p.ordered_lower; 1242 1243 #ifdef KMP_DEBUG 1244 { 1245 const char * buff; 1246 // create format specifiers before the debug output 1247 buff = __kmp_str_format( 1248 "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n", 1249 traits_t< UT >::spec, traits_t< UT >::spec ); 1250 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 1251 __kmp_str_free( &buff ); 1252 } 1253 #endif 1254 1255 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 1256 USE_ITT_BUILD_ARG(NULL) 1257 ); 1258 KMP_MB(); /* is this necessary? */ 1259 #ifdef KMP_DEBUG 1260 { 1261 const char * buff; 1262 // create format specifiers before the debug output 1263 buff = __kmp_str_format( 1264 "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n", 1265 traits_t< UT >::spec, traits_t< UT >::spec ); 1266 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 1267 __kmp_str_free( &buff ); 1268 } 1269 #endif 1270 1271 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration ); 1272 } // if 1273 } // if 1274 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) ); 1275 } 1276 1277 #ifdef KMP_GOMP_COMPAT 1278 1279 template< typename UT > 1280 static void 1281 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc ) 1282 { 1283 typedef typename traits_t< UT >::signed_t ST; 1284 kmp_info_t *th = __kmp_threads[ gtid ]; 1285 1286 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) ); 1287 if ( ! 
th -> th.th_team -> t.t_serialized ) { 1288 // int cid; 1289 dispatch_private_info_template< UT > * pr = 1290 reinterpret_cast< dispatch_private_info_template< UT >* > 1291 ( th->th.th_dispatch->th_dispatch_pr_current ); 1292 dispatch_shared_info_template< UT > volatile * sh = 1293 reinterpret_cast< dispatch_shared_info_template< UT >volatile* > 1294 ( th->th.th_dispatch->th_dispatch_sh_current ); 1295 KMP_DEBUG_ASSERT( pr ); 1296 KMP_DEBUG_ASSERT( sh ); 1297 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1298 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1299 1300 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) { 1301 UT lower = pr->u.p.ordered_lower; 1302 UT upper = pr->u.p.ordered_upper; 1303 UT inc = upper - lower + 1; 1304 1305 if ( pr->ordered_bumped == inc ) { 1306 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1307 gtid ) ); 1308 pr->ordered_bumped = 0; 1309 } else { 1310 inc -= pr->ordered_bumped; 1311 1312 #ifdef KMP_DEBUG 1313 { 1314 const char * buff; 1315 // create format specifiers before the debug output 1316 buff = __kmp_str_format( 1317 "__kmp_dispatch_finish_chunk: T#%%d before wait: " \ 1318 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n", 1319 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec ); 1320 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) ); 1321 __kmp_str_free( &buff ); 1322 } 1323 #endif 1324 1325 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 1326 USE_ITT_BUILD_ARG(NULL) 1327 ); 1328 1329 KMP_MB(); /* is this necessary? */ 1330 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n", 1331 gtid ) ); 1332 pr->ordered_bumped = 0; 1333 //!!!!! TODO check if the inc should be unsigned, or signed??? 1334 #ifdef KMP_DEBUG 1335 { 1336 const char * buff; 1337 // create format specifiers before the debug output 1338 buff = __kmp_str_format( 1339 "__kmp_dispatch_finish_chunk: T#%%d after wait: " \ 1340 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n", 1341 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec ); 1342 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) ); 1343 __kmp_str_free( &buff ); 1344 } 1345 #endif 1346 1347 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc); 1348 } 1349 // } 1350 } 1351 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) ); 1352 } 1353 1354 #endif /* KMP_GOMP_COMPAT */ 1355 1356 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 1357 * (no more work), then tell OMPT the loop is over. In some cases 1358 * kmp_dispatch_fini() is not called. 
*/ 1359 #if OMPT_SUPPORT && OMPT_TRACE 1360 #define OMPT_LOOP_END \ 1361 if (status == 0) { \ 1362 if ((ompt_status == ompt_status_track_callback) && \ 1363 ompt_callbacks.ompt_callback(ompt_event_loop_end)) { \ 1364 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \ 1365 ompt_task_info_t *task_info = __ompt_get_taskinfo(0); \ 1366 ompt_callbacks.ompt_callback(ompt_event_loop_end)( \ 1367 team_info->parallel_id, task_info->task_id); \ 1368 } \ 1369 } 1370 #else 1371 #define OMPT_LOOP_END // no-op 1372 #endif 1373 1374 template< typename T > 1375 static int 1376 __kmp_dispatch_next( 1377 ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st 1378 ) { 1379 1380 typedef typename traits_t< T >::unsigned_t UT; 1381 typedef typename traits_t< T >::signed_t ST; 1382 typedef typename traits_t< T >::floating_t DBL; 1383 static const int ___kmp_size_type = sizeof( UT ); 1384 1385 int status; 1386 dispatch_private_info_template< T > * pr; 1387 kmp_info_t * th = __kmp_threads[ gtid ]; 1388 kmp_team_t * team = th -> th.th_team; 1389 1390 KMP_DEBUG_ASSERT( p_last && p_lb && p_ub && p_st ); // AC: these cannot be NULL 1391 #ifdef KMP_DEBUG 1392 { 1393 const char * buff; 1394 // create format specifiers before the debug output 1395 buff = __kmp_str_format( 1396 "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n", 1397 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 1398 KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) ); 1399 __kmp_str_free( &buff ); 1400 } 1401 #endif 1402 1403 if ( team -> t.t_serialized ) { 1404 /* NOTE: serialize this dispatch because we are not at the active level */ 1405 pr = reinterpret_cast< dispatch_private_info_template< T >* > 1406 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */ 1407 KMP_DEBUG_ASSERT( pr ); 1408 1409 if ( (status = (pr->u.p.tc != 0)) == 0 ) { 1410 *p_lb = 0; 1411 *p_ub = 0; 1412 // if ( p_last != NULL ) 1413 // *p_last = 0; 1414 if ( p_st != NULL ) 1415 *p_st = 0; 1416 if ( __kmp_env_consistency_check ) { 1417 if ( pr->pushed_ws != ct_none ) { 1418 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 1419 } 1420 } 1421 } else if ( pr->nomerge ) { 1422 kmp_int32 last; 1423 T start; 1424 UT limit, trip, init; 1425 ST incr; 1426 T chunk = pr->u.p.parm1; 1427 1428 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) ); 1429 1430 init = chunk * pr->u.p.count++; 1431 trip = pr->u.p.tc - 1; 1432 1433 if ( (status = (init <= trip)) == 0 ) { 1434 *p_lb = 0; 1435 *p_ub = 0; 1436 // if ( p_last != NULL ) 1437 // *p_last = 0; 1438 if ( p_st != NULL ) 1439 *p_st = 0; 1440 if ( __kmp_env_consistency_check ) { 1441 if ( pr->pushed_ws != ct_none ) { 1442 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 1443 } 1444 } 1445 } else { 1446 start = pr->u.p.lb; 1447 limit = chunk + init - 1; 1448 incr = pr->u.p.st; 1449 1450 if ( (last = (limit >= trip)) != 0 ) { 1451 limit = trip; 1452 #if KMP_OS_WINDOWS 1453 pr->u.p.last_upper = pr->u.p.ub; 1454 #endif /* KMP_OS_WINDOWS */ 1455 } 1456 if ( p_last != NULL ) 1457 *p_last = last; 1458 if ( p_st != NULL ) 1459 *p_st = incr; 1460 if ( incr == 1 ) { 1461 *p_lb = start + init; 1462 *p_ub = start + limit; 1463 } else { 1464 *p_lb = start + init * incr; 1465 *p_ub = start + limit * incr; 1466 } 1467 1468 if ( pr->ordered ) { 1469 pr->u.p.ordered_lower = init; 1470 pr->u.p.ordered_upper = limit; 1471 #ifdef KMP_DEBUG 1472 { 1473 const char * buff;
1474 // create format specifiers before the debug output 1475 buff = __kmp_str_format( 1476 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1477 traits_t< UT >::spec, traits_t< UT >::spec ); 1478 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1479 __kmp_str_free( &buff ); 1480 } 1481 #endif 1482 } // if 1483 } // if 1484 } else { 1485 pr->u.p.tc = 0; 1486 *p_lb = pr->u.p.lb; 1487 *p_ub = pr->u.p.ub; 1488 #if KMP_OS_WINDOWS 1489 pr->u.p.last_upper = *p_ub; 1490 #endif /* KMP_OS_WINDOWS */ 1491 if ( p_last != NULL ) 1492 *p_last = TRUE; 1493 if ( p_st != NULL ) 1494 *p_st = pr->u.p.st; 1495 } // if 1496 #ifdef KMP_DEBUG 1497 { 1498 const char * buff; 1499 // create format specifiers before the debug output 1500 buff = __kmp_str_format( 1501 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \ 1502 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", 1503 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 1504 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) ); 1505 __kmp_str_free( &buff ); 1506 } 1507 #endif 1508 #if INCLUDE_SSC_MARKS 1509 SSC_MARK_DISPATCH_NEXT(); 1510 #endif 1511 OMPT_LOOP_END; 1512 return status; 1513 } else { 1514 kmp_int32 last = 0; 1515 dispatch_shared_info_template< UT > *sh; 1516 T start; 1517 ST incr; 1518 UT limit, trip, init; 1519 1520 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1521 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1522 1523 pr = reinterpret_cast< dispatch_private_info_template< T >* > 1524 ( th->th.th_dispatch->th_dispatch_pr_current ); 1525 KMP_DEBUG_ASSERT( pr ); 1526 sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 1527 ( th->th.th_dispatch->th_dispatch_sh_current ); 1528 KMP_DEBUG_ASSERT( sh ); 1529 1530 if ( pr->u.p.tc == 0 ) { 1531 // zero trip count 1532 status = 0; 1533 } else { 1534 switch (pr->schedule) { 1535 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 1536 case kmp_sch_static_steal: 1537 { 1538 T chunk = pr->u.p.parm1; 1539 1540 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) ); 1541 1542 trip = pr->u.p.tc - 1; 1543 1544 if ( ___kmp_size_type > 4 ) { 1545 // Other threads do not look into the data of this thread, 1546 // so it's not necessary to make volatile casting. 1547 init = ( pr->u.p.count )++; 1548 status = ( init < (UT)pr->u.p.ub ); 1549 } else { 1550 typedef union { 1551 struct { 1552 UT count; 1553 T ub; 1554 } p; 1555 kmp_int64 b; 1556 } union_i4; 1557 // All operations on 'count' or 'ub' must be combined atomically together. 1558 // stealing implemented only for 4-byte indexes 1559 { 1560 union_i4 vold, vnew; 1561 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count); 1562 vnew = vold; 1563 vnew.p.count++; 1564 while( ! 
KMP_COMPARE_AND_STORE_ACQ64( 1565 ( volatile kmp_int64* )&pr->u.p.count, 1566 *VOLATILE_CAST(kmp_int64 *)&vold.b, 1567 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) { 1568 KMP_CPU_PAUSE(); 1569 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count); 1570 vnew = vold; 1571 vnew.p.count++; 1572 } 1573 vnew = vold; 1574 init = vnew.p.count; 1575 status = ( init < (UT)vnew.p.ub ) ; 1576 } 1577 1578 if( !status ) { 1579 kmp_info_t **other_threads = team->t.t_threads; 1580 int while_limit = 10; 1581 int while_index = 0; 1582 1583 // TODO: algorithm of searching for a victim 1584 // should be cleaned up and measured 1585 while ( ( !status ) && ( while_limit != ++while_index ) ) { 1586 union_i4 vold, vnew; 1587 kmp_int32 remaining; // kmp_int32 because KMP_I4 only 1588 T victimIdx = pr->u.p.parm4; 1589 T oldVictimIdx = victimIdx; 1590 dispatch_private_info_template< T > * victim; 1591 1592 do { 1593 if( !victimIdx ) { 1594 victimIdx = team->t.t_nproc - 1; 1595 } else { 1596 --victimIdx; 1597 } 1598 victim = reinterpret_cast< dispatch_private_info_template< T >* > 1599 ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current ); 1600 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx ); 1601 // TODO: think about a proper place of this test 1602 if ( ( !victim ) || 1603 ( (*( volatile T * )&victim->u.p.static_steal_counter) != 1604 (*( volatile T * )&pr->u.p.static_steal_counter) ) ) { 1605 // TODO: delay would be nice 1606 continue; 1607 // the victim is not ready yet to participate in stealing 1608 // because the victim is still in kmp_init_dispatch 1609 } 1610 if ( oldVictimIdx == victimIdx ) { 1611 break; 1612 } 1613 pr->u.p.parm4 = victimIdx; 1614 1615 while( 1 ) { 1616 vold.b = *( volatile kmp_int64 * )( &victim->u.p.count ); 1617 vnew = vold; 1618 1619 KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip ); 1620 if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) { 1621 break; 1622 } 1623 vnew.p.ub -= (remaining >> 2); 1624 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1625 #pragma warning( push ) 1626 // disable warning on pointless comparison of unsigned with 0 1627 #pragma warning( disable: 186 ) 1628 KMP_DEBUG_ASSERT(vnew.p.ub >= 0); 1629 #pragma warning( pop ) 1630 // TODO: Should this be acquire or release? 1631 if ( KMP_COMPARE_AND_STORE_ACQ64( 1632 ( volatile kmp_int64 * )&victim->u.p.count, 1633 *VOLATILE_CAST(kmp_int64 *)&vold.b, 1634 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) { 1635 status = 1; 1636 while_index = 0; 1637 // now update own count and ub 1638 #if KMP_ARCH_X86 1639 // stealing executed on non-KMP_ARCH_X86 only 1640 // Atomic 64-bit write on ia32 is 1641 // unavailable, so we do this in steps. 1642 // This code is not tested. 1643 init = vold.p.count; 1644 pr->u.p.ub = 0; 1645 pr->u.p.count = init + 1; 1646 pr->u.p.ub = vnew.p.count; 1647 #else 1648 init = vnew.p.ub; 1649 vold.p.count = init + 1; 1650 // TODO: is it safe and enough? 
1651 *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b; 1652 #endif // KMP_ARCH_X86 1653 break; 1654 } // if 1655 KMP_CPU_PAUSE(); 1656 } // while (1) 1657 } // while 1658 } // if 1659 } // if 1660 if ( !status ) { 1661 *p_lb = 0; 1662 *p_ub = 0; 1663 if ( p_st != NULL ) *p_st = 0; 1664 } else { 1665 start = pr->u.p.parm2; 1666 init *= chunk; 1667 limit = chunk + init - 1; 1668 incr = pr->u.p.st; 1669 1670 KMP_DEBUG_ASSERT(init <= trip); 1671 if ( (last = (limit >= trip)) != 0 ) 1672 limit = trip; 1673 if ( p_st != NULL ) *p_st = incr; 1674 1675 if ( incr == 1 ) { 1676 *p_lb = start + init; 1677 *p_ub = start + limit; 1678 } else { 1679 *p_lb = start + init * incr; 1680 *p_ub = start + limit * incr; 1681 } 1682 1683 if ( pr->ordered ) { 1684 pr->u.p.ordered_lower = init; 1685 pr->u.p.ordered_upper = limit; 1686 #ifdef KMP_DEBUG 1687 { 1688 const char * buff; 1689 // create format specifiers before the debug output 1690 buff = __kmp_str_format( 1691 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1692 traits_t< UT >::spec, traits_t< UT >::spec ); 1693 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1694 __kmp_str_free( &buff ); 1695 } 1696 #endif 1697 } // if 1698 } // if 1699 break; 1700 } // case 1701 #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 1702 case kmp_sch_static_balanced: 1703 { 1704 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) ); 1705 if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */ 1706 pr->u.p.count = 1; 1707 *p_lb = pr->u.p.lb; 1708 *p_ub = pr->u.p.ub; 1709 last = pr->u.p.parm1; 1710 if ( p_st != NULL ) 1711 *p_st = pr->u.p.st; 1712 } else { /* no iterations to do */ 1713 pr->u.p.lb = pr->u.p.ub + pr->u.p.st; 1714 } 1715 if ( pr->ordered ) { 1716 #ifdef KMP_DEBUG 1717 { 1718 const char * buff; 1719 // create format specifiers before the debug output 1720 buff = __kmp_str_format( 1721 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1722 traits_t< UT >::spec, traits_t< UT >::spec ); 1723 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1724 __kmp_str_free( &buff ); 1725 } 1726 #endif 1727 } // if 1728 } // case 1729 break; 1730 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */ 1731 case kmp_sch_static_chunked: 1732 { 1733 T parm1; 1734 1735 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n", 1736 gtid ) ); 1737 parm1 = pr->u.p.parm1; 1738 1739 trip = pr->u.p.tc - 1; 1740 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid)); 1741 1742 if ( (status = (init <= trip)) != 0 ) { 1743 start = pr->u.p.lb; 1744 incr = pr->u.p.st; 1745 limit = parm1 + init - 1; 1746 1747 if ( (last = (limit >= trip)) != 0 ) 1748 limit = trip; 1749 1750 if ( p_st != NULL ) *p_st = incr; 1751 1752 pr->u.p.count += team->t.t_nproc; 1753 1754 if ( incr == 1 ) { 1755 *p_lb = start + init; 1756 *p_ub = start + limit; 1757 } 1758 else { 1759 *p_lb = start + init * incr; 1760 *p_ub = start + limit * incr; 1761 } 1762 1763 if ( pr->ordered ) { 1764 pr->u.p.ordered_lower = init; 1765 pr->u.p.ordered_upper = limit; 1766 #ifdef KMP_DEBUG 1767 { 1768 const char * buff; 1769 // create format specifiers before the debug output 1770 buff = __kmp_str_format( 1771 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1772 traits_t< UT >::spec, traits_t< UT >::spec ); 1773 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, 
    case kmp_sch_static_balanced:
        {
            KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
            if ( (status = !pr->u.p.count) != 0 ) {  /* check if thread has any iteration to do */
                pr->u.p.count = 1;
                *p_lb = pr->u.p.lb;
                *p_ub = pr->u.p.ub;
                last = pr->u.p.parm1;
                if ( p_st != NULL )
                    *p_st = pr->u.p.st;
            } else {  /* no iterations to do */
                pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
            }
            if ( pr->ordered ) {
#ifdef KMP_DEBUG
                {
                    const char * buff;
                    // create format specifiers before the debug output
                    buff = __kmp_str_format(
                        "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                        traits_t< UT >::spec, traits_t< UT >::spec );
                    KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                    __kmp_str_free( &buff );
                }
#endif
            } // if
        } // case
        break;
    case kmp_sch_static_greedy:  /* original code for kmp_sch_static_greedy was merged here */
    case kmp_sch_static_chunked:
        {
            T parm1;

            KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
                           gtid ) );
            parm1 = pr->u.p.parm1;

            trip  = pr->u.p.tc - 1;
            init  = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));

            if ( (status = (init <= trip)) != 0 ) {
                start = pr->u.p.lb;
                incr  = pr->u.p.st;
                limit = parm1 + init - 1;

                if ( (last = (limit >= trip)) != 0 )
                    limit = trip;

                if ( p_st != NULL ) *p_st = incr;

                pr->u.p.count += team->t.t_nproc;

                if ( incr == 1 ) {
                    *p_lb = start + init;
                    *p_ub = start + limit;
                }
                else {
                    *p_lb = start + init * incr;
                    *p_ub = start + limit * incr;
                }

                if ( pr->ordered ) {
                    pr->u.p.ordered_lower = init;
                    pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
                    {
                        const char * buff;
                        // create format specifiers before the debug output
                        buff = __kmp_str_format(
                            "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                            traits_t< UT >::spec, traits_t< UT >::spec );
                        KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                        __kmp_str_free( &buff );
                    }
#endif
                } // if
            } // if
        } // case
        break;
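    // Illustrative note (not executed): under kmp_sch_static_chunked each
    // thread walks the chunks in round-robin order, advancing its private
    // count by nproc per call. E.g. with nproc = 4, chunk size parm1 = 10 and
    // tid = 2, successive calls compute init = 10*(0+2) = 20, 10*(4+2) = 60,
    // 10*(8+2) = 100, ... until init exceeds trip, i.e. the thread handles
    // chunks 2, 6, 10, ...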
    case kmp_sch_dynamic_chunked:
        {
            T chunk = pr->u.p.parm1;

            KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
                           gtid ) );

            init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
            trip = pr->u.p.tc - 1;

            if ( (status = (init <= trip)) == 0 ) {
                *p_lb = 0;
                *p_ub = 0;
                if ( p_st != NULL ) *p_st = 0;
            } else {
                start = pr->u.p.lb;
                limit = chunk + init - 1;
                incr  = pr->u.p.st;

                if ( (last = (limit >= trip)) != 0 )
                    limit = trip;

                if ( p_st != NULL ) *p_st = incr;

                if ( incr == 1 ) {
                    *p_lb = start + init;
                    *p_ub = start + limit;
                } else {
                    *p_lb = start + init * incr;
                    *p_ub = start + limit * incr;
                }

                if ( pr->ordered ) {
                    pr->u.p.ordered_lower = init;
                    pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
                    {
                        const char * buff;
                        // create format specifiers before the debug output
                        buff = __kmp_str_format(
                            "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                            traits_t< UT >::spec, traits_t< UT >::spec );
                        KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                        __kmp_str_free( &buff );
                    }
#endif
                } // if
            } // if
        } // case
        break;
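    // Illustrative note (not executed): kmp_sch_dynamic_chunked hands out
    // fixed-size chunks in the order in which threads reach the shared
    // counter. E.g. with chunk = 5 and tc = 23, the atomic fetch-and-increment
    // of sh->u.s.iteration returns 0, 1, 2, 3, 4, ... across all threads,
    // yielding logical ranges [0,4], [5,9], [10,14], [15,19] and the final
    // partial chunk [20,22] (limit is clamped to trip = 22); the next counter
    // value produces init = 25 > trip, so that thread gets status = 0.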
    case kmp_sch_guided_iterative_chunked:
        {
            T chunkspec = pr->u.p.parm1;
            KD_TRACE(100,
                ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
            trip = pr->u.p.tc;
            // Start atomic part of calculations
            while(1) {
                ST remaining;             // signed, because can be < 0
                init = sh->u.s.iteration; // shared value
                remaining = trip - init;
                if ( remaining <= 0 ) {   // AC: need to compare with 0 first
                    // nothing to do, don't try atomic op
                    status = 0;
                    break;
                }
                if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
                    // use dynamic-style schedule
                    // atomically increment iterations, get old value
                    init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
                    remaining = trip - init;
                    if (remaining <= 0) {
                        status = 0;    // all iterations got by other threads
                    } else {
                        // got some iterations to work on
                        status = 1;
                        if ( (T)remaining > chunkspec ) {
                            limit = init + chunkspec - 1;
                        } else {
                            last = 1;   // the last chunk
                            limit = init + remaining - 1;
                        } // if
                    } // if
                    break;
                } // if
                limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
                if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
                    // CAS was successful, chunk obtained
                    status = 1;
                    --limit;
                    break;
                } // if
            } // while
            if ( status != 0 ) {
                start = pr->u.p.lb;
                incr = pr->u.p.st;
                if ( p_st != NULL )
                    *p_st = incr;
                *p_lb = start + init * incr;
                *p_ub = start + limit * incr;
                if ( pr->ordered ) {
                    pr->u.p.ordered_lower = init;
                    pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
                    {
                        const char * buff;
                        // create format specifiers before the debug output
                        buff = __kmp_str_format(
                            "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                            traits_t< UT >::spec, traits_t< UT >::spec );
                        KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                        __kmp_str_free( &buff );
                    }
#endif
                } // if
            } else {
                *p_lb = 0;
                *p_ub = 0;
                if ( p_st != NULL )
                    *p_st = 0;
            } // if
        } // case
        break;
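    // Illustrative note (not executed): in the iterative guided scheme each
    // grab claims roughly remaining * parm3 iterations, where parm3 is set up
    // by the init routine as approximately 1/(K*nproc) with K = 2 by default.
    // E.g. with trip = 1000, nproc = 4 and parm3 = 1/8: the first successful
    // CAS claims about 125 iterations, the next thread sees remaining = 875
    // and claims about 109, and so on, until fewer than parm2 iterations are
    // left and the code falls back to plain chunkspec-sized dynamic chunks.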
    case kmp_sch_guided_analytical_chunked:
        {
            T  chunkspec = pr->u.p.parm1;
            UT chunkIdx;
#if KMP_OS_WINDOWS && KMP_ARCH_X86
            /* for storing original FPCW value for Windows* OS on
               IA-32 architecture 8-byte version */
            unsigned int oldFpcw;
            unsigned int fpcwSet = 0;
#endif
            KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
                           gtid ) );

            trip = pr->u.p.tc;

            KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
            KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);

            while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
                chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
                if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
                    --trip;
                    /* use dynamic-style scheduling */
                    init = chunkIdx * chunkspec + pr->u.p.count;
                    /* need to verify init > 0 in case of overflow in the above calculation */
                    if ( (status = (init > 0 && init <= trip)) != 0 ) {
                        limit = init + chunkspec - 1;

                        if ( (last = (limit >= trip)) != 0 )
                            limit = trip;
                    }
                    break;
                } else {
                    /* use exponential-style scheduling */
                    /* The following check is to workaround the lack of long double precision on Windows* OS.
                       This check works around the possible effect that init != 0 for chunkIdx == 0.
                     */
#if KMP_OS_WINDOWS && KMP_ARCH_X86
                    /* If we haven't already done so, save original
                       FPCW and set precision to 64-bit, as Windows* OS
                       on IA-32 architecture defaults to 53-bit */
                    if ( !fpcwSet ) {
                        oldFpcw = _control87(0,0);
                        _control87(_PC_64,_MCW_PC);
                        fpcwSet = 0x30000;
                    }
#endif
                    if ( chunkIdx ) {
                        init = __kmp_dispatch_guided_remaining< T >(
                                   trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
                        KMP_DEBUG_ASSERT(init);
                        init = trip - init;
                    } else
                        init = 0;
                    limit = trip - __kmp_dispatch_guided_remaining< T >(
                                       trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
                    KMP_ASSERT(init <= limit);
                    if ( init < limit ) {
                        KMP_DEBUG_ASSERT(limit <= trip);
                        --limit;
                        status = 1;
                        break;
                    } // if
                } // if
            } // while (1)
#if KMP_OS_WINDOWS && KMP_ARCH_X86
            /* restore FPCW if necessary
               AC: check fpcwSet flag first because oldFpcw can be uninitialized here
            */
            if ( fpcwSet && ( oldFpcw & fpcwSet ) )
                _control87(oldFpcw,_MCW_PC);
#endif
            if ( status != 0 ) {
                start = pr->u.p.lb;
                incr = pr->u.p.st;
                if ( p_st != NULL )
                    *p_st = incr;
                *p_lb = start + init * incr;
                *p_ub = start + limit * incr;
                if ( pr->ordered ) {
                    pr->u.p.ordered_lower = init;
                    pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
                    {
                        const char * buff;
                        // create format specifiers before the debug output
                        buff = __kmp_str_format(
                            "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                            traits_t< UT >::spec, traits_t< UT >::spec );
                        KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                        __kmp_str_free( &buff );
                    }
#endif
                }
            } else {
                *p_lb = 0;
                *p_ub = 0;
                if ( p_st != NULL )
                    *p_st = 0;
            }
        } // case
        break;
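    // Illustrative note (not executed): in the analytical guided scheme a
    // thread that grabs chunk index k is assigned the logical range
    // [trip - remaining(k), trip - remaining(k+1) - 1], where remaining(k) is
    // computed by __kmp_dispatch_guided_remaining from the base stored in
    // parm3 during initialization, so the amount of remaining work (and hence
    // the chunk size) decays roughly geometrically with k. Once chunkIdx
    // reaches parm2 the code above switches to plain chunkspec-sized dynamic
    // chunks for the tail.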
    case kmp_sch_trapezoidal:
        {
            UT index;
            T  parm2 = pr->u.p.parm2;
            T  parm3 = pr->u.p.parm3;
            T  parm4 = pr->u.p.parm4;
            KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
                           gtid ) );

            index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );

            init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
            trip = pr->u.p.tc - 1;

            if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
                *p_lb = 0;
                *p_ub = 0;
                if ( p_st != NULL ) *p_st = 0;
            } else {
                start = pr->u.p.lb;
                limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
                incr  = pr->u.p.st;

                if ( (last = (limit >= trip)) != 0 )
                    limit = trip;

                if ( p_st != NULL ) *p_st = incr;

                if ( incr == 1 ) {
                    *p_lb = start + init;
                    *p_ub = start + limit;
                } else {
                    *p_lb = start + init * incr;
                    *p_ub = start + limit * incr;
                }

                if ( pr->ordered ) {
                    pr->u.p.ordered_lower = init;
                    pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
                    {
                        const char * buff;
                        // create format specifiers before the debug output
                        buff = __kmp_str_format(
                            "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                            traits_t< UT >::spec, traits_t< UT >::spec );
                        KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                        __kmp_str_free( &buff );
                    }
#endif
                } // if
            } // if
        } // case
        break;
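    // Illustrative note (not executed): kmp_sch_trapezoidal hands out chunks
    // whose sizes shrink linearly by parm4 per chunk, starting from parm2;
    // init is the partial sum of the preceding chunk sizes. E.g. with
    // parm2 = 10 and parm4 = 2: index 0 covers logical iterations 0..9
    // (10 iterations), index 1 covers 10..17 (8 iterations), index 2 covers
    // 18..23 (6 iterations), and so on, with parm3 bounding the number of
    // chunks.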
    default:
        {
            status = 0; // to avoid complaints on uninitialized variable use
            __kmp_msg(
                kmp_ms_fatal,                        // Severity
                KMP_MSG( UnknownSchedTypeDetected ), // Primary message
                KMP_HNT( GetNewerLibrary ),          // Hint
                __kmp_msg_null                       // Variadic argument list terminator
            );
        }
        break;
    } // switch
    } // if tc == 0;

    if ( status == 0 ) {
        UT num_done;

        num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
#ifdef KMP_DEBUG
        {
            const char * buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format(
                "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
                traits_t< UT >::spec );
            KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
            __kmp_str_free( &buff );
        }
#endif

        if ( (ST)num_done == team->t.t_nproc-1 ) {
            /* NOTE: release this buffer to be reused */

            KMP_MB();  /* Flush all pending memory write invalidates. */

            sh->u.s.num_done = 0;
            sh->u.s.iteration = 0;

            /* TODO replace with general release procedure? */
            if ( pr->ordered ) {
                sh->u.s.ordered_iteration = 0;
            }

            KMP_MB();  /* Flush all pending memory write invalidates. */

            sh -> buffer_index += KMP_MAX_DISP_BUF;
            KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
                           gtid, sh->buffer_index) );

            KMP_MB();  /* Flush all pending memory write invalidates. */

        } // if
        if ( __kmp_env_consistency_check ) {
            if ( pr->pushed_ws != ct_none ) {
                pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
            }
        }

        th -> th.th_dispatch -> th_deo_fcn = NULL;
        th -> th.th_dispatch -> th_dxo_fcn = NULL;
        th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
        th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
    } // if (status == 0)
#if KMP_OS_WINDOWS
    else if ( last ) {
        pr->u.p.last_upper = pr->u.p.ub;
    }
#endif /* KMP_OS_WINDOWS */
    if ( p_last != NULL && status != 0 )
        *p_last = last;
    } // if

#ifdef KMP_DEBUG
    {
        const char * buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_next: T#%%d normal case: " \
            "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
            traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
        KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
        __kmp_str_free( &buff );
    }
#endif
#if INCLUDE_SSC_MARKS
    SSC_MARK_DISPATCH_NEXT();
#endif
    OMPT_LOOP_END;
    return status;
}
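// Note on the buffer release in __kmp_dispatch_next above (illustrative):
// test_then_inc returns the value of num_done *before* the increment, so with
// nproc threads the first nproc-1 threads that run out of work observe
// 0 .. nproc-2, and only the last one sees num_done == nproc-1. That last
// thread resets the shared counters and bumps sh->buffer_index so this
// dispatch buffer slot can be recycled by a later worksharing loop.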
template< typename T >
static void
__kmp_dist_get_bounds(
    ident_t                          *loc,
    kmp_int32                         gtid,
    kmp_int32                        *plastiter,
    T                                *plower,
    T                                *pupper,
    typename traits_t< T >::signed_t  incr
) {
    KMP_COUNT_BLOCK(OMP_DISTR_FOR_dynamic);
    typedef typename traits_t< T >::unsigned_t  UT;
    typedef typename traits_t< T >::signed_t    ST;
    register kmp_uint32  team_id;
    register kmp_uint32  nteams;
    register UT          trip_count;
    register kmp_team_t *team;
    kmp_info_t * th;

    KMP_DEBUG_ASSERT( plastiter && plower && pupper );
    KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
#ifdef KMP_DEBUG
    {
        const char * buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
            "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
            traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
            traits_t< T >::spec );
        KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
        __kmp_str_free( &buff );
    }
#endif

    if( __kmp_env_consistency_check ) {
        if( incr == 0 ) {
            __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
        }
        if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
            // The loop is illegal.
            // Some zero-trip loops maintained by compiler, e.g.:
            //   for(i=10;i<0;++i) // lower >= upper - run-time check
            //   for(i=0;i>10;--i) // lower <= upper - run-time check
            //   for(i=0;i>10;++i) // incr > 0     - compile-time check
            //   for(i=10;i<0;--i) // incr < 0     - compile-time check
            // Compiler does not check the following illegal loops:
            //   for(i=0;i<10;i+=incr) // where incr<0
            //   for(i=10;i>0;i-=incr) // where incr<0
            __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
        }
    }
    th = __kmp_threads[gtid];
    KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
    team = th->th.th_team;
#if OMP_40_ENABLED
    nteams = th->th.th_teams_size.nteams;
#endif
    team_id = team->t.t_master_tid;
    KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);

    // compute global trip count
    if( incr == 1 ) {
        trip_count = *pupper - *plower + 1;
    } else if(incr == -1) {
        trip_count = *plower - *pupper + 1;
    } else {
        trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
    }
    if( trip_count <= nteams ) {
        KMP_DEBUG_ASSERT(
            __kmp_static == kmp_sch_static_greedy || \
            __kmp_static == kmp_sch_static_balanced
        ); // Unknown static scheduling type.
        // only some teams get single iteration, others get nothing
        if( team_id < trip_count ) {
            *pupper = *plower = *plower + team_id * incr;
        } else {
            *plower = *pupper + incr; // zero-trip loop
        }
        if( plastiter != NULL )
            *plastiter = ( team_id == trip_count - 1 );
    } else {
        if( __kmp_static == kmp_sch_static_balanced ) {
            register UT chunk  = trip_count / nteams;
            register UT extras = trip_count % nteams;
            *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
            *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
            if( plastiter != NULL )
                *plastiter = ( team_id == nteams - 1 );
        } else {
            register T chunk_inc_count =
                ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
            register T upper = *pupper;
            KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
                // Unknown static scheduling type.
            *plower += team_id * chunk_inc_count;
            *pupper = *plower + chunk_inc_count - incr;
            // Check/correct bounds if needed
            if( incr > 0 ) {
                if( *pupper < *plower )
                    *pupper = i_maxmin< T >::mx;
                if( plastiter != NULL )
                    *plastiter = *plower <= upper && *pupper > upper - incr;
                if( *pupper > upper )
                    *pupper = upper; // tracker C73258
            } else {
                if( *pupper > *plower )
                    *pupper = i_maxmin< T >::mn;
                if( plastiter != NULL )
                    *plastiter = *plower >= upper && *pupper < upper - incr;
                if( *pupper < upper )
                    *pupper = upper; // tracker C73258
            }
        }
    }
}
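// Illustrative note (not executed): with the kmp_sch_static_balanced branch of
// __kmp_dist_get_bounds above and, e.g., trip_count = 10, nteams = 4, incr = 1
// and lower = 0: chunk = 2 and extras = 2, so team 0 gets [0,2], team 1 gets
// [3,5], team 2 gets [6,7] and team 3 gets [8,9]; the first 'extras' teams
// each receive one extra iteration.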
//-----------------------------------------------------------------------------------------
// Dispatch routines
//    Transfer call to template< type T >
//    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
//                         T lb, T ub, ST st, ST chunk )
extern "C" {

/*!
@ingroup WORK_SHARING
@{
@param loc Source location
@param gtid Global thread id
@param schedule Schedule type
@param lb  Lower bound
@param ub  Upper bound
@param st  Step (or increment if you prefer)
@param chunk The chunk size to block with

This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
These functions are all identical apart from the types of the arguments.
*/

void
__kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                        kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
{
    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}
/*!
See @ref __kmpc_dispatch_init_4
*/
void
__kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                         kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
{
    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void
__kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                        kmp_int64 lb, kmp_int64 ub,
                        kmp_int64 st, kmp_int64 chunk )
{
    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void
__kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                         kmp_uint64 lb, kmp_uint64 ub,
                         kmp_int64 st, kmp_int64 chunk )
{
    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

/*!
See @ref __kmpc_dispatch_init_4

These functions differ from the __kmpc_dispatch_init set in that they are
called for the composite "distribute parallel for" construct, so the per-team
iteration space has to be computed before the regular iterations are
dispatched.

These functions are all identical apart from the types of the arguments.
*/
void
__kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
    kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
{
    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
    __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

void
__kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
    kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
{
    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
    __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

void
__kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
    kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
{
    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
    __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

void
__kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
    kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
{
    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
    __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

/*!
@param loc Source code location
@param gtid Global thread id
@param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
@param p_lb   Pointer to the lower bound for the next chunk of work
@param p_ub   Pointer to the upper bound for the next chunk of work
@param p_st   Pointer to the stride for the next chunk of work
@return one if there is work to be done, zero otherwise

Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then the lb, ub and stride need not be modified.
*/
int
__kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                        kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
{
    return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int
__kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                         kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
{
    return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int
__kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                        kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
{
    return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int
__kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                         kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
{
    return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

/*!
@param loc Source code location
@param gtid Global thread id

Mark the end of a dynamic loop.
*/
void
__kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void
__kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void
__kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void
__kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
}
/*! @} */
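// Illustrative usage sketch (not executed, not generated by this library):
// compiler-generated code for a dynamically scheduled loop such as
//
//   #pragma omp for schedule(dynamic, 4)
//   for ( int i = lo; i < hi; ++i ) body( i );
//
// typically drives the entry points above roughly as follows (inclusive
// upper bound, so hi - 1 is passed):
//
//   kmp_int32 lb, ub, st, last;
//   __kmpc_dispatch_init_4( loc, gtid, kmp_sch_dynamic_chunked,
//                           lo, hi - 1, 1, 4 );
//   while ( __kmpc_dispatch_next_4( loc, gtid, &last, &lb, &ub, &st ) ) {
//       for ( kmp_int32 i = lb; i <= ub; i += st )
//           body( i );
//   }
//
// For ordered loops the compiler additionally inserts the dispatch_fini
// calls; the exact call sequence is the compiler's choice.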
//-----------------------------------------------------------------------------------------
// Non-template routines from kmp_dispatch.cpp used in other sources

kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
    return value == checker;
}

kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
    return value != checker;
}

kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
    return value < checker;
}

kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
    return value >= checker;
}

kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
    return value <= checker;
}

kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
    return value == checker;
}

kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
    return value != checker;
}

kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
    return value < checker;
}

kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
    return value >= checker;
}

kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
    return value <= checker;
}

kmp_uint32
__kmp_wait_yield_4(volatile kmp_uint32 * spinner,
                   kmp_uint32            checker,
                   kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
                   , void        * obj    // Higher-level synchronization object, or NULL.
                   )
{
    // note: we may not belong to a team at this point
    register volatile kmp_uint32 * spin  = spinner;
    register          kmp_uint32   check = checker;
    register          kmp_uint32   spins;
    register          kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
    register          kmp_uint32   r;

    KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
    KMP_INIT_YIELD( spins );
    // main wait spin loop
    while(!f(r = TCR_4(*spin), check)) {
        KMP_FSYNC_SPIN_PREPARE( obj );
        /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
           It causes problems with infinite recursion because of exit lock */
        /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
            __kmp_abort_thread(); */

        /* if we have waited a bit, or are oversubscribed, yield */
        /* pause is in the following code */
        KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
        KMP_YIELD_SPIN( spins );
    }
    KMP_FSYNC_SPIN_ACQUIRED( obj );
    return r;
}

kmp_uint64
__kmp_wait_yield_8( volatile kmp_uint64 * spinner,
                    kmp_uint64            checker,
                    kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
                    , void        * obj    // Higher-level synchronization object, or NULL.
                    )
{
    // note: we may not belong to a team at this point
    register volatile kmp_uint64 * spin  = spinner;
    register          kmp_uint64   check = checker;
    register          kmp_uint32   spins;
    register          kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
    register          kmp_uint64   r;

    KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
    KMP_INIT_YIELD( spins );
    // main wait spin loop
    while(!f(r = *spin, check))
    {
        KMP_FSYNC_SPIN_PREPARE( obj );
        /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
           It causes problems with infinite recursion because of exit lock */
        /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
            __kmp_abort_thread(); */

        // if we are oversubscribed,
        // or have waited a bit (and KMP_LIBRARY=throughput), then yield
        // pause is in the following code
        KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
        KMP_YIELD_SPIN( spins );
    }
    KMP_FSYNC_SPIN_ACQUIRED( obj );
    return r;
}
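// Illustrative usage sketch (not executed): a caller elsewhere in the runtime
// can spin until a shared 32-bit location reaches a value by pairing one of
// the predicates above with the wait routine, e.g.
//
//   volatile kmp_uint32 flag = 0;   // hypothetical flag set by another thread
//   ...
//   (void)__kmp_wait_yield_4( &flag, 1, __kmp_eq_4, NULL );
//
// which returns the observed value once __kmp_eq_4( flag, 1 ) holds, yielding
// the processor while it waits.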
} // extern "C"

#ifdef KMP_GOMP_COMPAT

void
__kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                           kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
                           kmp_int32 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
                                      push_ws );
}

void
__kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                            kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
                            kmp_int32 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
                                       push_ws );
}

void
__kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                           kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
                           kmp_int64 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
                                      push_ws );
}

void
__kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                            kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
                            kmp_int64 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
                                       push_ws );
}

void
__kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
}

void
__kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
}

void
__kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
}

void
__kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
}

#endif /* KMP_GOMP_COMPAT */

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */