1 /* 2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch. 3 */ 4 5 6 //===----------------------------------------------------------------------===// 7 // 8 // The LLVM Compiler Infrastructure 9 // 10 // This file is dual licensed under the MIT and the University of Illinois Open 11 // Source Licenses. See LICENSE.txt for details. 12 // 13 //===----------------------------------------------------------------------===// 14 15 16 /* 17 * Dynamic scheduling initialization and dispatch. 18 * 19 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however 20 * it may change values between parallel regions. __kmp_max_nth 21 * is the largest value __kmp_nth may take, 1 is the smallest. 22 * 23 */ 24 25 /* ------------------------------------------------------------------------ */ 26 /* ------------------------------------------------------------------------ */ 27 28 #include "kmp.h" 29 #include "kmp_i18n.h" 30 #include "kmp_itt.h" 31 #include "kmp_str.h" 32 #include "kmp_error.h" 33 #include "kmp_stats.h" 34 #if KMP_OS_WINDOWS && KMP_ARCH_X86 35 #include <float.h> 36 #endif 37 38 #if OMPT_SUPPORT 39 #include "ompt-internal.h" 40 #include "ompt-specific.h" 41 #endif 42 43 /* ------------------------------------------------------------------------ */ 44 /* ------------------------------------------------------------------------ */ 45 46 // template for type limits 47 template< typename T > 48 struct i_maxmin { 49 static const T mx; 50 static const T mn; 51 }; 52 template<> 53 struct i_maxmin< int > { 54 static const int mx = 0x7fffffff; 55 static const int mn = 0x80000000; 56 }; 57 template<> 58 struct i_maxmin< unsigned int > { 59 static const unsigned int mx = 0xffffffff; 60 static const unsigned int mn = 0x00000000; 61 }; 62 template<> 63 struct i_maxmin< long long > { 64 static const long long mx = 0x7fffffffffffffffLL; 65 static const long long mn = 0x8000000000000000LL; 66 }; 67 template<> 68 struct i_maxmin< unsigned long long > { 69 static const unsigned long long mx = 0xffffffffffffffffLL; 70 static const unsigned long long mn = 0x0000000000000000LL; 71 }; 72 //------------------------------------------------------------------------- 73 74 #ifdef KMP_STATIC_STEAL_ENABLED 75 76 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types 77 template< typename T > 78 struct dispatch_private_infoXX_template { 79 typedef typename traits_t< T >::unsigned_t UT; 80 typedef typename traits_t< T >::signed_t ST; 81 UT count; // unsigned 82 T ub; 83 /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */ 84 T lb; 85 ST st; // signed 86 UT tc; // unsigned 87 T static_steal_counter; // for static_steal only; maybe better to put after ub 88 89 /* parm[1-4] are used in different ways by different scheduling algorithms */ 90 91 // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on ) 92 // a) parm3 is properly aligned and 93 // b) all parm1-4 are in the same cache line. 94 // Because of parm1-4 are used together, performance seems to be better 95 // if they are in the same line (not measured though). 96 97 struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4 98 T parm1; 99 T parm2; 100 T parm3; 101 T parm4; 102 }; 103 104 UT ordered_lower; // unsigned 105 UT ordered_upper; // unsigned 106 #if KMP_OS_WINDOWS 107 T last_upper; 108 #endif /* KMP_OS_WINDOWS */ 109 }; 110 111 #else /* KMP_STATIC_STEAL_ENABLED */ 112 113 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types 114 template< typename T > 115 struct dispatch_private_infoXX_template { 116 typedef typename traits_t< T >::unsigned_t UT; 117 typedef typename traits_t< T >::signed_t ST; 118 T lb; 119 T ub; 120 ST st; // signed 121 UT tc; // unsigned 122 123 T parm1; 124 T parm2; 125 T parm3; 126 T parm4; 127 128 UT count; // unsigned 129 130 UT ordered_lower; // unsigned 131 UT ordered_upper; // unsigned 132 #if KMP_OS_WINDOWS 133 T last_upper; 134 #endif /* KMP_OS_WINDOWS */ 135 }; 136 137 #endif /* KMP_STATIC_STEAL_ENABLED */ 138 139 // replaces dispatch_private_info structure and dispatch_private_info_t type 140 template< typename T > 141 struct KMP_ALIGN_CACHE dispatch_private_info_template { 142 // duplicate alignment here, otherwise size of structure is not correct in our compiler 143 union KMP_ALIGN_CACHE private_info_tmpl { 144 dispatch_private_infoXX_template< T > p; 145 dispatch_private_info64_t p64; 146 } u; 147 enum sched_type schedule; /* scheduling algorithm */ 148 kmp_uint32 ordered; /* ordered clause specified */ 149 kmp_uint32 ordered_bumped; 150 kmp_int32 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making order 151 dispatch_private_info * next; /* stack of buffers for nest of serial regions */ 152 kmp_uint32 nomerge; /* don't merge iters if serialized */ 153 kmp_uint32 type_size; 154 enum cons_type pushed_ws; 155 }; 156 157 158 // replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types 159 template< typename UT > 160 struct dispatch_shared_infoXX_template { 161 /* chunk index under dynamic, number of idle threads under static-steal; 162 iteration index otherwise */ 163 volatile UT iteration; 164 volatile UT num_done; 165 volatile UT ordered_iteration; 166 UT ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar 167 }; 168 169 // replaces dispatch_shared_info structure and dispatch_shared_info_t type 170 template< typename UT > 171 struct dispatch_shared_info_template { 172 // we need union here to keep the structure size 173 union shared_info_tmpl { 174 dispatch_shared_infoXX_template< UT > s; 175 dispatch_shared_info64_t s64; 176 } u; 177 volatile kmp_uint32 buffer_index; 178 }; 179 180 /* ------------------------------------------------------------------------ */ 181 /* ------------------------------------------------------------------------ */ 182 183 #undef USE_TEST_LOCKS 184 185 // test_then_add template (general template should NOT be used) 186 template< typename T > 187 static __forceinline T 188 test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); }; 189 190 template<> 191 __forceinline kmp_int32 192 test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d ) 193 { 194 kmp_int32 r; 195 r = KMP_TEST_THEN_ADD32( p, d ); 196 return r; 197 } 198 199 template<> 200 __forceinline kmp_int64 201 test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d ) 202 { 203 kmp_int64 r; 204 r = KMP_TEST_THEN_ADD64( p, d ); 205 return r; 206 } 207 208 // test_then_inc_acq template (general template should NOT be used) 209 template< typename T > 210 static __forceinline T 211 test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); }; 212 213 template<> 214 __forceinline kmp_int32 215 test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p ) 216 { 217 kmp_int32 r; 218 r = KMP_TEST_THEN_INC_ACQ32( p ); 219 return r; 220 } 221 222 template<> 223 __forceinline kmp_int64 224 test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p ) 225 { 226 kmp_int64 r; 227 r = KMP_TEST_THEN_INC_ACQ64( p ); 228 return r; 229 } 230 231 // test_then_inc template (general template should NOT be used) 232 template< typename T > 233 static __forceinline T 234 test_then_inc( volatile T *p ) { KMP_ASSERT(0); }; 235 236 template<> 237 __forceinline kmp_int32 238 test_then_inc< kmp_int32 >( volatile kmp_int32 *p ) 239 { 240 kmp_int32 r; 241 r = KMP_TEST_THEN_INC32( p ); 242 return r; 243 } 244 245 template<> 246 __forceinline kmp_int64 247 test_then_inc< kmp_int64 >( volatile kmp_int64 *p ) 248 { 249 kmp_int64 r; 250 r = KMP_TEST_THEN_INC64( p ); 251 return r; 252 } 253 254 // compare_and_swap template (general template should NOT be used) 255 template< typename T > 256 static __forceinline kmp_int32 257 compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); }; 258 259 template<> 260 __forceinline kmp_int32 261 compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s ) 262 { 263 return KMP_COMPARE_AND_STORE_REL32( p, c, s ); 264 } 265 266 template<> 267 __forceinline kmp_int32 268 compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s ) 269 { 270 return KMP_COMPARE_AND_STORE_REL64( p, c, s ); 271 } 272 273 /* 274 Spin wait loop that first does pause, then yield. 275 Waits until function returns non-zero when called with *spinner and check. 276 Does NOT put threads to sleep. 277 #if USE_ITT_BUILD 278 Arguments: 279 obj -- is higher-level synchronization object to report to ittnotify. It is used to report 280 locks consistently. For example, if lock is acquired immediately, its address is 281 reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, it lock cannot be acquired 282 immediately and lock routine calls to KMP_WAIT_YIELD(), the later should report the same 283 address, not an address of low-level spinner. 284 #endif // USE_ITT_BUILD 285 */ 286 template< typename UT > 287 // ToDo: make inline function (move to header file for icl) 288 static UT // unsigned 4- or 8-byte type 289 __kmp_wait_yield( volatile UT * spinner, 290 UT checker, 291 kmp_uint32 (* pred)( UT, UT ) 292 USE_ITT_BUILD_ARG(void * obj) // Higher-level synchronization object, or NULL. 293 ) 294 { 295 // note: we may not belong to a team at this point 296 register volatile UT * spin = spinner; 297 register UT check = checker; 298 register kmp_uint32 spins; 299 register kmp_uint32 (*f) ( UT, UT ) = pred; 300 register UT r; 301 302 KMP_FSYNC_SPIN_INIT( obj, (void*) spin ); 303 KMP_INIT_YIELD( spins ); 304 // main wait spin loop 305 while(!f(r = *spin, check)) 306 { 307 KMP_FSYNC_SPIN_PREPARE( obj ); 308 /* GEH - remove this since it was accidentally introduced when kmp_wait was split. 309 It causes problems with infinite recursion because of exit lock */ 310 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 311 __kmp_abort_thread(); */ 312 313 // if we are oversubscribed, 314 // or have waited a bit (and KMP_LIBRARY=throughput, then yield 315 // pause is in the following code 316 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc ); 317 KMP_YIELD_SPIN( spins ); 318 } 319 KMP_FSYNC_SPIN_ACQUIRED( obj ); 320 return r; 321 } 322 323 template< typename UT > 324 static kmp_uint32 __kmp_eq( UT value, UT checker) { 325 return value == checker; 326 } 327 328 template< typename UT > 329 static kmp_uint32 __kmp_neq( UT value, UT checker) { 330 return value != checker; 331 } 332 333 template< typename UT > 334 static kmp_uint32 __kmp_lt( UT value, UT checker) { 335 return value < checker; 336 } 337 338 template< typename UT > 339 static kmp_uint32 __kmp_ge( UT value, UT checker) { 340 return value >= checker; 341 } 342 343 template< typename UT > 344 static kmp_uint32 __kmp_le( UT value, UT checker) { 345 return value <= checker; 346 } 347 348 349 /* ------------------------------------------------------------------------ */ 350 /* ------------------------------------------------------------------------ */ 351 352 static void 353 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 354 { 355 kmp_info_t *th; 356 357 KMP_DEBUG_ASSERT( gtid_ref ); 358 359 if ( __kmp_env_consistency_check ) { 360 th = __kmp_threads[*gtid_ref]; 361 if ( th -> th.th_root -> r.r_active 362 && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) { 363 #if KMP_USE_DYNAMIC_LOCK 364 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 ); 365 #else 366 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL ); 367 #endif 368 } 369 } 370 } 371 372 template< typename UT > 373 static void 374 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 375 { 376 typedef typename traits_t< UT >::signed_t ST; 377 dispatch_private_info_template< UT > * pr; 378 379 int gtid = *gtid_ref; 380 // int cid = *cid_ref; 381 kmp_info_t *th = __kmp_threads[ gtid ]; 382 KMP_DEBUG_ASSERT( th -> th.th_dispatch ); 383 384 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) ); 385 if ( __kmp_env_consistency_check ) { 386 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 387 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 388 if ( pr -> pushed_ws != ct_none ) { 389 #if KMP_USE_DYNAMIC_LOCK 390 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 ); 391 #else 392 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL ); 393 #endif 394 } 395 } 396 397 if ( ! th -> th.th_team -> t.t_serialized ) { 398 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 399 ( th -> th.th_dispatch -> th_dispatch_sh_current ); 400 UT lower; 401 402 if ( ! __kmp_env_consistency_check ) { 403 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 404 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 405 } 406 lower = pr->u.p.ordered_lower; 407 408 #if ! defined( KMP_GOMP_COMPAT ) 409 if ( __kmp_env_consistency_check ) { 410 if ( pr->ordered_bumped ) { 411 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons; 412 __kmp_error_construct2( 413 kmp_i18n_msg_CnsMultipleNesting, 414 ct_ordered_in_pdo, loc_ref, 415 & p->stack_data[ p->w_top ] 416 ); 417 } 418 } 419 #endif /* !defined(KMP_GOMP_COMPAT) */ 420 421 KMP_MB(); 422 #ifdef KMP_DEBUG 423 { 424 const char * buff; 425 // create format specifiers before the debug output 426 buff = __kmp_str_format( 427 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n", 428 traits_t< UT >::spec, traits_t< UT >::spec ); 429 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 430 __kmp_str_free( &buff ); 431 } 432 #endif 433 434 __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 435 USE_ITT_BUILD_ARG( NULL ) 436 ); 437 KMP_MB(); /* is this necessary? */ 438 #ifdef KMP_DEBUG 439 { 440 const char * buff; 441 // create format specifiers before the debug output 442 buff = __kmp_str_format( 443 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n", 444 traits_t< UT >::spec, traits_t< UT >::spec ); 445 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 446 __kmp_str_free( &buff ); 447 } 448 #endif 449 } 450 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) ); 451 } 452 453 static void 454 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 455 { 456 kmp_info_t *th; 457 458 if ( __kmp_env_consistency_check ) { 459 th = __kmp_threads[*gtid_ref]; 460 if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) { 461 __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref ); 462 } 463 } 464 } 465 466 template< typename UT > 467 static void 468 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 469 { 470 typedef typename traits_t< UT >::signed_t ST; 471 dispatch_private_info_template< UT > * pr; 472 473 int gtid = *gtid_ref; 474 // int cid = *cid_ref; 475 kmp_info_t *th = __kmp_threads[ gtid ]; 476 KMP_DEBUG_ASSERT( th -> th.th_dispatch ); 477 478 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) ); 479 if ( __kmp_env_consistency_check ) { 480 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 481 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 482 if ( pr -> pushed_ws != ct_none ) { 483 __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref ); 484 } 485 } 486 487 if ( ! th -> th.th_team -> t.t_serialized ) { 488 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 489 ( th -> th.th_dispatch -> th_dispatch_sh_current ); 490 491 if ( ! __kmp_env_consistency_check ) { 492 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 493 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 494 } 495 496 KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration ); 497 #if ! defined( KMP_GOMP_COMPAT ) 498 if ( __kmp_env_consistency_check ) { 499 if ( pr->ordered_bumped != 0 ) { 500 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons; 501 /* How to test it? - OM */ 502 __kmp_error_construct2( 503 kmp_i18n_msg_CnsMultipleNesting, 504 ct_ordered_in_pdo, loc_ref, 505 & p->stack_data[ p->w_top ] 506 ); 507 } 508 } 509 #endif /* !defined(KMP_GOMP_COMPAT) */ 510 511 KMP_MB(); /* Flush all pending memory write invalidates. */ 512 513 pr->ordered_bumped += 1; 514 515 KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n", 516 gtid, pr->ordered_bumped ) ); 517 518 KMP_MB(); /* Flush all pending memory write invalidates. */ 519 520 /* TODO use general release procedure? */ 521 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration ); 522 523 KMP_MB(); /* Flush all pending memory write invalidates. */ 524 } 525 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) ); 526 } 527 528 /* Computes and returns x to the power of y, where y must a non-negative integer */ 529 template< typename UT > 530 static __forceinline long double 531 __kmp_pow(long double x, UT y) { 532 long double s=1.0L; 533 534 KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0); 535 //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned 536 while(y) { 537 if ( y & 1 ) 538 s *= x; 539 x *= x; 540 y >>= 1; 541 } 542 return s; 543 } 544 545 /* Computes and returns the number of unassigned iterations after idx chunks have been assigned 546 (the total number of unassigned iterations in chunks with index greater than or equal to idx). 547 __forceinline seems to be broken so that if we __forceinline this function, the behavior is wrong 548 (one of the unit tests, sch_guided_analytical_basic.cpp, fails) 549 */ 550 template< typename T > 551 static __inline typename traits_t< T >::unsigned_t 552 __kmp_dispatch_guided_remaining( 553 T tc, 554 typename traits_t< T >::floating_t base, 555 typename traits_t< T >::unsigned_t idx 556 ) { 557 /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at 558 least for ICL 8.1, long double arithmetic may not really have 559 long double precision, even with /Qlong_double. Currently, we 560 workaround that in the caller code, by manipulating the FPCW for 561 Windows* OS on IA-32 architecture. The lack of precision is not 562 expected to be a correctness issue, though. 563 */ 564 typedef typename traits_t< T >::unsigned_t UT; 565 566 long double x = tc * __kmp_pow< UT >(base, idx); 567 UT r = (UT) x; 568 if ( x == r ) 569 return r; 570 return r + 1; 571 } 572 573 // Parameters of the guided-iterative algorithm: 574 // p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic 575 // p3 = 1 / ( n * nproc ) // remaining iterations multiplier 576 // by default n = 2. For example with n = 3 the chunks distribution will be more flat. 577 // With n = 1 first chunk is the same as for static schedule, e.g. trip / nproc. 578 static int guided_int_param = 2; 579 static double guided_flt_param = 0.5;// = 1.0 / guided_int_param; 580 581 // UT - unsigned flavor of T, ST - signed flavor of T, 582 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8 583 template< typename T > 584 static void 585 __kmp_dispatch_init( 586 ident_t * loc, 587 int gtid, 588 enum sched_type schedule, 589 T lb, 590 T ub, 591 typename traits_t< T >::signed_t st, 592 typename traits_t< T >::signed_t chunk, 593 int push_ws 594 ) { 595 typedef typename traits_t< T >::unsigned_t UT; 596 typedef typename traits_t< T >::signed_t ST; 597 typedef typename traits_t< T >::floating_t DBL; 598 static const int ___kmp_size_type = sizeof( UT ); 599 600 int active; 601 T tc; 602 kmp_info_t * th; 603 kmp_team_t * team; 604 kmp_uint32 my_buffer_index; 605 dispatch_private_info_template< T > * pr; 606 dispatch_shared_info_template< UT > volatile * sh; 607 608 KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) ); 609 KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) ); 610 611 if ( ! TCR_4( __kmp_init_parallel ) ) 612 __kmp_parallel_initialize(); 613 614 #if INCLUDE_SSC_MARKS 615 SSC_MARK_DISPATCH_INIT(); 616 #endif 617 #ifdef KMP_DEBUG 618 { 619 const char * buff; 620 // create format specifiers before the debug output 621 buff = __kmp_str_format( 622 "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n", 623 traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 624 KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) ); 625 __kmp_str_free( &buff ); 626 } 627 #endif 628 /* setup data */ 629 th = __kmp_threads[ gtid ]; 630 team = th -> th.th_team; 631 active = ! team -> t.t_serialized; 632 th->th.th_ident = loc; 633 634 #if USE_ITT_BUILD 635 kmp_uint64 cur_chunk = chunk; 636 int itt_need_metadata_reporting = __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 637 KMP_MASTER_GTID(gtid) && 638 #if OMP_40_ENABLED 639 th->th.th_teams_microtask == NULL && 640 #endif 641 team->t.t_active_level == 1; 642 #endif 643 if ( ! active ) { 644 pr = reinterpret_cast< dispatch_private_info_template< T >* > 645 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */ 646 } else { 647 KMP_DEBUG_ASSERT( th->th.th_dispatch == 648 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 649 650 my_buffer_index = th->th.th_dispatch->th_disp_index ++; 651 652 /* What happens when number of threads changes, need to resize buffer? */ 653 pr = reinterpret_cast< dispatch_private_info_template< T > * > 654 ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] ); 655 sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * > 656 ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] ); 657 } 658 659 /* Pick up the nomerge/ordered bits from the scheduling type */ 660 if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) { 661 pr->nomerge = TRUE; 662 schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); 663 } else { 664 pr->nomerge = FALSE; 665 } 666 pr->type_size = ___kmp_size_type; // remember the size of variables 667 if ( kmp_ord_lower & schedule ) { 668 pr->ordered = TRUE; 669 schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); 670 } else { 671 pr->ordered = FALSE; 672 } 673 if ( schedule == kmp_sch_static ) { 674 schedule = __kmp_static; 675 } else { 676 if ( schedule == kmp_sch_runtime ) { 677 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified) 678 schedule = team -> t.t_sched.r_sched_type; 679 // Detail the schedule if needed (global controls are differentiated appropriately) 680 if ( schedule == kmp_sch_guided_chunked ) { 681 schedule = __kmp_guided; 682 } else if ( schedule == kmp_sch_static ) { 683 schedule = __kmp_static; 684 } 685 // Use the chunk size specified by OMP_SCHEDULE (or default if not specified) 686 chunk = team -> t.t_sched.chunk; 687 688 #ifdef KMP_DEBUG 689 { 690 const char * buff; 691 // create format specifiers before the debug output 692 buff = __kmp_str_format( 693 "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n", 694 traits_t< ST >::spec ); 695 KD_TRACE(10, ( buff, gtid, schedule, chunk ) ); 696 __kmp_str_free( &buff ); 697 } 698 #endif 699 } else { 700 if ( schedule == kmp_sch_guided_chunked ) { 701 schedule = __kmp_guided; 702 } 703 if ( chunk <= 0 ) { 704 chunk = KMP_DEFAULT_CHUNK; 705 } 706 } 707 708 if ( schedule == kmp_sch_auto ) { 709 // mapping and differentiation: in the __kmp_do_serial_initialize() 710 schedule = __kmp_auto; 711 #ifdef KMP_DEBUG 712 { 713 const char * buff; 714 // create format specifiers before the debug output 715 buff = __kmp_str_format( 716 "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n", 717 traits_t< ST >::spec ); 718 KD_TRACE(10, ( buff, gtid, schedule, chunk ) ); 719 __kmp_str_free( &buff ); 720 } 721 #endif 722 } 723 724 /* guided analytical not safe for too many threads */ 725 if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) { 726 schedule = kmp_sch_guided_iterative_chunked; 727 KMP_WARNING( DispatchManyThreads ); 728 } 729 pr->u.p.parm1 = chunk; 730 } 731 KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper), 732 "unknown scheduling type" ); 733 734 pr->u.p.count = 0; 735 736 if ( __kmp_env_consistency_check ) { 737 if ( st == 0 ) { 738 __kmp_error_construct( 739 kmp_i18n_msg_CnsLoopIncrZeroProhibited, 740 ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc 741 ); 742 } 743 } 744 745 tc = ( ub - lb + st ); 746 if ( st != 1 ) { 747 if ( st < 0 ) { 748 if ( lb < ub ) { 749 tc = 0; // zero-trip 750 } else { // lb >= ub 751 tc = (ST)tc / st; // convert to signed division 752 } 753 } else { // st > 0 754 if ( ub < lb ) { 755 tc = 0; // zero-trip 756 } else { // lb >= ub 757 tc /= st; 758 } 759 } 760 } else if ( ub < lb ) { // st == 1 761 tc = 0; // zero-trip 762 } 763 764 pr->u.p.lb = lb; 765 pr->u.p.ub = ub; 766 pr->u.p.st = st; 767 pr->u.p.tc = tc; 768 769 #if KMP_OS_WINDOWS 770 pr->u.p.last_upper = ub + st; 771 #endif /* KMP_OS_WINDOWS */ 772 773 /* NOTE: only the active parallel region(s) has active ordered sections */ 774 775 if ( active ) { 776 if ( pr->ordered == 0 ) { 777 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error; 778 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error; 779 } else { 780 pr->ordered_bumped = 0; 781 782 pr->u.p.ordered_lower = 1; 783 pr->u.p.ordered_upper = 0; 784 785 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >; 786 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >; 787 } 788 } 789 790 if ( __kmp_env_consistency_check ) { 791 enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo; 792 if ( push_ws ) { 793 __kmp_push_workshare( gtid, ws, loc ); 794 pr->pushed_ws = ws; 795 } else { 796 __kmp_check_workshare( gtid, ws, loc ); 797 pr->pushed_ws = ct_none; 798 } 799 } 800 801 switch ( schedule ) { 802 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 803 case kmp_sch_static_steal: 804 { 805 T nproc = team->t.t_nproc; 806 T ntc, init; 807 808 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) ); 809 810 ntc = (tc % chunk ? 1 : 0) + tc / chunk; 811 if ( nproc > 1 && ntc >= nproc ) { 812 T id = __kmp_tid_from_gtid(gtid); 813 T small_chunk, extras; 814 815 small_chunk = ntc / nproc; 816 extras = ntc % nproc; 817 818 init = id * small_chunk + ( id < extras ? id : extras ); 819 pr->u.p.count = init; 820 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 ); 821 822 pr->u.p.parm2 = lb; 823 //pr->pfields.parm3 = 0; // it's not used in static_steal 824 pr->u.p.parm4 = id; 825 pr->u.p.st = st; 826 break; 827 } else { 828 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n", 829 gtid ) ); 830 schedule = kmp_sch_static_balanced; 831 /* too few iterations: fall-through to kmp_sch_static_balanced */ 832 } // if 833 /* FALL-THROUGH to static balanced */ 834 } // case 835 #endif 836 case kmp_sch_static_balanced: 837 { 838 T nproc = team->t.t_nproc; 839 T init, limit; 840 841 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n", 842 gtid ) ); 843 844 if ( nproc > 1 ) { 845 T id = __kmp_tid_from_gtid(gtid); 846 847 if ( tc < nproc ) { 848 if ( id < tc ) { 849 init = id; 850 limit = id; 851 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ 852 } else { 853 pr->u.p.count = 1; /* means no more chunks to execute */ 854 pr->u.p.parm1 = FALSE; 855 break; 856 } 857 } else { 858 T small_chunk = tc / nproc; 859 T extras = tc % nproc; 860 init = id * small_chunk + (id < extras ? id : extras); 861 limit = init + small_chunk - (id < extras ? 0 : 1); 862 pr->u.p.parm1 = (id == nproc - 1); 863 } 864 } else { 865 if ( tc > 0 ) { 866 init = 0; 867 limit = tc - 1; 868 pr->u.p.parm1 = TRUE; 869 } else { 870 // zero trip count 871 pr->u.p.count = 1; /* means no more chunks to execute */ 872 pr->u.p.parm1 = FALSE; 873 break; 874 } 875 } 876 #if USE_ITT_BUILD 877 // Calculate chunk for metadata report 878 if ( itt_need_metadata_reporting ) 879 cur_chunk = limit - init + 1; 880 #endif 881 if ( st == 1 ) { 882 pr->u.p.lb = lb + init; 883 pr->u.p.ub = lb + limit; 884 } else { 885 T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound 886 pr->u.p.lb = lb + init * st; 887 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly 888 if ( st > 0 ) { 889 pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp ); 890 } else { 891 pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp ); 892 } 893 } 894 if ( pr->ordered ) { 895 pr->u.p.ordered_lower = init; 896 pr->u.p.ordered_upper = limit; 897 } 898 break; 899 } // case 900 case kmp_sch_guided_iterative_chunked : 901 { 902 T nproc = team->t.t_nproc; 903 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid)); 904 905 if ( nproc > 1 ) { 906 if ( (2L * chunk + 1 ) * nproc >= tc ) { 907 /* chunk size too large, switch to dynamic */ 908 schedule = kmp_sch_dynamic_chunked; 909 } else { 910 // when remaining iters become less than parm2 - switch to dynamic 911 pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 ); 912 *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4 913 } 914 } else { 915 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid)); 916 schedule = kmp_sch_static_greedy; 917 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 918 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid)); 919 pr->u.p.parm1 = tc; 920 } // if 921 } // case 922 break; 923 case kmp_sch_guided_analytical_chunked: 924 { 925 T nproc = team->t.t_nproc; 926 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid)); 927 928 if ( nproc > 1 ) { 929 if ( (2L * chunk + 1 ) * nproc >= tc ) { 930 /* chunk size too large, switch to dynamic */ 931 schedule = kmp_sch_dynamic_chunked; 932 } else { 933 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 934 DBL x; 935 936 #if KMP_OS_WINDOWS && KMP_ARCH_X86 937 /* Linux* OS already has 64-bit computation by default for 938 long double, and on Windows* OS on Intel(R) 64, 939 /Qlong_double doesn't work. On Windows* OS 940 on IA-32 architecture, we need to set precision to 941 64-bit instead of the default 53-bit. Even though long 942 double doesn't work on Windows* OS on Intel(R) 64, the 943 resulting lack of precision is not expected to impact 944 the correctness of the algorithm, but this has not been 945 mathematically proven. 946 */ 947 // save original FPCW and set precision to 64-bit, as 948 // Windows* OS on IA-32 architecture defaults to 53-bit 949 unsigned int oldFpcw = _control87(0,0); 950 _control87(_PC_64,_MCW_PC); // 0,0x30000 951 #endif 952 /* value used for comparison in solver for cross-over point */ 953 long double target = ((long double)chunk * 2 + 1) * nproc / tc; 954 955 /* crossover point--chunk indexes equal to or greater than 956 this point switch to dynamic-style scheduling */ 957 UT cross; 958 959 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 960 x = (long double)1.0 - (long double)0.5 / nproc; 961 962 #ifdef KMP_DEBUG 963 { // test natural alignment 964 struct _test_a { 965 char a; 966 union { 967 char b; 968 DBL d; 969 }; 970 } t; 971 ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; 972 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment ); 973 KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 ); 974 } 975 #endif // KMP_DEBUG 976 977 /* save the term in thread private dispatch structure */ 978 *(DBL*)&pr->u.p.parm3 = x; 979 980 /* solve for the crossover point to the nearest integer i for which C_i <= chunk */ 981 { 982 UT left, right, mid; 983 long double p; 984 985 /* estimate initial upper and lower bound */ 986 987 /* doesn't matter what value right is as long as it is positive, but 988 it affects performance of the solver 989 */ 990 right = 229; 991 p = __kmp_pow< UT >(x,right); 992 if ( p > target ) { 993 do{ 994 p *= p; 995 right <<= 1; 996 } while(p>target && right < (1<<27)); 997 left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */ 998 } else { 999 left = 0; 1000 } 1001 1002 /* bisection root-finding method */ 1003 while ( left + 1 < right ) { 1004 mid = (left + right) / 2; 1005 if ( __kmp_pow< UT >(x,mid) > target ) { 1006 left = mid; 1007 } else { 1008 right = mid; 1009 } 1010 } // while 1011 cross = right; 1012 } 1013 /* assert sanity of computed crossover point */ 1014 KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target); 1015 1016 /* save the crossover point in thread private dispatch structure */ 1017 pr->u.p.parm2 = cross; 1018 1019 // C75803 1020 #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) ) 1021 #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3) 1022 #else 1023 #define GUIDED_ANALYTICAL_WORKAROUND (x) 1024 #endif 1025 /* dynamic-style scheduling offset */ 1026 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk; 1027 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1028 // restore FPCW 1029 _control87(oldFpcw,_MCW_PC); 1030 #endif 1031 } // if 1032 } else { 1033 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n", 1034 gtid ) ); 1035 schedule = kmp_sch_static_greedy; 1036 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 1037 pr->u.p.parm1 = tc; 1038 } // if 1039 } // case 1040 break; 1041 case kmp_sch_static_greedy: 1042 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid)); 1043 pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ? 1044 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc : 1045 tc; 1046 break; 1047 case kmp_sch_static_chunked : 1048 case kmp_sch_dynamic_chunked : 1049 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid)); 1050 break; 1051 case kmp_sch_trapezoidal : 1052 { 1053 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ 1054 1055 T parm1, parm2, parm3, parm4; 1056 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) ); 1057 1058 parm1 = chunk; 1059 1060 /* F : size of the first cycle */ 1061 parm2 = ( tc / (2 * team->t.t_nproc) ); 1062 1063 if ( parm2 < 1 ) { 1064 parm2 = 1; 1065 } 1066 1067 /* L : size of the last cycle. Make sure the last cycle 1068 * is not larger than the first cycle. 1069 */ 1070 if ( parm1 < 1 ) { 1071 parm1 = 1; 1072 } else if ( parm1 > parm2 ) { 1073 parm1 = parm2; 1074 } 1075 1076 /* N : number of cycles */ 1077 parm3 = ( parm2 + parm1 ); 1078 parm3 = ( 2 * tc + parm3 - 1) / parm3; 1079 1080 if ( parm3 < 2 ) { 1081 parm3 = 2; 1082 } 1083 1084 /* sigma : decreasing incr of the trapezoid */ 1085 parm4 = ( parm3 - 1 ); 1086 parm4 = ( parm2 - parm1 ) / parm4; 1087 1088 // pointless check, because parm4 >= 0 always 1089 //if ( parm4 < 0 ) { 1090 // parm4 = 0; 1091 //} 1092 1093 pr->u.p.parm1 = parm1; 1094 pr->u.p.parm2 = parm2; 1095 pr->u.p.parm3 = parm3; 1096 pr->u.p.parm4 = parm4; 1097 } // case 1098 break; 1099 1100 default: 1101 { 1102 __kmp_msg( 1103 kmp_ms_fatal, // Severity 1104 KMP_MSG( UnknownSchedTypeDetected ), // Primary message 1105 KMP_HNT( GetNewerLibrary ), // Hint 1106 __kmp_msg_null // Variadic argument list terminator 1107 ); 1108 } 1109 break; 1110 } // switch 1111 pr->schedule = schedule; 1112 if ( active ) { 1113 /* The name of this buffer should be my_buffer_index when it's free to use it */ 1114 1115 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n", 1116 gtid, my_buffer_index, sh->buffer_index) ); 1117 __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 > 1118 USE_ITT_BUILD_ARG( NULL ) 1119 ); 1120 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are 1121 // *always* 32-bit integers. 1122 KMP_MB(); /* is this necessary? */ 1123 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n", 1124 gtid, my_buffer_index, sh->buffer_index) ); 1125 1126 th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr; 1127 th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh; 1128 #if USE_ITT_BUILD 1129 if ( pr->ordered ) { 1130 __kmp_itt_ordered_init( gtid ); 1131 }; // if 1132 // Report loop metadata 1133 if ( itt_need_metadata_reporting ) { 1134 // Only report metadata by master of active team at level 1 1135 kmp_uint64 schedtype = 0; 1136 switch ( schedule ) { 1137 case kmp_sch_static_chunked: 1138 case kmp_sch_static_balanced:// Chunk is calculated in the switch above 1139 break; 1140 case kmp_sch_static_greedy: 1141 cur_chunk = pr->u.p.parm1; 1142 break; 1143 case kmp_sch_dynamic_chunked: 1144 schedtype = 1; 1145 break; 1146 case kmp_sch_guided_iterative_chunked: 1147 case kmp_sch_guided_analytical_chunked: 1148 schedtype = 2; 1149 break; 1150 default: 1151 // Should we put this case under "static"? 1152 // case kmp_sch_static_steal: 1153 schedtype = 3; 1154 break; 1155 } 1156 __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk); 1157 } 1158 #endif /* USE_ITT_BUILD */ 1159 }; // if 1160 1161 #ifdef KMP_DEBUG 1162 { 1163 const char * buff; 1164 // create format specifiers before the debug output 1165 buff = __kmp_str_format( 1166 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \ 1167 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \ 1168 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n", 1169 traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec, 1170 traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec, 1171 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec, 1172 traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec ); 1173 KD_TRACE(10, ( buff, 1174 gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub, 1175 pr->u.p.st, pr->u.p.tc, pr->u.p.count, 1176 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1, 1177 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) ); 1178 __kmp_str_free( &buff ); 1179 } 1180 #endif 1181 #if ( KMP_STATIC_STEAL_ENABLED ) 1182 if ( ___kmp_size_type < 8 ) { 1183 // It cannot be guaranteed that after execution of a loop with some other schedule kind 1184 // all the parm3 variables will contain the same value. 1185 // Even if all parm3 will be the same, it still exists a bad case like using 0 and 1 1186 // rather than program life-time increment. 1187 // So the dedicated variable is required. The 'static_steal_counter' is used. 1188 if( schedule == kmp_sch_static_steal ) { 1189 // Other threads will inspect this variable when searching for a victim. 1190 // This is a flag showing that other threads may steal from this thread since then. 1191 volatile T * p = &pr->u.p.static_steal_counter; 1192 *p = *p + 1; 1193 } 1194 } 1195 #endif // ( KMP_STATIC_STEAL_ENABLED && USE_STEALING ) 1196 1197 #if OMPT_SUPPORT && OMPT_TRACE 1198 if ((ompt_status == ompt_status_track_callback) && 1199 ompt_callbacks.ompt_callback(ompt_event_loop_begin)) { 1200 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 1201 ompt_task_info_t *task_info = __ompt_get_taskinfo(0); 1202 ompt_callbacks.ompt_callback(ompt_event_loop_begin)( 1203 team_info->parallel_id, task_info->task_id, team_info->microtask); 1204 } 1205 #endif 1206 } 1207 1208 /* 1209 * For ordered loops, either __kmp_dispatch_finish() should be called after 1210 * every iteration, or __kmp_dispatch_finish_chunk() should be called after 1211 * every chunk of iterations. If the ordered section(s) were not executed 1212 * for this iteration (or every iteration in this chunk), we need to set the 1213 * ordered iteration counters so that the next thread can proceed. 1214 */ 1215 template< typename UT > 1216 static void 1217 __kmp_dispatch_finish( int gtid, ident_t *loc ) 1218 { 1219 typedef typename traits_t< UT >::signed_t ST; 1220 kmp_info_t *th = __kmp_threads[ gtid ]; 1221 1222 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) ); 1223 if ( ! th -> th.th_team -> t.t_serialized ) { 1224 1225 dispatch_private_info_template< UT > * pr = 1226 reinterpret_cast< dispatch_private_info_template< UT >* > 1227 ( th->th.th_dispatch->th_dispatch_pr_current ); 1228 dispatch_shared_info_template< UT > volatile * sh = 1229 reinterpret_cast< dispatch_shared_info_template< UT >volatile* > 1230 ( th->th.th_dispatch->th_dispatch_sh_current ); 1231 KMP_DEBUG_ASSERT( pr ); 1232 KMP_DEBUG_ASSERT( sh ); 1233 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1234 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1235 1236 if ( pr->ordered_bumped ) { 1237 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1238 gtid ) ); 1239 pr->ordered_bumped = 0; 1240 } else { 1241 UT lower = pr->u.p.ordered_lower; 1242 1243 #ifdef KMP_DEBUG 1244 { 1245 const char * buff; 1246 // create format specifiers before the debug output 1247 buff = __kmp_str_format( 1248 "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n", 1249 traits_t< UT >::spec, traits_t< UT >::spec ); 1250 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 1251 __kmp_str_free( &buff ); 1252 } 1253 #endif 1254 1255 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 1256 USE_ITT_BUILD_ARG(NULL) 1257 ); 1258 KMP_MB(); /* is this necessary? */ 1259 #ifdef KMP_DEBUG 1260 { 1261 const char * buff; 1262 // create format specifiers before the debug output 1263 buff = __kmp_str_format( 1264 "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n", 1265 traits_t< UT >::spec, traits_t< UT >::spec ); 1266 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 1267 __kmp_str_free( &buff ); 1268 } 1269 #endif 1270 1271 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration ); 1272 } // if 1273 } // if 1274 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) ); 1275 } 1276 1277 #ifdef KMP_GOMP_COMPAT 1278 1279 template< typename UT > 1280 static void 1281 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc ) 1282 { 1283 typedef typename traits_t< UT >::signed_t ST; 1284 kmp_info_t *th = __kmp_threads[ gtid ]; 1285 1286 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) ); 1287 if ( ! th -> th.th_team -> t.t_serialized ) { 1288 // int cid; 1289 dispatch_private_info_template< UT > * pr = 1290 reinterpret_cast< dispatch_private_info_template< UT >* > 1291 ( th->th.th_dispatch->th_dispatch_pr_current ); 1292 dispatch_shared_info_template< UT > volatile * sh = 1293 reinterpret_cast< dispatch_shared_info_template< UT >volatile* > 1294 ( th->th.th_dispatch->th_dispatch_sh_current ); 1295 KMP_DEBUG_ASSERT( pr ); 1296 KMP_DEBUG_ASSERT( sh ); 1297 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1298 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1299 1300 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) { 1301 UT lower = pr->u.p.ordered_lower; 1302 UT upper = pr->u.p.ordered_upper; 1303 UT inc = upper - lower + 1; 1304 1305 if ( pr->ordered_bumped == inc ) { 1306 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1307 gtid ) ); 1308 pr->ordered_bumped = 0; 1309 } else { 1310 inc -= pr->ordered_bumped; 1311 1312 #ifdef KMP_DEBUG 1313 { 1314 const char * buff; 1315 // create format specifiers before the debug output 1316 buff = __kmp_str_format( 1317 "__kmp_dispatch_finish_chunk: T#%%d before wait: " \ 1318 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n", 1319 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec ); 1320 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) ); 1321 __kmp_str_free( &buff ); 1322 } 1323 #endif 1324 1325 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 1326 USE_ITT_BUILD_ARG(NULL) 1327 ); 1328 1329 KMP_MB(); /* is this necessary? */ 1330 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n", 1331 gtid ) ); 1332 pr->ordered_bumped = 0; 1333 //!!!!! TODO check if the inc should be unsigned, or signed??? 1334 #ifdef KMP_DEBUG 1335 { 1336 const char * buff; 1337 // create format specifiers before the debug output 1338 buff = __kmp_str_format( 1339 "__kmp_dispatch_finish_chunk: T#%%d after wait: " \ 1340 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n", 1341 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec ); 1342 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) ); 1343 __kmp_str_free( &buff ); 1344 } 1345 #endif 1346 1347 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc); 1348 } 1349 // } 1350 } 1351 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) ); 1352 } 1353 1354 #endif /* KMP_GOMP_COMPAT */ 1355 1356 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 1357 * (no more work), then tell OMPT the loop is over. In some cases 1358 * kmp_dispatch_fini() is not called. */ 1359 #if OMPT_SUPPORT && OMPT_TRACE 1360 #define OMPT_LOOP_END \ 1361 if (status == 0) { \ 1362 if ((ompt_status == ompt_status_track_callback) && \ 1363 ompt_callbacks.ompt_callback(ompt_event_loop_end)) { \ 1364 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \ 1365 ompt_task_info_t *task_info = __ompt_get_taskinfo(0); \ 1366 ompt_callbacks.ompt_callback(ompt_event_loop_end)( \ 1367 team_info->parallel_id, task_info->task_id); \ 1368 } \ 1369 } 1370 #else 1371 #define OMPT_LOOP_END // no-op 1372 #endif 1373 1374 template< typename T > 1375 static int 1376 __kmp_dispatch_next( 1377 ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st 1378 ) { 1379 1380 typedef typename traits_t< T >::unsigned_t UT; 1381 typedef typename traits_t< T >::signed_t ST; 1382 typedef typename traits_t< T >::floating_t DBL; 1383 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 1384 static const int ___kmp_size_type = sizeof( UT ); 1385 #endif 1386 1387 int status; 1388 dispatch_private_info_template< T > * pr; 1389 kmp_info_t * th = __kmp_threads[ gtid ]; 1390 kmp_team_t * team = th -> th.th_team; 1391 1392 KMP_DEBUG_ASSERT( p_lb && p_ub && p_st ); // AC: these cannot be NULL 1393 #ifdef KMP_DEBUG 1394 { 1395 const char * buff; 1396 // create format specifiers before the debug output 1397 buff = __kmp_str_format( 1398 "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n", 1399 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 1400 KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) ); 1401 __kmp_str_free( &buff ); 1402 } 1403 #endif 1404 1405 if ( team -> t.t_serialized ) { 1406 /* NOTE: serialize this dispatch becase we are not at the active level */ 1407 pr = reinterpret_cast< dispatch_private_info_template< T >* > 1408 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */ 1409 KMP_DEBUG_ASSERT( pr ); 1410 1411 if ( (status = (pr->u.p.tc != 0)) == 0 ) { 1412 *p_lb = 0; 1413 *p_ub = 0; 1414 // if ( p_last != NULL ) 1415 // *p_last = 0; 1416 if ( p_st != NULL ) 1417 *p_st = 0; 1418 if ( __kmp_env_consistency_check ) { 1419 if ( pr->pushed_ws != ct_none ) { 1420 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 1421 } 1422 } 1423 } else if ( pr->nomerge ) { 1424 kmp_int32 last; 1425 T start; 1426 UT limit, trip, init; 1427 ST incr; 1428 T chunk = pr->u.p.parm1; 1429 1430 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) ); 1431 1432 init = chunk * pr->u.p.count++; 1433 trip = pr->u.p.tc - 1; 1434 1435 if ( (status = (init <= trip)) == 0 ) { 1436 *p_lb = 0; 1437 *p_ub = 0; 1438 // if ( p_last != NULL ) 1439 // *p_last = 0; 1440 if ( p_st != NULL ) 1441 *p_st = 0; 1442 if ( __kmp_env_consistency_check ) { 1443 if ( pr->pushed_ws != ct_none ) { 1444 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 1445 } 1446 } 1447 } else { 1448 start = pr->u.p.lb; 1449 limit = chunk + init - 1; 1450 incr = pr->u.p.st; 1451 1452 if ( (last = (limit >= trip)) != 0 ) { 1453 limit = trip; 1454 #if KMP_OS_WINDOWS 1455 pr->u.p.last_upper = pr->u.p.ub; 1456 #endif /* KMP_OS_WINDOWS */ 1457 } 1458 if ( p_last != NULL ) 1459 *p_last = last; 1460 if ( p_st != NULL ) 1461 *p_st = incr; 1462 if ( incr == 1 ) { 1463 *p_lb = start + init; 1464 *p_ub = start + limit; 1465 } else { 1466 *p_lb = start + init * incr; 1467 *p_ub = start + limit * incr; 1468 } 1469 1470 if ( pr->ordered ) { 1471 pr->u.p.ordered_lower = init; 1472 pr->u.p.ordered_upper = limit; 1473 #ifdef KMP_DEBUG 1474 { 1475 const char * buff; 1476 // create format specifiers before the debug output 1477 buff = __kmp_str_format( 1478 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1479 traits_t< UT >::spec, traits_t< UT >::spec ); 1480 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1481 __kmp_str_free( &buff ); 1482 } 1483 #endif 1484 } // if 1485 } // if 1486 } else { 1487 pr->u.p.tc = 0; 1488 *p_lb = pr->u.p.lb; 1489 *p_ub = pr->u.p.ub; 1490 #if KMP_OS_WINDOWS 1491 pr->u.p.last_upper = *p_ub; 1492 #endif /* KMP_OS_WINDOWS */ 1493 if ( p_last != NULL ) 1494 *p_last = TRUE; 1495 if ( p_st != NULL ) 1496 *p_st = pr->u.p.st; 1497 } // if 1498 #ifdef KMP_DEBUG 1499 { 1500 const char * buff; 1501 // create format specifiers before the debug output 1502 buff = __kmp_str_format( 1503 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \ 1504 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", 1505 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 1506 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) ); 1507 __kmp_str_free( &buff ); 1508 } 1509 #endif 1510 #if INCLUDE_SSC_MARKS 1511 SSC_MARK_DISPATCH_NEXT(); 1512 #endif 1513 OMPT_LOOP_END; 1514 return status; 1515 } else { 1516 kmp_int32 last = 0; 1517 dispatch_shared_info_template< UT > *sh; 1518 T start; 1519 ST incr; 1520 UT limit, trip, init; 1521 1522 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1523 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1524 1525 pr = reinterpret_cast< dispatch_private_info_template< T >* > 1526 ( th->th.th_dispatch->th_dispatch_pr_current ); 1527 KMP_DEBUG_ASSERT( pr ); 1528 sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 1529 ( th->th.th_dispatch->th_dispatch_sh_current ); 1530 KMP_DEBUG_ASSERT( sh ); 1531 1532 if ( pr->u.p.tc == 0 ) { 1533 // zero trip count 1534 status = 0; 1535 } else { 1536 switch (pr->schedule) { 1537 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 1538 case kmp_sch_static_steal: 1539 { 1540 T chunk = pr->u.p.parm1; 1541 1542 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) ); 1543 1544 trip = pr->u.p.tc - 1; 1545 1546 if ( ___kmp_size_type > 4 ) { 1547 // Other threads do not look into the data of this thread, 1548 // so it's not necessary to make volatile casting. 1549 init = ( pr->u.p.count )++; 1550 status = ( init < (UT)pr->u.p.ub ); 1551 } else { 1552 typedef union { 1553 struct { 1554 UT count; 1555 T ub; 1556 } p; 1557 kmp_int64 b; 1558 } union_i4; 1559 // All operations on 'count' or 'ub' must be combined atomically together. 1560 // stealing implemented only for 4-byte indexes 1561 { 1562 union_i4 vold, vnew; 1563 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count); 1564 vnew = vold; 1565 vnew.p.count++; 1566 while( ! KMP_COMPARE_AND_STORE_ACQ64( 1567 ( volatile kmp_int64* )&pr->u.p.count, 1568 *VOLATILE_CAST(kmp_int64 *)&vold.b, 1569 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) { 1570 KMP_CPU_PAUSE(); 1571 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count); 1572 vnew = vold; 1573 vnew.p.count++; 1574 } 1575 vnew = vold; 1576 init = vnew.p.count; 1577 status = ( init < (UT)vnew.p.ub ) ; 1578 } 1579 1580 if( !status ) { 1581 kmp_info_t **other_threads = team->t.t_threads; 1582 int while_limit = 10; 1583 int while_index = 0; 1584 1585 // TODO: algorithm of searching for a victim 1586 // should be cleaned up and measured 1587 while ( ( !status ) && ( while_limit != ++while_index ) ) { 1588 union_i4 vold, vnew; 1589 kmp_int32 remaining; // kmp_int32 because KMP_I4 only 1590 T victimIdx = pr->u.p.parm4; 1591 T oldVictimIdx = victimIdx; 1592 dispatch_private_info_template< T > * victim; 1593 1594 do { 1595 if( !victimIdx ) { 1596 victimIdx = team->t.t_nproc - 1; 1597 } else { 1598 --victimIdx; 1599 } 1600 victim = reinterpret_cast< dispatch_private_info_template< T >* > 1601 ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current ); 1602 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx ); 1603 // TODO: think about a proper place of this test 1604 if ( ( !victim ) || 1605 ( (*( volatile T * )&victim->u.p.static_steal_counter) != 1606 (*( volatile T * )&pr->u.p.static_steal_counter) ) ) { 1607 // TODO: delay would be nice 1608 continue; 1609 // the victim is not ready yet to participate in stealing 1610 // because the victim is still in kmp_init_dispatch 1611 } 1612 if ( oldVictimIdx == victimIdx ) { 1613 break; 1614 } 1615 pr->u.p.parm4 = victimIdx; 1616 1617 while( 1 ) { 1618 vold.b = *( volatile kmp_int64 * )( &victim->u.p.count ); 1619 vnew = vold; 1620 1621 KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip ); 1622 if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) { 1623 break; 1624 } 1625 vnew.p.ub -= (remaining >> 2); 1626 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1627 #pragma warning( push ) 1628 // disable warning on pointless comparison of unsigned with 0 1629 #pragma warning( disable: 186 ) 1630 KMP_DEBUG_ASSERT(vnew.p.ub >= 0); 1631 #pragma warning( pop ) 1632 // TODO: Should this be acquire or release? 1633 if ( KMP_COMPARE_AND_STORE_ACQ64( 1634 ( volatile kmp_int64 * )&victim->u.p.count, 1635 *VOLATILE_CAST(kmp_int64 *)&vold.b, 1636 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) { 1637 status = 1; 1638 while_index = 0; 1639 // now update own count and ub 1640 #if KMP_ARCH_X86 1641 // stealing executed on non-KMP_ARCH_X86 only 1642 // Atomic 64-bit write on ia32 is 1643 // unavailable, so we do this in steps. 1644 // This code is not tested. 1645 init = vold.p.count; 1646 pr->u.p.ub = 0; 1647 pr->u.p.count = init + 1; 1648 pr->u.p.ub = vnew.p.count; 1649 #else 1650 init = vnew.p.ub; 1651 vold.p.count = init + 1; 1652 // TODO: is it safe and enough? 1653 *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b; 1654 #endif // KMP_ARCH_X86 1655 break; 1656 } // if 1657 KMP_CPU_PAUSE(); 1658 } // while (1) 1659 } // while 1660 } // if 1661 } // if 1662 if ( !status ) { 1663 *p_lb = 0; 1664 *p_ub = 0; 1665 if ( p_st != NULL ) *p_st = 0; 1666 } else { 1667 start = pr->u.p.parm2; 1668 init *= chunk; 1669 limit = chunk + init - 1; 1670 incr = pr->u.p.st; 1671 1672 KMP_DEBUG_ASSERT(init <= trip); 1673 if ( (last = (limit >= trip)) != 0 ) 1674 limit = trip; 1675 if ( p_st != NULL ) *p_st = incr; 1676 1677 if ( incr == 1 ) { 1678 *p_lb = start + init; 1679 *p_ub = start + limit; 1680 } else { 1681 *p_lb = start + init * incr; 1682 *p_ub = start + limit * incr; 1683 } 1684 1685 if ( pr->ordered ) { 1686 pr->u.p.ordered_lower = init; 1687 pr->u.p.ordered_upper = limit; 1688 #ifdef KMP_DEBUG 1689 { 1690 const char * buff; 1691 // create format specifiers before the debug output 1692 buff = __kmp_str_format( 1693 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1694 traits_t< UT >::spec, traits_t< UT >::spec ); 1695 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1696 __kmp_str_free( &buff ); 1697 } 1698 #endif 1699 } // if 1700 } // if 1701 break; 1702 } // case 1703 #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 1704 case kmp_sch_static_balanced: 1705 { 1706 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) ); 1707 if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */ 1708 pr->u.p.count = 1; 1709 *p_lb = pr->u.p.lb; 1710 *p_ub = pr->u.p.ub; 1711 last = pr->u.p.parm1; 1712 if ( p_st != NULL ) 1713 *p_st = pr->u.p.st; 1714 } else { /* no iterations to do */ 1715 pr->u.p.lb = pr->u.p.ub + pr->u.p.st; 1716 } 1717 if ( pr->ordered ) { 1718 #ifdef KMP_DEBUG 1719 { 1720 const char * buff; 1721 // create format specifiers before the debug output 1722 buff = __kmp_str_format( 1723 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1724 traits_t< UT >::spec, traits_t< UT >::spec ); 1725 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1726 __kmp_str_free( &buff ); 1727 } 1728 #endif 1729 } // if 1730 } // case 1731 break; 1732 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */ 1733 case kmp_sch_static_chunked: 1734 { 1735 T parm1; 1736 1737 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n", 1738 gtid ) ); 1739 parm1 = pr->u.p.parm1; 1740 1741 trip = pr->u.p.tc - 1; 1742 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid)); 1743 1744 if ( (status = (init <= trip)) != 0 ) { 1745 start = pr->u.p.lb; 1746 incr = pr->u.p.st; 1747 limit = parm1 + init - 1; 1748 1749 if ( (last = (limit >= trip)) != 0 ) 1750 limit = trip; 1751 1752 if ( p_st != NULL ) *p_st = incr; 1753 1754 pr->u.p.count += team->t.t_nproc; 1755 1756 if ( incr == 1 ) { 1757 *p_lb = start + init; 1758 *p_ub = start + limit; 1759 } 1760 else { 1761 *p_lb = start + init * incr; 1762 *p_ub = start + limit * incr; 1763 } 1764 1765 if ( pr->ordered ) { 1766 pr->u.p.ordered_lower = init; 1767 pr->u.p.ordered_upper = limit; 1768 #ifdef KMP_DEBUG 1769 { 1770 const char * buff; 1771 // create format specifiers before the debug output 1772 buff = __kmp_str_format( 1773 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1774 traits_t< UT >::spec, traits_t< UT >::spec ); 1775 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1776 __kmp_str_free( &buff ); 1777 } 1778 #endif 1779 } // if 1780 } // if 1781 } // case 1782 break; 1783 1784 case kmp_sch_dynamic_chunked: 1785 { 1786 T chunk = pr->u.p.parm1; 1787 1788 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", 1789 gtid ) ); 1790 1791 init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration ); 1792 trip = pr->u.p.tc - 1; 1793 1794 if ( (status = (init <= trip)) == 0 ) { 1795 *p_lb = 0; 1796 *p_ub = 0; 1797 if ( p_st != NULL ) *p_st = 0; 1798 } else { 1799 start = pr->u.p.lb; 1800 limit = chunk + init - 1; 1801 incr = pr->u.p.st; 1802 1803 if ( (last = (limit >= trip)) != 0 ) 1804 limit = trip; 1805 1806 if ( p_st != NULL ) *p_st = incr; 1807 1808 if ( incr == 1 ) { 1809 *p_lb = start + init; 1810 *p_ub = start + limit; 1811 } else { 1812 *p_lb = start + init * incr; 1813 *p_ub = start + limit * incr; 1814 } 1815 1816 if ( pr->ordered ) { 1817 pr->u.p.ordered_lower = init; 1818 pr->u.p.ordered_upper = limit; 1819 #ifdef KMP_DEBUG 1820 { 1821 const char * buff; 1822 // create format specifiers before the debug output 1823 buff = __kmp_str_format( 1824 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1825 traits_t< UT >::spec, traits_t< UT >::spec ); 1826 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1827 __kmp_str_free( &buff ); 1828 } 1829 #endif 1830 } // if 1831 } // if 1832 } // case 1833 break; 1834 1835 case kmp_sch_guided_iterative_chunked: 1836 { 1837 T chunkspec = pr->u.p.parm1; 1838 KD_TRACE(100, 1839 ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid)); 1840 trip = pr->u.p.tc; 1841 // Start atomic part of calculations 1842 while(1) { 1843 ST remaining; // signed, because can be < 0 1844 init = sh->u.s.iteration; // shared value 1845 remaining = trip - init; 1846 if ( remaining <= 0 ) { // AC: need to compare with 0 first 1847 // nothing to do, don't try atomic op 1848 status = 0; 1849 break; 1850 } 1851 if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default 1852 // use dynamic-style shcedule 1853 // atomically inrement iterations, get old value 1854 init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec ); 1855 remaining = trip - init; 1856 if (remaining <= 0) { 1857 status = 0; // all iterations got by other threads 1858 } else { 1859 // got some iterations to work on 1860 status = 1; 1861 if ( (T)remaining > chunkspec ) { 1862 limit = init + chunkspec - 1; 1863 } else { 1864 last = 1; // the last chunk 1865 limit = init + remaining - 1; 1866 } // if 1867 } // if 1868 break; 1869 } // if 1870 limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc 1871 if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) { 1872 // CAS was successful, chunk obtained 1873 status = 1; 1874 --limit; 1875 break; 1876 } // if 1877 } // while 1878 if ( status != 0 ) { 1879 start = pr->u.p.lb; 1880 incr = pr->u.p.st; 1881 if ( p_st != NULL ) 1882 *p_st = incr; 1883 *p_lb = start + init * incr; 1884 *p_ub = start + limit * incr; 1885 if ( pr->ordered ) { 1886 pr->u.p.ordered_lower = init; 1887 pr->u.p.ordered_upper = limit; 1888 #ifdef KMP_DEBUG 1889 { 1890 const char * buff; 1891 // create format specifiers before the debug output 1892 buff = __kmp_str_format( 1893 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1894 traits_t< UT >::spec, traits_t< UT >::spec ); 1895 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1896 __kmp_str_free( &buff ); 1897 } 1898 #endif 1899 } // if 1900 } else { 1901 *p_lb = 0; 1902 *p_ub = 0; 1903 if ( p_st != NULL ) 1904 *p_st = 0; 1905 } // if 1906 } // case 1907 break; 1908 1909 case kmp_sch_guided_analytical_chunked: 1910 { 1911 T chunkspec = pr->u.p.parm1; 1912 UT chunkIdx; 1913 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1914 /* for storing original FPCW value for Windows* OS on 1915 IA-32 architecture 8-byte version */ 1916 unsigned int oldFpcw; 1917 unsigned int fpcwSet = 0; 1918 #endif 1919 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n", 1920 gtid ) ); 1921 1922 trip = pr->u.p.tc; 1923 1924 KMP_DEBUG_ASSERT(team->t.t_nproc > 1); 1925 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip); 1926 1927 while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */ 1928 chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration ); 1929 if ( chunkIdx >= (UT)pr->u.p.parm2 ) { 1930 --trip; 1931 /* use dynamic-style scheduling */ 1932 init = chunkIdx * chunkspec + pr->u.p.count; 1933 /* need to verify init > 0 in case of overflow in the above calculation */ 1934 if ( (status = (init > 0 && init <= trip)) != 0 ) { 1935 limit = init + chunkspec -1; 1936 1937 if ( (last = (limit >= trip)) != 0 ) 1938 limit = trip; 1939 } 1940 break; 1941 } else { 1942 /* use exponential-style scheduling */ 1943 /* The following check is to workaround the lack of long double precision on Windows* OS. 1944 This check works around the possible effect that init != 0 for chunkIdx == 0. 1945 */ 1946 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1947 /* If we haven't already done so, save original 1948 FPCW and set precision to 64-bit, as Windows* OS 1949 on IA-32 architecture defaults to 53-bit */ 1950 if ( !fpcwSet ) { 1951 oldFpcw = _control87(0,0); 1952 _control87(_PC_64,_MCW_PC); 1953 fpcwSet = 0x30000; 1954 } 1955 #endif 1956 if ( chunkIdx ) { 1957 init = __kmp_dispatch_guided_remaining< T >( 1958 trip, *( DBL * )&pr->u.p.parm3, chunkIdx ); 1959 KMP_DEBUG_ASSERT(init); 1960 init = trip - init; 1961 } else 1962 init = 0; 1963 limit = trip - __kmp_dispatch_guided_remaining< T >( 1964 trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 ); 1965 KMP_ASSERT(init <= limit); 1966 if ( init < limit ) { 1967 KMP_DEBUG_ASSERT(limit <= trip); 1968 --limit; 1969 status = 1; 1970 break; 1971 } // if 1972 } // if 1973 } // while (1) 1974 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1975 /* restore FPCW if necessary 1976 AC: check fpcwSet flag first because oldFpcw can be uninitialized here 1977 */ 1978 if ( fpcwSet && ( oldFpcw & fpcwSet ) ) 1979 _control87(oldFpcw,_MCW_PC); 1980 #endif 1981 if ( status != 0 ) { 1982 start = pr->u.p.lb; 1983 incr = pr->u.p.st; 1984 if ( p_st != NULL ) 1985 *p_st = incr; 1986 *p_lb = start + init * incr; 1987 *p_ub = start + limit * incr; 1988 if ( pr->ordered ) { 1989 pr->u.p.ordered_lower = init; 1990 pr->u.p.ordered_upper = limit; 1991 #ifdef KMP_DEBUG 1992 { 1993 const char * buff; 1994 // create format specifiers before the debug output 1995 buff = __kmp_str_format( 1996 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1997 traits_t< UT >::spec, traits_t< UT >::spec ); 1998 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1999 __kmp_str_free( &buff ); 2000 } 2001 #endif 2002 } 2003 } else { 2004 *p_lb = 0; 2005 *p_ub = 0; 2006 if ( p_st != NULL ) 2007 *p_st = 0; 2008 } 2009 } // case 2010 break; 2011 2012 case kmp_sch_trapezoidal: 2013 { 2014 UT index; 2015 T parm2 = pr->u.p.parm2; 2016 T parm3 = pr->u.p.parm3; 2017 T parm4 = pr->u.p.parm4; 2018 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n", 2019 gtid ) ); 2020 2021 index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration ); 2022 2023 init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2; 2024 trip = pr->u.p.tc - 1; 2025 2026 if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) { 2027 *p_lb = 0; 2028 *p_ub = 0; 2029 if ( p_st != NULL ) *p_st = 0; 2030 } else { 2031 start = pr->u.p.lb; 2032 limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1; 2033 incr = pr->u.p.st; 2034 2035 if ( (last = (limit >= trip)) != 0 ) 2036 limit = trip; 2037 2038 if ( p_st != NULL ) *p_st = incr; 2039 2040 if ( incr == 1 ) { 2041 *p_lb = start + init; 2042 *p_ub = start + limit; 2043 } else { 2044 *p_lb = start + init * incr; 2045 *p_ub = start + limit * incr; 2046 } 2047 2048 if ( pr->ordered ) { 2049 pr->u.p.ordered_lower = init; 2050 pr->u.p.ordered_upper = limit; 2051 #ifdef KMP_DEBUG 2052 { 2053 const char * buff; 2054 // create format specifiers before the debug output 2055 buff = __kmp_str_format( 2056 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 2057 traits_t< UT >::spec, traits_t< UT >::spec ); 2058 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 2059 __kmp_str_free( &buff ); 2060 } 2061 #endif 2062 } // if 2063 } // if 2064 } // case 2065 break; 2066 default: 2067 { 2068 status = 0; // to avoid complaints on uninitialized variable use 2069 __kmp_msg( 2070 kmp_ms_fatal, // Severity 2071 KMP_MSG( UnknownSchedTypeDetected ), // Primary message 2072 KMP_HNT( GetNewerLibrary ), // Hint 2073 __kmp_msg_null // Variadic argument list terminator 2074 ); 2075 } 2076 break; 2077 } // switch 2078 } // if tc == 0; 2079 2080 if ( status == 0 ) { 2081 UT num_done; 2082 2083 num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done ); 2084 #ifdef KMP_DEBUG 2085 { 2086 const char * buff; 2087 // create format specifiers before the debug output 2088 buff = __kmp_str_format( 2089 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", 2090 traits_t< UT >::spec ); 2091 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) ); 2092 __kmp_str_free( &buff ); 2093 } 2094 #endif 2095 2096 if ( (ST)num_done == team->t.t_nproc-1 ) { 2097 /* NOTE: release this buffer to be reused */ 2098 2099 KMP_MB(); /* Flush all pending memory write invalidates. */ 2100 2101 sh->u.s.num_done = 0; 2102 sh->u.s.iteration = 0; 2103 2104 /* TODO replace with general release procedure? */ 2105 if ( pr->ordered ) { 2106 sh->u.s.ordered_iteration = 0; 2107 } 2108 2109 KMP_MB(); /* Flush all pending memory write invalidates. */ 2110 2111 sh -> buffer_index += KMP_MAX_DISP_BUF; 2112 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", 2113 gtid, sh->buffer_index) ); 2114 2115 KMP_MB(); /* Flush all pending memory write invalidates. */ 2116 2117 } // if 2118 if ( __kmp_env_consistency_check ) { 2119 if ( pr->pushed_ws != ct_none ) { 2120 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 2121 } 2122 } 2123 2124 th -> th.th_dispatch -> th_deo_fcn = NULL; 2125 th -> th.th_dispatch -> th_dxo_fcn = NULL; 2126 th -> th.th_dispatch -> th_dispatch_sh_current = NULL; 2127 th -> th.th_dispatch -> th_dispatch_pr_current = NULL; 2128 } // if (status == 0) 2129 #if KMP_OS_WINDOWS 2130 else if ( last ) { 2131 pr->u.p.last_upper = pr->u.p.ub; 2132 } 2133 #endif /* KMP_OS_WINDOWS */ 2134 if ( p_last != NULL && status != 0 ) 2135 *p_last = last; 2136 } // if 2137 2138 #ifdef KMP_DEBUG 2139 { 2140 const char * buff; 2141 // create format specifiers before the debug output 2142 buff = __kmp_str_format( 2143 "__kmp_dispatch_next: T#%%d normal case: " \ 2144 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n", 2145 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 2146 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) ); 2147 __kmp_str_free( &buff ); 2148 } 2149 #endif 2150 #if INCLUDE_SSC_MARKS 2151 SSC_MARK_DISPATCH_NEXT(); 2152 #endif 2153 OMPT_LOOP_END; 2154 return status; 2155 } 2156 2157 template< typename T > 2158 static void 2159 __kmp_dist_get_bounds( 2160 ident_t *loc, 2161 kmp_int32 gtid, 2162 kmp_int32 *plastiter, 2163 T *plower, 2164 T *pupper, 2165 typename traits_t< T >::signed_t incr 2166 ) { 2167 KMP_COUNT_BLOCK(OMP_DISTR_FOR_dynamic); 2168 typedef typename traits_t< T >::unsigned_t UT; 2169 typedef typename traits_t< T >::signed_t ST; 2170 register kmp_uint32 team_id; 2171 register kmp_uint32 nteams; 2172 register UT trip_count; 2173 register kmp_team_t *team; 2174 kmp_info_t * th; 2175 2176 KMP_DEBUG_ASSERT( plastiter && plower && pupper ); 2177 KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid)); 2178 #ifdef KMP_DEBUG 2179 { 2180 const char * buff; 2181 // create format specifiers before the debug output 2182 buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\ 2183 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n", 2184 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec, 2185 traits_t< T >::spec ); 2186 KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) ); 2187 __kmp_str_free( &buff ); 2188 } 2189 #endif 2190 2191 if( __kmp_env_consistency_check ) { 2192 if( incr == 0 ) { 2193 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc ); 2194 } 2195 if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) { 2196 // The loop is illegal. 2197 // Some zero-trip loops maintained by compiler, e.g.: 2198 // for(i=10;i<0;++i) // lower >= upper - run-time check 2199 // for(i=0;i>10;--i) // lower <= upper - run-time check 2200 // for(i=0;i>10;++i) // incr > 0 - compile-time check 2201 // for(i=10;i<0;--i) // incr < 0 - compile-time check 2202 // Compiler does not check the following illegal loops: 2203 // for(i=0;i<10;i+=incr) // where incr<0 2204 // for(i=10;i>0;i-=incr) // where incr<0 2205 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc ); 2206 } 2207 } 2208 th = __kmp_threads[gtid]; 2209 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct 2210 team = th->th.th_team; 2211 #if OMP_40_ENABLED 2212 nteams = th->th.th_teams_size.nteams; 2213 #endif 2214 team_id = team->t.t_master_tid; 2215 KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc); 2216 2217 // compute global trip count 2218 if( incr == 1 ) { 2219 trip_count = *pupper - *plower + 1; 2220 } else if(incr == -1) { 2221 trip_count = *plower - *pupper + 1; 2222 } else { 2223 trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case 2224 } 2225 if( trip_count <= nteams ) { 2226 KMP_DEBUG_ASSERT( 2227 __kmp_static == kmp_sch_static_greedy || \ 2228 __kmp_static == kmp_sch_static_balanced 2229 ); // Unknown static scheduling type. 2230 // only some teams get single iteration, others get nothing 2231 if( team_id < trip_count ) { 2232 *pupper = *plower = *plower + team_id * incr; 2233 } else { 2234 *plower = *pupper + incr; // zero-trip loop 2235 } 2236 if( plastiter != NULL ) 2237 *plastiter = ( team_id == trip_count - 1 ); 2238 } else { 2239 if( __kmp_static == kmp_sch_static_balanced ) { 2240 register UT chunk = trip_count / nteams; 2241 register UT extras = trip_count % nteams; 2242 *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) ); 2243 *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr ); 2244 if( plastiter != NULL ) 2245 *plastiter = ( team_id == nteams - 1 ); 2246 } else { 2247 register T chunk_inc_count = 2248 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr; 2249 register T upper = *pupper; 2250 KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy ); 2251 // Unknown static scheduling type. 2252 *plower += team_id * chunk_inc_count; 2253 *pupper = *plower + chunk_inc_count - incr; 2254 // Check/correct bounds if needed 2255 if( incr > 0 ) { 2256 if( *pupper < *plower ) 2257 *pupper = i_maxmin< T >::mx; 2258 if( plastiter != NULL ) 2259 *plastiter = *plower <= upper && *pupper > upper - incr; 2260 if( *pupper > upper ) 2261 *pupper = upper; // tracker C73258 2262 } else { 2263 if( *pupper > *plower ) 2264 *pupper = i_maxmin< T >::mn; 2265 if( plastiter != NULL ) 2266 *plastiter = *plower >= upper && *pupper < upper - incr; 2267 if( *pupper < upper ) 2268 *pupper = upper; // tracker C73258 2269 } 2270 } 2271 } 2272 } 2273 2274 //----------------------------------------------------------------------------------------- 2275 // Dispatch routines 2276 // Transfer call to template< type T > 2277 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule, 2278 // T lb, T ub, ST st, ST chunk ) 2279 extern "C" { 2280 2281 /*! 2282 @ingroup WORK_SHARING 2283 @{ 2284 @param loc Source location 2285 @param gtid Global thread id 2286 @param schedule Schedule type 2287 @param lb Lower bound 2288 @param ub Upper bound 2289 @param st Step (or increment if you prefer) 2290 @param chunk The chunk size to block with 2291 2292 This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments. 2293 These functions are all identical apart from the types of the arguments. 2294 */ 2295 2296 void 2297 __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2298 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk ) 2299 { 2300 KMP_COUNT_BLOCK(OMP_FOR_dynamic); 2301 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2302 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2303 } 2304 /*! 2305 See @ref __kmpc_dispatch_init_4 2306 */ 2307 void 2308 __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2309 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk ) 2310 { 2311 KMP_COUNT_BLOCK(OMP_FOR_dynamic); 2312 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2313 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2314 } 2315 2316 /*! 2317 See @ref __kmpc_dispatch_init_4 2318 */ 2319 void 2320 __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2321 kmp_int64 lb, kmp_int64 ub, 2322 kmp_int64 st, kmp_int64 chunk ) 2323 { 2324 KMP_COUNT_BLOCK(OMP_FOR_dynamic); 2325 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2326 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2327 } 2328 2329 /*! 2330 See @ref __kmpc_dispatch_init_4 2331 */ 2332 void 2333 __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2334 kmp_uint64 lb, kmp_uint64 ub, 2335 kmp_int64 st, kmp_int64 chunk ) 2336 { 2337 KMP_COUNT_BLOCK(OMP_FOR_dynamic); 2338 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2339 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2340 } 2341 2342 /*! 2343 See @ref __kmpc_dispatch_init_4 2344 2345 Difference from __kmpc_dispatch_init set of functions is these functions 2346 are called for composite distribute parallel for construct. Thus before 2347 regular iterations dispatching we need to calc per-team iteration space. 2348 2349 These functions are all identical apart from the types of the arguments. 2350 */ 2351 void 2352 __kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2353 kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk ) 2354 { 2355 KMP_COUNT_BLOCK(OMP_FOR_dynamic); 2356 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2357 __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st ); 2358 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2359 } 2360 2361 void 2362 __kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2363 kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk ) 2364 { 2365 KMP_COUNT_BLOCK(OMP_FOR_dynamic); 2366 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2367 __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st ); 2368 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2369 } 2370 2371 void 2372 __kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2373 kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk ) 2374 { 2375 KMP_COUNT_BLOCK(OMP_FOR_dynamic); 2376 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2377 __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st ); 2378 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2379 } 2380 2381 void 2382 __kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2383 kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk ) 2384 { 2385 KMP_COUNT_BLOCK(OMP_FOR_dynamic); 2386 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2387 __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st ); 2388 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2389 } 2390 2391 /*! 2392 @param loc Source code location 2393 @param gtid Global thread id 2394 @param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise 2395 @param p_lb Pointer to the lower bound for the next chunk of work 2396 @param p_ub Pointer to the upper bound for the next chunk of work 2397 @param p_st Pointer to the stride for the next chunk of work 2398 @return one if there is work to be done, zero otherwise 2399 2400 Get the next dynamically allocated chunk of work for this thread. 2401 If there is no more work, then the lb,ub and stride need not be modified. 2402 */ 2403 int 2404 __kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2405 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st ) 2406 { 2407 return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st ); 2408 } 2409 2410 /*! 2411 See @ref __kmpc_dispatch_next_4 2412 */ 2413 int 2414 __kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2415 kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st ) 2416 { 2417 return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st ); 2418 } 2419 2420 /*! 2421 See @ref __kmpc_dispatch_next_4 2422 */ 2423 int 2424 __kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2425 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st ) 2426 { 2427 return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st ); 2428 } 2429 2430 /*! 2431 See @ref __kmpc_dispatch_next_4 2432 */ 2433 int 2434 __kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2435 kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st ) 2436 { 2437 return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st ); 2438 } 2439 2440 /*! 2441 @param loc Source code location 2442 @param gtid Global thread id 2443 2444 Mark the end of a dynamic loop. 2445 */ 2446 void 2447 __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid ) 2448 { 2449 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc ); 2450 } 2451 2452 /*! 2453 See @ref __kmpc_dispatch_fini_4 2454 */ 2455 void 2456 __kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid ) 2457 { 2458 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc ); 2459 } 2460 2461 /*! 2462 See @ref __kmpc_dispatch_fini_4 2463 */ 2464 void 2465 __kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid ) 2466 { 2467 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc ); 2468 } 2469 2470 /*! 2471 See @ref __kmpc_dispatch_fini_4 2472 */ 2473 void 2474 __kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid ) 2475 { 2476 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc ); 2477 } 2478 /*! @} */ 2479 2480 //----------------------------------------------------------------------------------------- 2481 //Non-template routines from kmp_dispatch.c used in other sources 2482 2483 kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) { 2484 return value == checker; 2485 } 2486 2487 kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) { 2488 return value != checker; 2489 } 2490 2491 kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) { 2492 return value < checker; 2493 } 2494 2495 kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) { 2496 return value >= checker; 2497 } 2498 2499 kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) { 2500 return value <= checker; 2501 } 2502 kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) { 2503 return value == checker; 2504 } 2505 2506 kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) { 2507 return value != checker; 2508 } 2509 2510 kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) { 2511 return value < checker; 2512 } 2513 2514 kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) { 2515 return value >= checker; 2516 } 2517 2518 kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) { 2519 return value <= checker; 2520 } 2521 2522 kmp_uint32 2523 __kmp_wait_yield_4(volatile kmp_uint32 * spinner, 2524 kmp_uint32 checker, 2525 kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 ) 2526 , void * obj // Higher-level synchronization object, or NULL. 2527 ) 2528 { 2529 // note: we may not belong to a team at this point 2530 register volatile kmp_uint32 * spin = spinner; 2531 register kmp_uint32 check = checker; 2532 register kmp_uint32 spins; 2533 register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred; 2534 register kmp_uint32 r; 2535 2536 KMP_FSYNC_SPIN_INIT( obj, (void*) spin ); 2537 KMP_INIT_YIELD( spins ); 2538 // main wait spin loop 2539 while(!f(r = TCR_4(*spin), check)) { 2540 KMP_FSYNC_SPIN_PREPARE( obj ); 2541 /* GEH - remove this since it was accidentally introduced when kmp_wait was split. 2542 It causes problems with infinite recursion because of exit lock */ 2543 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 2544 __kmp_abort_thread(); */ 2545 2546 /* if we have waited a bit, or are oversubscribed, yield */ 2547 /* pause is in the following code */ 2548 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc ); 2549 KMP_YIELD_SPIN( spins ); 2550 } 2551 KMP_FSYNC_SPIN_ACQUIRED( obj ); 2552 return r; 2553 } 2554 2555 kmp_uint64 2556 __kmp_wait_yield_8( volatile kmp_uint64 * spinner, 2557 kmp_uint64 checker, 2558 kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 ) 2559 , void * obj // Higher-level synchronization object, or NULL. 2560 ) 2561 { 2562 // note: we may not belong to a team at this point 2563 register volatile kmp_uint64 * spin = spinner; 2564 register kmp_uint64 check = checker; 2565 register kmp_uint32 spins; 2566 register kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred; 2567 register kmp_uint64 r; 2568 2569 KMP_FSYNC_SPIN_INIT( obj, (void*) spin ); 2570 KMP_INIT_YIELD( spins ); 2571 // main wait spin loop 2572 while(!f(r = *spin, check)) 2573 { 2574 KMP_FSYNC_SPIN_PREPARE( obj ); 2575 /* GEH - remove this since it was accidentally introduced when kmp_wait was split. 2576 It causes problems with infinite recursion because of exit lock */ 2577 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 2578 __kmp_abort_thread(); */ 2579 2580 // if we are oversubscribed, 2581 // or have waited a bit (and KMP_LIBARRY=throughput, then yield 2582 // pause is in the following code 2583 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc ); 2584 KMP_YIELD_SPIN( spins ); 2585 } 2586 KMP_FSYNC_SPIN_ACQUIRED( obj ); 2587 return r; 2588 } 2589 2590 } // extern "C" 2591 2592 #ifdef KMP_GOMP_COMPAT 2593 2594 void 2595 __kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2596 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, 2597 kmp_int32 chunk, int push_ws ) 2598 { 2599 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, 2600 push_ws ); 2601 } 2602 2603 void 2604 __kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2605 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, 2606 kmp_int32 chunk, int push_ws ) 2607 { 2608 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, 2609 push_ws ); 2610 } 2611 2612 void 2613 __kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2614 kmp_int64 lb, kmp_int64 ub, kmp_int64 st, 2615 kmp_int64 chunk, int push_ws ) 2616 { 2617 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, 2618 push_ws ); 2619 } 2620 2621 void 2622 __kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2623 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, 2624 kmp_int64 chunk, int push_ws ) 2625 { 2626 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, 2627 push_ws ); 2628 } 2629 2630 void 2631 __kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid ) 2632 { 2633 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc ); 2634 } 2635 2636 void 2637 __kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid ) 2638 { 2639 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc ); 2640 } 2641 2642 void 2643 __kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid ) 2644 { 2645 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc ); 2646 } 2647 2648 void 2649 __kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid ) 2650 { 2651 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc ); 2652 } 2653 2654 #endif /* KMP_GOMP_COMPAT */ 2655 2656 /* ------------------------------------------------------------------------ */ 2657 /* ------------------------------------------------------------------------ */ 2658 2659