/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 */


//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


/*
 * Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside any dispatch loop; however,
 *       it may change values between parallel regions.  __kmp_max_nth
 *       is the largest value __kmp_nth may take; 1 is the smallest.
 *
 */

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_str.h"
#include "kmp_error.h"
#include "kmp_stats.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
#include <float.h>
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

// template for type limits
template< typename T >
struct i_maxmin {
    static const T mx;
    static const T mn;
};
template<>
struct i_maxmin< int > {
    static const int mx = 0x7fffffff;
    static const int mn = 0x80000000;
};
template<>
struct i_maxmin< unsigned int > {
    static const unsigned int mx = 0xffffffff;
    static const unsigned int mn = 0x00000000;
};
template<>
struct i_maxmin< long long > {
    static const long long mx = 0x7fffffffffffffffLL;
    static const long long mn = 0x8000000000000000LL;
};
template<>
struct i_maxmin< unsigned long long > {
    static const unsigned long long mx = 0xffffffffffffffffLL;
    static const unsigned long long mn = 0x0000000000000000LL;
};
//-------------------------------------------------------------------------
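
// The specializations above simply spell out the two's-complement limits of the
// index types the dispatcher works with.  The sketch below is illustrative only
// (a hypothetical, never-called helper, not part of the runtime): it restates a
// couple of those relationships with KMP_BUILD_ASSERT, the same compile-time
// check that __kmp_dispatch_init() uses later in this file.
static inline void
__kmp_i_maxmin_sanity_sketch()
{
    KMP_BUILD_ASSERT( i_maxmin< unsigned int >::mx == (unsigned int)~0 );
    KMP_BUILD_ASSERT( i_maxmin< int >::mx == (int)( i_maxmin< unsigned int >::mx >> 1 ) );
    KMP_BUILD_ASSERT( i_maxmin< unsigned long long >::mx == ~(unsigned long long)0 );
}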

#ifdef KMP_STATIC_STEAL_ENABLED

// replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
template< typename T >
struct dispatch_private_infoXX_template {
    typedef typename traits_t< T >::unsigned_t  UT;
    typedef typename traits_t< T >::signed_t    ST;
    UT count;   // unsigned
    T  ub;
    /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
    T  lb;
    ST st;      // signed
    UT tc;      // unsigned
    T  static_steal_counter; // for static_steal only; maybe better to put after ub

    /* parm[1-4] are used in different ways by different scheduling algorithms */

    // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
    //    a) parm3 is properly aligned and
    //    b) all parm1-4 are in the same cache line.
    // Because parm1-4 are used together, performance seems to be better
    // if they are in the same cache line (not measured though).

    struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
        T  parm1;
        T  parm2;
        T  parm3;
        T  parm4;
    };

    UT ordered_lower;   // unsigned
    UT ordered_upper;   // unsigned
#if KMP_OS_WINDOWS
    T  last_upper;
#endif /* KMP_OS_WINDOWS */
};

#else /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
template< typename T >
struct dispatch_private_infoXX_template {
    typedef typename traits_t< T >::unsigned_t  UT;
    typedef typename traits_t< T >::signed_t    ST;
    T  lb;
    T  ub;
    ST st;      // signed
    UT tc;      // unsigned

    T  parm1;
    T  parm2;
    T  parm3;
    T  parm4;

    UT count;   // unsigned

    UT ordered_lower;   // unsigned
    UT ordered_upper;   // unsigned
#if KMP_OS_WINDOWS
    T  last_upper;
#endif /* KMP_OS_WINDOWS */
};

#endif /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info structure and dispatch_private_info_t type
template< typename T >
struct KMP_ALIGN_CACHE dispatch_private_info_template {
    // duplicate alignment here, otherwise size of structure is not correct in our compiler
    union KMP_ALIGN_CACHE private_info_tmpl {
        dispatch_private_infoXX_template< T > p;
        dispatch_private_info64_t             p64;
    } u;
    enum sched_type schedule;   /* scheduling algorithm */
    kmp_uint32      ordered;    /* ordered clause specified */
    kmp_uint32      ordered_bumped;
    kmp_int32       ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
    dispatch_private_info * next; /* stack of buffers for nest of serial regions */
    kmp_uint32      nomerge;    /* don't merge iters if serialized */
    kmp_uint32      type_size;
    enum cons_type  pushed_ws;
};


// replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
template< typename UT >
struct dispatch_shared_infoXX_template {
    /* chunk index under dynamic, number of idle threads under static-steal;
       iteration index otherwise */
    volatile UT     iteration;
    volatile UT     num_done;
    volatile UT     ordered_iteration;
    UT   ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar
};

// replaces dispatch_shared_info structure and dispatch_shared_info_t type
template< typename UT >
struct dispatch_shared_info_template {
    // we need union here to keep the structure size
    union shared_info_tmpl {
        dispatch_shared_infoXX_template< UT > s;
        dispatch_shared_info64_t              s64;
    } u;
    volatile kmp_uint32     buffer_index;
};

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#undef USE_TEST_LOCKS
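
// Illustrative sketch only (hypothetical, never-instantiated helper): the
// templated types above are reinterpret_cast'ed to and from the plain
// dispatch_private_info / dispatch_shared_info structures declared in kmp.h,
// so their sizes must match exactly.  __kmp_dispatch_init() repeats these
// KMP_BUILD_ASSERT checks for the instantiations it actually uses.
template< typename T >
static inline void
__kmp_dispatch_layout_sketch()
{
    typedef typename traits_t< T >::unsigned_t UT;
    KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
    KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
}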

// test_then_add template (general template should NOT be used)
template< typename T >
static __forceinline T
test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };

template<>
__forceinline kmp_int32
test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
{
    kmp_int32 r;
    r = KMP_TEST_THEN_ADD32( p, d );
    return r;
}

template<>
__forceinline kmp_int64
test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
{
    kmp_int64 r;
    r = KMP_TEST_THEN_ADD64( p, d );
    return r;
}

// test_then_inc_acq template (general template should NOT be used)
template< typename T >
static __forceinline T
test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };

template<>
__forceinline kmp_int32
test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
{
    kmp_int32 r;
    r = KMP_TEST_THEN_INC_ACQ32( p );
    return r;
}

template<>
__forceinline kmp_int64
test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
{
    kmp_int64 r;
    r = KMP_TEST_THEN_INC_ACQ64( p );
    return r;
}

// test_then_inc template (general template should NOT be used)
template< typename T >
static __forceinline T
test_then_inc( volatile T *p ) { KMP_ASSERT(0); };

template<>
__forceinline kmp_int32
test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
{
    kmp_int32 r;
    r = KMP_TEST_THEN_INC32( p );
    return r;
}

template<>
__forceinline kmp_int64
test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
{
    kmp_int64 r;
    r = KMP_TEST_THEN_INC64( p );
    return r;
}

// compare_and_swap template (general template should NOT be used)
template< typename T >
static __forceinline kmp_int32
compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };

template<>
__forceinline kmp_int32
compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
{
    return KMP_COMPARE_AND_STORE_REL32( p, c, s );
}

template<>
__forceinline kmp_int32
compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
{
    return KMP_COMPARE_AND_STORE_REL64( p, c, s );
}
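
// Illustrative sketch only (hypothetical helper, not used by the runtime): the
// wrappers above are meant to be used in compare-and-swap retry loops of the
// following shape; the guided-iterative branch of __kmp_dispatch_next() below
// applies exactly this pattern to sh->u.s.iteration.
template< typename T >
static inline T
__kmp_claim_chunk_sketch( volatile T * counter, T chunk )
{
    T old_v, new_v;
    for ( ;; ) {
        old_v = *counter;       // shared "next unclaimed iteration"
        new_v = old_v + chunk;  // try to claim iterations [old_v, new_v)
        if ( compare_and_swap< T >( counter, old_v, new_v ) )
            break;              // success: this thread owns the chunk
        KMP_CPU_PAUSE();        // failed: another thread won the race, retry
    }
    return old_v;               // first iteration of the claimed chunk
}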

/*
    Spin wait loop that first does pause, then yield.
    Waits until function returns non-zero when called with *spinner and check.
    Does NOT put threads to sleep.
#if USE_ITT_BUILD
    Arguments:
        obj -- is a higher-level synchronization object to report to ittnotify.  It is
            used to report locks consistently.  For example, if the lock is acquired
            immediately, its address is reported to ittnotify via KMP_FSYNC_ACQUIRED().
            However, if the lock cannot be acquired immediately and the lock routine
            calls KMP_WAIT_YIELD(), the latter should report the same address, not the
            address of the low-level spinner.
#endif // USE_ITT_BUILD
*/
template< typename UT >
// ToDo: make inline function (move to header file for icl)
static UT  // unsigned 4- or 8-byte type
__kmp_wait_yield( volatile UT * spinner,
                  UT            checker,
                  kmp_uint32 (* pred)( UT, UT )
                  USE_ITT_BUILD_ARG(void * obj)    // Higher-level synchronization object, or NULL.
                  )
{
    // note: we may not belong to a team at this point
    register volatile UT         * spin  = spinner;
    register          UT           check = checker;
    register          kmp_uint32   spins;
    register          kmp_uint32 (*f) ( UT, UT ) = pred;
    register          UT           r;

    KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
    KMP_INIT_YIELD( spins );
    // main wait spin loop
    while(!f(r = *spin, check))
    {
        KMP_FSYNC_SPIN_PREPARE( obj );
        /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
           It causes problems with infinite recursion because of exit lock */
        /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
            __kmp_abort_thread(); */

        // if we are oversubscribed,
        // or have waited a bit (and KMP_LIBRARY=throughput), then yield
        // pause is in the following code
        KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
        KMP_YIELD_SPIN( spins );
    }
    KMP_FSYNC_SPIN_ACQUIRED( obj );
    return r;
}

template< typename UT >
static kmp_uint32 __kmp_eq( UT value, UT checker) {
    return value == checker;
}

template< typename UT >
static kmp_uint32 __kmp_neq( UT value, UT checker) {
    return value != checker;
}

template< typename UT >
static kmp_uint32 __kmp_lt( UT value, UT checker) {
    return value < checker;
}

template< typename UT >
static kmp_uint32 __kmp_ge( UT value, UT checker) {
    return value >= checker;
}

template< typename UT >
static kmp_uint32 __kmp_le( UT value, UT checker) {
    return value <= checker;
}
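
// Illustrative sketch only (hypothetical wrapper, never called): a caller spins
// until a shared counter reaches a target value by pairing __kmp_wait_yield()
// with one of the predicates above.  __kmp_dispatch_deo() below waits exactly
// this way (with __kmp_ge) on the shared ordered_iteration counter, and
// __kmp_dispatch_init() waits with __kmp_eq on the dispatch buffer index.
static inline kmp_uint32
__kmp_wait_until_ge_sketch( volatile kmp_uint32 * counter, kmp_uint32 target )
{
    return __kmp_wait_yield< kmp_uint32 >( counter, target, __kmp_ge< kmp_uint32 >
                                           USE_ITT_BUILD_ARG( NULL )
                                           );
}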


/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

static void
__kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
{
    kmp_info_t *th;

    KMP_DEBUG_ASSERT( gtid_ref );

    if ( __kmp_env_consistency_check ) {
        th = __kmp_threads[*gtid_ref];
        if ( th -> th.th_root -> r.r_active
          && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
#if KMP_USE_DYNAMIC_LOCK
            __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 );
#else
            __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
#endif
        }
    }
}

template< typename UT >
static void
__kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
{
    typedef typename traits_t< UT >::signed_t    ST;
    dispatch_private_info_template< UT > * pr;

    int gtid = *gtid_ref;
    // int  cid = *cid_ref;
    kmp_info_t *th = __kmp_threads[ gtid ];
    KMP_DEBUG_ASSERT( th -> th.th_dispatch );

    KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
    if ( __kmp_env_consistency_check ) {
        pr = reinterpret_cast< dispatch_private_info_template< UT >* >
            ( th -> th.th_dispatch -> th_dispatch_pr_current );
        if ( pr -> pushed_ws != ct_none ) {
#if KMP_USE_DYNAMIC_LOCK
            __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 );
#else
            __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
#endif
        }
    }

    if ( ! th -> th.th_team -> t.t_serialized ) {
        dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
            ( th -> th.th_dispatch -> th_dispatch_sh_current );
        UT lower;

        if ( ! __kmp_env_consistency_check ) {
            pr = reinterpret_cast< dispatch_private_info_template< UT >* >
                ( th -> th.th_dispatch -> th_dispatch_pr_current );
        }
        lower = pr->u.p.ordered_lower;

        #if ! defined( KMP_GOMP_COMPAT )
        if ( __kmp_env_consistency_check ) {
            if ( pr->ordered_bumped ) {
                struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
                __kmp_error_construct2(
                    kmp_i18n_msg_CnsMultipleNesting,
                    ct_ordered_in_pdo, loc_ref,
                    & p->stack_data[ p->w_top ]
                );
            }
        }
        #endif /* !defined(KMP_GOMP_COMPAT) */

        KMP_MB();
        #ifdef KMP_DEBUG
        {
            const char * buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format(
                "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
                traits_t< UT >::spec, traits_t< UT >::spec );
            KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
            __kmp_str_free( &buff );
        }
        #endif

        __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
                                USE_ITT_BUILD_ARG( NULL )
                                );
        KMP_MB();  /* is this necessary? */
        #ifdef KMP_DEBUG
        {
            const char * buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format(
                "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
                traits_t< UT >::spec, traits_t< UT >::spec );
            KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
            __kmp_str_free( &buff );
        }
        #endif
    }
    KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
}
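
/*
 * In outline (sketch of the call pairing only), each chunk's ordered section is
 * bracketed by the two callbacks that __kmp_dispatch_init() installs below:
 *
 *     (*th_deo_fcn)( &gtid, &cid, loc );   // __kmp_dispatch_deo:  wait until
 *                                          //   ordered_iteration >= ordered_lower
 *     ... user code inside the ordered construct ...
 *     (*th_dxo_fcn)( &gtid, &cid, loc );   // __kmp_dispatch_dxo (defined just below):
 *                                          //   bump ordered_bumped and ordered_iteration,
 *                                          //   releasing the next thread in order
 */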

static void
__kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
{
    kmp_info_t *th;

    if ( __kmp_env_consistency_check ) {
        th = __kmp_threads[*gtid_ref];
        if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
            __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
        }
    }
}

template< typename UT >
static void
__kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
{
    typedef typename traits_t< UT >::signed_t    ST;
    dispatch_private_info_template< UT > * pr;

    int gtid = *gtid_ref;
    // int  cid = *cid_ref;
    kmp_info_t *th = __kmp_threads[ gtid ];
    KMP_DEBUG_ASSERT( th -> th.th_dispatch );

    KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
    if ( __kmp_env_consistency_check ) {
        pr = reinterpret_cast< dispatch_private_info_template< UT >* >
            ( th -> th.th_dispatch -> th_dispatch_pr_current );
        if ( pr -> pushed_ws != ct_none ) {
            __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
        }
    }

    if ( ! th -> th.th_team -> t.t_serialized ) {
        dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
            ( th -> th.th_dispatch -> th_dispatch_sh_current );

        if ( ! __kmp_env_consistency_check ) {
            pr = reinterpret_cast< dispatch_private_info_template< UT >* >
                ( th -> th.th_dispatch -> th_dispatch_pr_current );
        }

        KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
        #if ! defined( KMP_GOMP_COMPAT )
        if ( __kmp_env_consistency_check ) {
            if ( pr->ordered_bumped != 0 ) {
                struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
                /* How to test it? - OM */
                __kmp_error_construct2(
                    kmp_i18n_msg_CnsMultipleNesting,
                    ct_ordered_in_pdo, loc_ref,
                    & p->stack_data[ p->w_top ]
                );
            }
        }
        #endif /* !defined(KMP_GOMP_COMPAT) */

        KMP_MB();       /* Flush all pending memory write invalidates.  */

        pr->ordered_bumped += 1;

        KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
                        gtid, pr->ordered_bumped ) );

        KMP_MB();       /* Flush all pending memory write invalidates.  */

        /* TODO use general release procedure? */
        test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );

        KMP_MB();       /* Flush all pending memory write invalidates.  */
    }
    KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
}

/* Computes and returns x to the power of y, where y must be a non-negative integer */
template< typename UT >
static __forceinline long double
__kmp_pow(long double x, UT y) {
    long double s=1.0L;

    KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
    //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
    while(y) {
        if ( y & 1 )
            s *= x;
        x *= x;
        y >>= 1;
    }
    return s;
}

/* Computes and returns the number of unassigned iterations after idx chunks have been assigned
   (the total number of unassigned iterations in chunks with index greater than or equal to idx).
   __forceinline seems to be broken so that if we __forceinline this function, the behavior is wrong
   (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
*/
template< typename T >
static __inline typename traits_t< T >::unsigned_t
__kmp_dispatch_guided_remaining(
    T                                  tc,
    typename traits_t< T >::floating_t base,
    typename traits_t< T >::unsigned_t idx
) {
    /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
       least for ICL 8.1, long double arithmetic may not really have
       long double precision, even with /Qlong_double.  Currently, we
       workaround that in the caller code, by manipulating the FPCW for
       Windows* OS on IA-32 architecture.  The lack of precision is not
       expected to be a correctness issue, though.
    */
    typedef typename traits_t< T >::unsigned_t  UT;

    long double x = tc * __kmp_pow< UT >(base, idx);
    UT r = (UT) x;
    if ( x == r )
        return r;
    return r + 1;
}

// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
//   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// by default n = 2.  For example, with n = 3 the chunk distribution will be flatter.
// With n = 1 the first chunk is the same as for a static schedule, i.e. trip / nproc.
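
// Worked example (illustrative only; the helper below is hypothetical and is
// not used by the runtime).  With the default n = 2 encoded by the two statics
// that follow, a team of nproc = 4 and chunk = 7 gives
//     p2 = 2 * 4 * (7 + 1) = 64     // switch to dynamic when remaining < 64
//     p3 = 0.5 / 4         = 0.125  // each grab takes remaining / 8 iterations
// so chunk sizes decay geometrically (1000 -> 125, 875 -> 109, 766 -> 95, ...)
// until fewer than 64 iterations remain, after which plain chunks of 7 are used.
static inline kmp_uint32
__kmp_guided_next_chunk_sketch( kmp_uint32 remaining, kmp_uint32 nproc, kmp_uint32 chunk )
{
    kmp_uint32 p2 = 2 * nproc * ( chunk + 1 );   // guided_int_param * nproc * (chunk+1)
    double     p3 = 0.5 / (double) nproc;        // guided_flt_param / nproc
    if ( remaining < p2 )
        return chunk;                            // dynamic-style tail
    return (kmp_uint32)( remaining * p3 );       // guided-style grab
}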
573 static int guided_int_param = 2; 574 static double guided_flt_param = 0.5;// = 1.0 / guided_int_param; 575 576 // UT - unsigned flavor of T, ST - signed flavor of T, 577 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8 578 template< typename T > 579 static void 580 __kmp_dispatch_init( 581 ident_t * loc, 582 int gtid, 583 enum sched_type schedule, 584 T lb, 585 T ub, 586 typename traits_t< T >::signed_t st, 587 typename traits_t< T >::signed_t chunk, 588 int push_ws 589 ) { 590 typedef typename traits_t< T >::unsigned_t UT; 591 typedef typename traits_t< T >::signed_t ST; 592 typedef typename traits_t< T >::floating_t DBL; 593 static const int ___kmp_size_type = sizeof( UT ); 594 595 int active; 596 T tc; 597 kmp_info_t * th; 598 kmp_team_t * team; 599 kmp_uint32 my_buffer_index; 600 dispatch_private_info_template< T > * pr; 601 dispatch_shared_info_template< UT > volatile * sh; 602 603 KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) ); 604 KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) ); 605 606 if ( ! TCR_4( __kmp_init_parallel ) ) 607 __kmp_parallel_initialize(); 608 609 #if INCLUDE_SSC_MARKS 610 SSC_MARK_DISPATCH_INIT(); 611 #endif 612 #ifdef KMP_DEBUG 613 { 614 const char * buff; 615 // create format specifiers before the debug output 616 buff = __kmp_str_format( 617 "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n", 618 traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 619 KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) ); 620 __kmp_str_free( &buff ); 621 } 622 #endif 623 /* setup data */ 624 th = __kmp_threads[ gtid ]; 625 team = th -> th.th_team; 626 active = ! team -> t.t_serialized; 627 th->th.th_ident = loc; 628 629 #if USE_ITT_BUILD 630 kmp_uint64 cur_chunk = chunk; 631 #endif 632 if ( ! active ) { 633 pr = reinterpret_cast< dispatch_private_info_template< T >* > 634 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */ 635 } else { 636 KMP_DEBUG_ASSERT( th->th.th_dispatch == 637 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 638 639 my_buffer_index = th->th.th_dispatch->th_disp_index ++; 640 641 /* What happens when number of threads changes, need to resize buffer? 
*/ 642 pr = reinterpret_cast< dispatch_private_info_template< T > * > 643 ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] ); 644 sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * > 645 ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] ); 646 } 647 648 /* Pick up the nomerge/ordered bits from the scheduling type */ 649 if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) { 650 pr->nomerge = TRUE; 651 schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); 652 } else { 653 pr->nomerge = FALSE; 654 } 655 pr->type_size = ___kmp_size_type; // remember the size of variables 656 if ( kmp_ord_lower & schedule ) { 657 pr->ordered = TRUE; 658 schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); 659 } else { 660 pr->ordered = FALSE; 661 } 662 if ( schedule == kmp_sch_static ) { 663 schedule = __kmp_static; 664 } else { 665 if ( schedule == kmp_sch_runtime ) { 666 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified) 667 schedule = team -> t.t_sched.r_sched_type; 668 // Detail the schedule if needed (global controls are differentiated appropriately) 669 if ( schedule == kmp_sch_guided_chunked ) { 670 schedule = __kmp_guided; 671 } else if ( schedule == kmp_sch_static ) { 672 schedule = __kmp_static; 673 } 674 // Use the chunk size specified by OMP_SCHEDULE (or default if not specified) 675 chunk = team -> t.t_sched.chunk; 676 677 #ifdef KMP_DEBUG 678 { 679 const char * buff; 680 // create format specifiers before the debug output 681 buff = __kmp_str_format( 682 "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n", 683 traits_t< ST >::spec ); 684 KD_TRACE(10, ( buff, gtid, schedule, chunk ) ); 685 __kmp_str_free( &buff ); 686 } 687 #endif 688 } else { 689 if ( schedule == kmp_sch_guided_chunked ) { 690 schedule = __kmp_guided; 691 } 692 if ( chunk <= 0 ) { 693 chunk = KMP_DEFAULT_CHUNK; 694 } 695 } 696 697 if ( schedule == kmp_sch_auto ) { 698 // mapping and differentiation: in the __kmp_do_serial_initialize() 699 schedule = __kmp_auto; 700 #ifdef KMP_DEBUG 701 { 702 const char * buff; 703 // create format specifiers before the debug output 704 buff = __kmp_str_format( 705 "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n", 706 traits_t< ST >::spec ); 707 KD_TRACE(10, ( buff, gtid, schedule, chunk ) ); 708 __kmp_str_free( &buff ); 709 } 710 #endif 711 } 712 713 /* guided analytical not safe for too many threads */ 714 if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) { 715 schedule = kmp_sch_guided_iterative_chunked; 716 KMP_WARNING( DispatchManyThreads ); 717 } 718 pr->u.p.parm1 = chunk; 719 } 720 KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper), 721 "unknown scheduling type" ); 722 723 pr->u.p.count = 0; 724 725 if ( __kmp_env_consistency_check ) { 726 if ( st == 0 ) { 727 __kmp_error_construct( 728 kmp_i18n_msg_CnsLoopIncrZeroProhibited, 729 ( pr->ordered ? 
ct_pdo_ordered : ct_pdo ), loc 730 ); 731 } 732 } 733 734 tc = ( ub - lb + st ); 735 if ( st != 1 ) { 736 if ( st < 0 ) { 737 if ( lb < ub ) { 738 tc = 0; // zero-trip 739 } else { // lb >= ub 740 tc = (ST)tc / st; // convert to signed division 741 } 742 } else { // st > 0 743 if ( ub < lb ) { 744 tc = 0; // zero-trip 745 } else { // lb >= ub 746 tc /= st; 747 } 748 } 749 } else if ( ub < lb ) { // st == 1 750 tc = 0; // zero-trip 751 } 752 753 pr->u.p.lb = lb; 754 pr->u.p.ub = ub; 755 pr->u.p.st = st; 756 pr->u.p.tc = tc; 757 758 #if KMP_OS_WINDOWS 759 pr->u.p.last_upper = ub + st; 760 #endif /* KMP_OS_WINDOWS */ 761 762 /* NOTE: only the active parallel region(s) has active ordered sections */ 763 764 if ( active ) { 765 if ( pr->ordered == 0 ) { 766 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error; 767 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error; 768 } else { 769 pr->ordered_bumped = 0; 770 771 pr->u.p.ordered_lower = 1; 772 pr->u.p.ordered_upper = 0; 773 774 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >; 775 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >; 776 } 777 } 778 779 if ( __kmp_env_consistency_check ) { 780 enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo; 781 if ( push_ws ) { 782 __kmp_push_workshare( gtid, ws, loc ); 783 pr->pushed_ws = ws; 784 } else { 785 __kmp_check_workshare( gtid, ws, loc ); 786 pr->pushed_ws = ct_none; 787 } 788 } 789 790 switch ( schedule ) { 791 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 792 case kmp_sch_static_steal: 793 { 794 T nproc = team->t.t_nproc; 795 T ntc, init; 796 797 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) ); 798 799 ntc = (tc % chunk ? 1 : 0) + tc / chunk; 800 if ( nproc > 1 && ntc >= nproc ) { 801 T id = __kmp_tid_from_gtid(gtid); 802 T small_chunk, extras; 803 804 small_chunk = ntc / nproc; 805 extras = ntc % nproc; 806 807 init = id * small_chunk + ( id < extras ? id : extras ); 808 pr->u.p.count = init; 809 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 ); 810 811 pr->u.p.parm2 = lb; 812 //pr->pfields.parm3 = 0; // it's not used in static_steal 813 pr->u.p.parm4 = id; 814 pr->u.p.st = st; 815 break; 816 } else { 817 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n", 818 gtid ) ); 819 schedule = kmp_sch_static_balanced; 820 /* too few iterations: fall-through to kmp_sch_static_balanced */ 821 } // if 822 /* FALL-THROUGH to static balanced */ 823 } // case 824 #endif 825 case kmp_sch_static_balanced: 826 { 827 T nproc = team->t.t_nproc; 828 T init, limit; 829 830 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n", 831 gtid ) ); 832 833 if ( nproc > 1 ) { 834 T id = __kmp_tid_from_gtid(gtid); 835 836 if ( tc < nproc ) { 837 if ( id < tc ) { 838 init = id; 839 limit = id; 840 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ 841 } else { 842 pr->u.p.count = 1; /* means no more chunks to execute */ 843 pr->u.p.parm1 = FALSE; 844 break; 845 } 846 } else { 847 T small_chunk = tc / nproc; 848 T extras = tc % nproc; 849 init = id * small_chunk + (id < extras ? id : extras); 850 limit = init + small_chunk - (id < extras ? 
0 : 1); 851 pr->u.p.parm1 = (id == nproc - 1); 852 } 853 } else { 854 if ( tc > 0 ) { 855 init = 0; 856 limit = tc - 1; 857 pr->u.p.parm1 = TRUE; 858 } else { 859 // zero trip count 860 pr->u.p.count = 1; /* means no more chunks to execute */ 861 pr->u.p.parm1 = FALSE; 862 break; 863 } 864 } 865 #if USE_ITT_BUILD 866 // Calculate chunk for metadata report 867 if( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 ) { 868 cur_chunk = limit - init + 1; 869 } 870 #endif 871 if ( st == 1 ) { 872 pr->u.p.lb = lb + init; 873 pr->u.p.ub = lb + limit; 874 } else { 875 T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound 876 pr->u.p.lb = lb + init * st; 877 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly 878 if ( st > 0 ) { 879 pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp ); 880 } else { 881 pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp ); 882 } 883 } 884 if ( pr->ordered ) { 885 pr->u.p.ordered_lower = init; 886 pr->u.p.ordered_upper = limit; 887 } 888 break; 889 } // case 890 case kmp_sch_guided_iterative_chunked : 891 { 892 T nproc = team->t.t_nproc; 893 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid)); 894 895 if ( nproc > 1 ) { 896 if ( (2L * chunk + 1 ) * nproc >= tc ) { 897 /* chunk size too large, switch to dynamic */ 898 schedule = kmp_sch_dynamic_chunked; 899 } else { 900 // when remaining iters become less than parm2 - switch to dynamic 901 pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 ); 902 *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4 903 } 904 } else { 905 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid)); 906 schedule = kmp_sch_static_greedy; 907 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 908 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid)); 909 pr->u.p.parm1 = tc; 910 } // if 911 } // case 912 break; 913 case kmp_sch_guided_analytical_chunked: 914 { 915 T nproc = team->t.t_nproc; 916 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid)); 917 918 if ( nproc > 1 ) { 919 if ( (2L * chunk + 1 ) * nproc >= tc ) { 920 /* chunk size too large, switch to dynamic */ 921 schedule = kmp_sch_dynamic_chunked; 922 } else { 923 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 924 DBL x; 925 926 #if KMP_OS_WINDOWS && KMP_ARCH_X86 927 /* Linux* OS already has 64-bit computation by default for 928 long double, and on Windows* OS on Intel(R) 64, 929 /Qlong_double doesn't work. On Windows* OS 930 on IA-32 architecture, we need to set precision to 931 64-bit instead of the default 53-bit. Even though long 932 double doesn't work on Windows* OS on Intel(R) 64, the 933 resulting lack of precision is not expected to impact 934 the correctness of the algorithm, but this has not been 935 mathematically proven. 
936 */ 937 // save original FPCW and set precision to 64-bit, as 938 // Windows* OS on IA-32 architecture defaults to 53-bit 939 unsigned int oldFpcw = _control87(0,0); 940 _control87(_PC_64,_MCW_PC); // 0,0x30000 941 #endif 942 /* value used for comparison in solver for cross-over point */ 943 long double target = ((long double)chunk * 2 + 1) * nproc / tc; 944 945 /* crossover point--chunk indexes equal to or greater than 946 this point switch to dynamic-style scheduling */ 947 UT cross; 948 949 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 950 x = (long double)1.0 - (long double)0.5 / nproc; 951 952 #ifdef KMP_DEBUG 953 { // test natural alignment 954 struct _test_a { 955 char a; 956 union { 957 char b; 958 DBL d; 959 }; 960 } t; 961 ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; 962 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment ); 963 KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 ); 964 } 965 #endif // KMP_DEBUG 966 967 /* save the term in thread private dispatch structure */ 968 *(DBL*)&pr->u.p.parm3 = x; 969 970 /* solve for the crossover point to the nearest integer i for which C_i <= chunk */ 971 { 972 UT left, right, mid; 973 long double p; 974 975 /* estimate initial upper and lower bound */ 976 977 /* doesn't matter what value right is as long as it is positive, but 978 it affects performance of the solver 979 */ 980 right = 229; 981 p = __kmp_pow< UT >(x,right); 982 if ( p > target ) { 983 do{ 984 p *= p; 985 right <<= 1; 986 } while(p>target && right < (1<<27)); 987 left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */ 988 } else { 989 left = 0; 990 } 991 992 /* bisection root-finding method */ 993 while ( left + 1 < right ) { 994 mid = (left + right) / 2; 995 if ( __kmp_pow< UT >(x,mid) > target ) { 996 left = mid; 997 } else { 998 right = mid; 999 } 1000 } // while 1001 cross = right; 1002 } 1003 /* assert sanity of computed crossover point */ 1004 KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target); 1005 1006 /* save the crossover point in thread private dispatch structure */ 1007 pr->u.p.parm2 = cross; 1008 1009 // C75803 1010 #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) ) 1011 #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3) 1012 #else 1013 #define GUIDED_ANALYTICAL_WORKAROUND (x) 1014 #endif 1015 /* dynamic-style scheduling offset */ 1016 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk; 1017 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1018 // restore FPCW 1019 _control87(oldFpcw,_MCW_PC); 1020 #endif 1021 } // if 1022 } else { 1023 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n", 1024 gtid ) ); 1025 schedule = kmp_sch_static_greedy; 1026 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 1027 pr->u.p.parm1 = tc; 1028 } // if 1029 } // case 1030 break; 1031 case kmp_sch_static_greedy: 1032 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid)); 1033 pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ? 
1034 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc : 1035 tc; 1036 break; 1037 case kmp_sch_static_chunked : 1038 case kmp_sch_dynamic_chunked : 1039 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid)); 1040 break; 1041 case kmp_sch_trapezoidal : 1042 { 1043 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ 1044 1045 T parm1, parm2, parm3, parm4; 1046 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) ); 1047 1048 parm1 = chunk; 1049 1050 /* F : size of the first cycle */ 1051 parm2 = ( tc / (2 * team->t.t_nproc) ); 1052 1053 if ( parm2 < 1 ) { 1054 parm2 = 1; 1055 } 1056 1057 /* L : size of the last cycle. Make sure the last cycle 1058 * is not larger than the first cycle. 1059 */ 1060 if ( parm1 < 1 ) { 1061 parm1 = 1; 1062 } else if ( parm1 > parm2 ) { 1063 parm1 = parm2; 1064 } 1065 1066 /* N : number of cycles */ 1067 parm3 = ( parm2 + parm1 ); 1068 parm3 = ( 2 * tc + parm3 - 1) / parm3; 1069 1070 if ( parm3 < 2 ) { 1071 parm3 = 2; 1072 } 1073 1074 /* sigma : decreasing incr of the trapezoid */ 1075 parm4 = ( parm3 - 1 ); 1076 parm4 = ( parm2 - parm1 ) / parm4; 1077 1078 // pointless check, because parm4 >= 0 always 1079 //if ( parm4 < 0 ) { 1080 // parm4 = 0; 1081 //} 1082 1083 pr->u.p.parm1 = parm1; 1084 pr->u.p.parm2 = parm2; 1085 pr->u.p.parm3 = parm3; 1086 pr->u.p.parm4 = parm4; 1087 } // case 1088 break; 1089 1090 default: 1091 { 1092 __kmp_msg( 1093 kmp_ms_fatal, // Severity 1094 KMP_MSG( UnknownSchedTypeDetected ), // Primary message 1095 KMP_HNT( GetNewerLibrary ), // Hint 1096 __kmp_msg_null // Variadic argument list terminator 1097 ); 1098 } 1099 break; 1100 } // switch 1101 pr->schedule = schedule; 1102 if ( active ) { 1103 /* The name of this buffer should be my_buffer_index when it's free to use it */ 1104 1105 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n", 1106 gtid, my_buffer_index, sh->buffer_index) ); 1107 __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 > 1108 USE_ITT_BUILD_ARG( NULL ) 1109 ); 1110 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are 1111 // *always* 32-bit integers. 1112 KMP_MB(); /* is this necessary? */ 1113 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n", 1114 gtid, my_buffer_index, sh->buffer_index) ); 1115 1116 th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr; 1117 th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh; 1118 #if USE_ITT_BUILD 1119 if ( pr->ordered ) { 1120 __kmp_itt_ordered_init( gtid ); 1121 }; // if 1122 #endif /* USE_ITT_BUILD */ 1123 }; // if 1124 1125 #if USE_ITT_BUILD 1126 // Report loop metadata 1127 if( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 ) { 1128 kmp_uint32 tid = __kmp_tid_from_gtid( gtid ); 1129 if (KMP_MASTER_TID(tid)) { 1130 kmp_uint64 schedtype = 0; 1131 1132 switch ( schedule ) { 1133 case kmp_sch_static_chunked: 1134 case kmp_sch_static_balanced:// Chunk is calculated in the switch above 1135 break; 1136 case kmp_sch_static_greedy: 1137 cur_chunk = pr->u.p.parm1; 1138 break; 1139 case kmp_sch_dynamic_chunked: 1140 schedtype = 1; 1141 break; 1142 case kmp_sch_guided_iterative_chunked: 1143 case kmp_sch_guided_analytical_chunked: 1144 schedtype = 2; 1145 break; 1146 default: 1147 // Should we put this case under "static"? 
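                // (schedtype values reported to ITT: 0 = static, 1 = dynamic_chunked,
                //  2 = guided; everything else, including static_steal, falls into
                //  this default and is reported as 3)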
                // case kmp_sch_static_steal:
                schedtype = 3;
                break;
            }
            __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
        }
    }
#endif /* USE_ITT_BUILD */

    #ifdef KMP_DEBUG
    {
        const char * buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
            " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
            " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
            traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
            traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
            traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
            traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
        KD_TRACE(10, ( buff,
            gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
            pr->u.p.st, pr->u.p.tc, pr->u.p.count,
            pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
            pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
        __kmp_str_free( &buff );
    }
    #endif
    #if ( KMP_STATIC_STEAL_ENABLED )
    if ( ___kmp_size_type < 8 ) {
        // We cannot guarantee that, after a loop with some other schedule kind has
        // executed, all the parm3 variables will still contain the same value.
        // Even if they did, there would still be a bad case, such as the value
        // alternating between 0 and 1 rather than increasing over the program's
        // lifetime.  So a dedicated variable is required; 'static_steal_counter'
        // is used for that purpose.
        if( schedule == kmp_sch_static_steal ) {
            // Other threads will inspect this variable when searching for a victim.
            // This is a flag showing that other threads may steal from this thread since then.
            volatile T * p = &pr->u.p.static_steal_counter;
            *p = *p + 1;
        }
    }
    #endif // ( KMP_STATIC_STEAL_ENABLED )
}

/*
 * For ordered loops, either __kmp_dispatch_finish() should be called after
 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
 * every chunk of iterations.  If the ordered section(s) were not executed
 * for this iteration (or every iteration in this chunk), we need to set the
 * ordered iteration counters so that the next thread can proceed.
 */
template< typename UT >
static void
__kmp_dispatch_finish( int gtid, ident_t *loc )
{
    typedef typename traits_t< UT >::signed_t ST;
    kmp_info_t *th = __kmp_threads[ gtid ];

    KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
    if ( !
th -> th.th_team -> t.t_serialized ) { 1210 1211 dispatch_private_info_template< UT > * pr = 1212 reinterpret_cast< dispatch_private_info_template< UT >* > 1213 ( th->th.th_dispatch->th_dispatch_pr_current ); 1214 dispatch_shared_info_template< UT > volatile * sh = 1215 reinterpret_cast< dispatch_shared_info_template< UT >volatile* > 1216 ( th->th.th_dispatch->th_dispatch_sh_current ); 1217 KMP_DEBUG_ASSERT( pr ); 1218 KMP_DEBUG_ASSERT( sh ); 1219 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1220 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1221 1222 if ( pr->ordered_bumped ) { 1223 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1224 gtid ) ); 1225 pr->ordered_bumped = 0; 1226 } else { 1227 UT lower = pr->u.p.ordered_lower; 1228 1229 #ifdef KMP_DEBUG 1230 { 1231 const char * buff; 1232 // create format specifiers before the debug output 1233 buff = __kmp_str_format( 1234 "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n", 1235 traits_t< UT >::spec, traits_t< UT >::spec ); 1236 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 1237 __kmp_str_free( &buff ); 1238 } 1239 #endif 1240 1241 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 1242 USE_ITT_BUILD_ARG(NULL) 1243 ); 1244 KMP_MB(); /* is this necessary? */ 1245 #ifdef KMP_DEBUG 1246 { 1247 const char * buff; 1248 // create format specifiers before the debug output 1249 buff = __kmp_str_format( 1250 "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n", 1251 traits_t< UT >::spec, traits_t< UT >::spec ); 1252 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 1253 __kmp_str_free( &buff ); 1254 } 1255 #endif 1256 1257 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration ); 1258 } // if 1259 } // if 1260 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) ); 1261 } 1262 1263 #ifdef KMP_GOMP_COMPAT 1264 1265 template< typename UT > 1266 static void 1267 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc ) 1268 { 1269 typedef typename traits_t< UT >::signed_t ST; 1270 kmp_info_t *th = __kmp_threads[ gtid ]; 1271 1272 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) ); 1273 if ( ! 
th -> th.th_team -> t.t_serialized ) { 1274 // int cid; 1275 dispatch_private_info_template< UT > * pr = 1276 reinterpret_cast< dispatch_private_info_template< UT >* > 1277 ( th->th.th_dispatch->th_dispatch_pr_current ); 1278 dispatch_shared_info_template< UT > volatile * sh = 1279 reinterpret_cast< dispatch_shared_info_template< UT >volatile* > 1280 ( th->th.th_dispatch->th_dispatch_sh_current ); 1281 KMP_DEBUG_ASSERT( pr ); 1282 KMP_DEBUG_ASSERT( sh ); 1283 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1284 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1285 1286 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) { 1287 UT lower = pr->u.p.ordered_lower; 1288 UT upper = pr->u.p.ordered_upper; 1289 UT inc = upper - lower + 1; 1290 1291 if ( pr->ordered_bumped == inc ) { 1292 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1293 gtid ) ); 1294 pr->ordered_bumped = 0; 1295 } else { 1296 inc -= pr->ordered_bumped; 1297 1298 #ifdef KMP_DEBUG 1299 { 1300 const char * buff; 1301 // create format specifiers before the debug output 1302 buff = __kmp_str_format( 1303 "__kmp_dispatch_finish_chunk: T#%%d before wait: " \ 1304 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n", 1305 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec ); 1306 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) ); 1307 __kmp_str_free( &buff ); 1308 } 1309 #endif 1310 1311 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 1312 USE_ITT_BUILD_ARG(NULL) 1313 ); 1314 1315 KMP_MB(); /* is this necessary? */ 1316 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n", 1317 gtid ) ); 1318 pr->ordered_bumped = 0; 1319 //!!!!! TODO check if the inc should be unsigned, or signed??? 1320 #ifdef KMP_DEBUG 1321 { 1322 const char * buff; 1323 // create format specifiers before the debug output 1324 buff = __kmp_str_format( 1325 "__kmp_dispatch_finish_chunk: T#%%d after wait: " \ 1326 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n", 1327 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec ); 1328 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) ); 1329 __kmp_str_free( &buff ); 1330 } 1331 #endif 1332 1333 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc); 1334 } 1335 // } 1336 } 1337 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) ); 1338 } 1339 1340 #endif /* KMP_GOMP_COMPAT */ 1341 1342 template< typename T > 1343 static int 1344 __kmp_dispatch_next( 1345 ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st 1346 ) { 1347 1348 typedef typename traits_t< T >::unsigned_t UT; 1349 typedef typename traits_t< T >::signed_t ST; 1350 typedef typename traits_t< T >::floating_t DBL; 1351 static const int ___kmp_size_type = sizeof( UT ); 1352 1353 int status; 1354 dispatch_private_info_template< T > * pr; 1355 kmp_info_t * th = __kmp_threads[ gtid ]; 1356 kmp_team_t * team = th -> th.th_team; 1357 1358 KMP_DEBUG_ASSERT( p_last && p_lb && p_ub && p_st ); // AC: these cannot be NULL 1359 #ifdef KMP_DEBUG 1360 { 1361 const char * buff; 1362 // create format specifiers before the debug output 1363 buff = __kmp_str_format( 1364 "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n", 1365 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 1366 KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? 
                                    *p_st : 0, p_last ) );
        __kmp_str_free( &buff );
    }
    #endif

    if ( team -> t.t_serialized ) {
        /* NOTE: serialize this dispatch because we are not at the active level */
        pr = reinterpret_cast< dispatch_private_info_template< T >* >
            ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
        KMP_DEBUG_ASSERT( pr );

        if ( (status = (pr->u.p.tc != 0)) == 0 ) {
            *p_lb = 0;
            *p_ub = 0;
//            if ( p_last != NULL )
//                *p_last = 0;
            if ( p_st != NULL )
                *p_st = 0;
            if ( __kmp_env_consistency_check ) {
                if ( pr->pushed_ws != ct_none ) {
                    pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
                }
            }
        } else if ( pr->nomerge ) {
            kmp_int32 last;
            T         start;
            UT        limit, trip, init;
            ST        incr;
            T         chunk = pr->u.p.parm1;

            KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );

            init = chunk * pr->u.p.count++;
            trip = pr->u.p.tc - 1;

            if ( (status = (init <= trip)) == 0 ) {
                *p_lb = 0;
                *p_ub = 0;
//                if ( p_last != NULL )
//                    *p_last = 0;
                if ( p_st != NULL )
                    *p_st = 0;
                if ( __kmp_env_consistency_check ) {
                    if ( pr->pushed_ws != ct_none ) {
                        pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
                    }
                }
            } else {
                start = pr->u.p.lb;
                limit = chunk + init - 1;
                incr  = pr->u.p.st;

                if ( (last = (limit >= trip)) != 0 ) {
                    limit = trip;
                    #if KMP_OS_WINDOWS
                    pr->u.p.last_upper = pr->u.p.ub;
                    #endif /* KMP_OS_WINDOWS */
                }
                if ( p_last != NULL )
                    *p_last = last;
                if ( p_st != NULL )
                    *p_st = incr;
                if ( incr == 1 ) {
                    *p_lb = start + init;
                    *p_ub = start + limit;
                } else {
                    *p_lb = start + init * incr;
                    *p_ub = start + limit * incr;
                }

                if ( pr->ordered ) {
                    pr->u.p.ordered_lower = init;
                    pr->u.p.ordered_upper = limit;
                    #ifdef KMP_DEBUG
                    {
                        const char * buff;
                        // create format specifiers before the debug output
                        buff = __kmp_str_format(
                            "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                            traits_t< UT >::spec, traits_t< UT >::spec );
                        KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                        __kmp_str_free( &buff );
                    }
                    #endif
                } // if
            } // if
        } else {
            pr->u.p.tc = 0;
            *p_lb = pr->u.p.lb;
            *p_ub = pr->u.p.ub;
            #if KMP_OS_WINDOWS
            pr->u.p.last_upper = *p_ub;
            #endif /* KMP_OS_WINDOWS */
            if ( p_last != NULL )
                *p_last = TRUE;
            if ( p_st != NULL )
                *p_st = pr->u.p.st;
        } // if
        #ifdef KMP_DEBUG
        {
            const char * buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format(
                "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
                "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
                traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
            KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) );
            __kmp_str_free( &buff );
        }
        #endif
        #if INCLUDE_SSC_MARKS
        SSC_MARK_DISPATCH_NEXT();
        #endif
        return status;
    } else {
        kmp_int32 last = 0;
        dispatch_shared_info_template< UT > *sh;
        T         start;
        ST        incr;
        UT        limit, trip, init;

        KMP_DEBUG_ASSERT( th->th.th_dispatch ==
                &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );

        pr = reinterpret_cast<
dispatch_private_info_template< T >* > 1491 ( th->th.th_dispatch->th_dispatch_pr_current ); 1492 KMP_DEBUG_ASSERT( pr ); 1493 sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 1494 ( th->th.th_dispatch->th_dispatch_sh_current ); 1495 KMP_DEBUG_ASSERT( sh ); 1496 1497 if ( pr->u.p.tc == 0 ) { 1498 // zero trip count 1499 status = 0; 1500 } else { 1501 switch (pr->schedule) { 1502 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 1503 case kmp_sch_static_steal: 1504 { 1505 T chunk = pr->u.p.parm1; 1506 1507 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) ); 1508 1509 trip = pr->u.p.tc - 1; 1510 1511 if ( ___kmp_size_type > 4 ) { 1512 // Other threads do not look into the data of this thread, 1513 // so it's not necessary to make volatile casting. 1514 init = ( pr->u.p.count )++; 1515 status = ( init < (UT)pr->u.p.ub ); 1516 } else { 1517 typedef union { 1518 struct { 1519 UT count; 1520 T ub; 1521 } p; 1522 kmp_int64 b; 1523 } union_i4; 1524 // All operations on 'count' or 'ub' must be combined atomically together. 1525 // stealing implemented only for 4-byte indexes 1526 { 1527 union_i4 vold, vnew; 1528 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count); 1529 vnew = vold; 1530 vnew.p.count++; 1531 while( ! KMP_COMPARE_AND_STORE_ACQ64( 1532 ( volatile kmp_int64* )&pr->u.p.count, 1533 *VOLATILE_CAST(kmp_int64 *)&vold.b, 1534 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) { 1535 KMP_CPU_PAUSE(); 1536 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count); 1537 vnew = vold; 1538 vnew.p.count++; 1539 } 1540 vnew = vold; 1541 init = vnew.p.count; 1542 status = ( init < (UT)vnew.p.ub ) ; 1543 } 1544 1545 if( !status ) { 1546 kmp_info_t **other_threads = team->t.t_threads; 1547 int while_limit = 10; 1548 int while_index = 0; 1549 1550 // TODO: algorithm of searching for a victim 1551 // should be cleaned up and measured 1552 while ( ( !status ) && ( while_limit != ++while_index ) ) { 1553 union_i4 vold, vnew; 1554 kmp_int32 remaining; // kmp_int32 because KMP_I4 only 1555 T victimIdx = pr->u.p.parm4; 1556 T oldVictimIdx = victimIdx; 1557 dispatch_private_info_template< T > * victim; 1558 1559 do { 1560 if( !victimIdx ) { 1561 victimIdx = team->t.t_nproc - 1; 1562 } else { 1563 --victimIdx; 1564 } 1565 victim = reinterpret_cast< dispatch_private_info_template< T >* > 1566 ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current ); 1567 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx ); 1568 // TODO: think about a proper place of this test 1569 if ( ( !victim ) || 1570 ( (*( volatile T * )&victim->u.p.static_steal_counter) != 1571 (*( volatile T * )&pr->u.p.static_steal_counter) ) ) { 1572 // TODO: delay would be nice 1573 continue; 1574 // the victim is not ready yet to participate in stealing 1575 // because the victim is still in kmp_init_dispatch 1576 } 1577 if ( oldVictimIdx == victimIdx ) { 1578 break; 1579 } 1580 pr->u.p.parm4 = victimIdx; 1581 1582 while( 1 ) { 1583 vold.b = *( volatile kmp_int64 * )( &victim->u.p.count ); 1584 vnew = vold; 1585 1586 KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip ); 1587 if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) { 1588 break; 1589 } 1590 vnew.p.ub -= (remaining >> 2); 1591 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1592 #pragma warning( push ) 1593 // disable warning on pointless comparison of unsigned with 0 1594 #pragma warning( disable: 186 ) 1595 KMP_DEBUG_ASSERT(vnew.p.ub >= 0); 1596 #pragma warning( pop ) 1597 // TODO: Should 
this be acquire or release? 1598 if ( KMP_COMPARE_AND_STORE_ACQ64( 1599 ( volatile kmp_int64 * )&victim->u.p.count, 1600 *VOLATILE_CAST(kmp_int64 *)&vold.b, 1601 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) { 1602 status = 1; 1603 while_index = 0; 1604 // now update own count and ub 1605 #if KMP_ARCH_X86 1606 // stealing executed on non-KMP_ARCH_X86 only 1607 // Atomic 64-bit write on ia32 is 1608 // unavailable, so we do this in steps. 1609 // This code is not tested. 1610 init = vold.p.count; 1611 pr->u.p.ub = 0; 1612 pr->u.p.count = init + 1; 1613 pr->u.p.ub = vnew.p.count; 1614 #else 1615 init = vnew.p.ub; 1616 vold.p.count = init + 1; 1617 // TODO: is it safe and enough? 1618 *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b; 1619 #endif // KMP_ARCH_X86 1620 break; 1621 } // if 1622 KMP_CPU_PAUSE(); 1623 } // while (1) 1624 } // while 1625 } // if 1626 } // if 1627 if ( !status ) { 1628 *p_lb = 0; 1629 *p_ub = 0; 1630 if ( p_st != NULL ) *p_st = 0; 1631 } else { 1632 start = pr->u.p.parm2; 1633 init *= chunk; 1634 limit = chunk + init - 1; 1635 incr = pr->u.p.st; 1636 1637 KMP_DEBUG_ASSERT(init <= trip); 1638 if ( (last = (limit >= trip)) != 0 ) 1639 limit = trip; 1640 if ( p_st != NULL ) *p_st = incr; 1641 1642 if ( incr == 1 ) { 1643 *p_lb = start + init; 1644 *p_ub = start + limit; 1645 } else { 1646 *p_lb = start + init * incr; 1647 *p_ub = start + limit * incr; 1648 } 1649 1650 if ( pr->ordered ) { 1651 pr->u.p.ordered_lower = init; 1652 pr->u.p.ordered_upper = limit; 1653 #ifdef KMP_DEBUG 1654 { 1655 const char * buff; 1656 // create format specifiers before the debug output 1657 buff = __kmp_str_format( 1658 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1659 traits_t< UT >::spec, traits_t< UT >::spec ); 1660 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1661 __kmp_str_free( &buff ); 1662 } 1663 #endif 1664 } // if 1665 } // if 1666 break; 1667 } // case 1668 #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 1669 case kmp_sch_static_balanced: 1670 { 1671 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) ); 1672 if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */ 1673 pr->u.p.count = 1; 1674 *p_lb = pr->u.p.lb; 1675 *p_ub = pr->u.p.ub; 1676 last = pr->u.p.parm1; 1677 if ( p_st != NULL ) 1678 *p_st = pr->u.p.st; 1679 } else { /* no iterations to do */ 1680 pr->u.p.lb = pr->u.p.ub + pr->u.p.st; 1681 } 1682 if ( pr->ordered ) { 1683 #ifdef KMP_DEBUG 1684 { 1685 const char * buff; 1686 // create format specifiers before the debug output 1687 buff = __kmp_str_format( 1688 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1689 traits_t< UT >::spec, traits_t< UT >::spec ); 1690 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1691 __kmp_str_free( &buff ); 1692 } 1693 #endif 1694 } // if 1695 } // case 1696 break; 1697 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */ 1698 case kmp_sch_static_chunked: 1699 { 1700 T parm1; 1701 1702 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n", 1703 gtid ) ); 1704 parm1 = pr->u.p.parm1; 1705 1706 trip = pr->u.p.tc - 1; 1707 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid)); 1708 1709 if ( (status = (init <= trip)) != 0 ) { 1710 start = pr->u.p.lb; 1711 incr = pr->u.p.st; 1712 limit = parm1 + init - 1; 1713 1714 if ( (last = (limit >= trip)) != 0 ) 1715 limit = trip; 1716 1717 if ( p_st != NULL ) 
                    *p_st = incr;

                pr->u.p.count += team->t.t_nproc;

                if ( incr == 1 ) {
                    *p_lb = start + init;
                    *p_ub = start + limit;
                }
                else {
                    *p_lb = start + init * incr;
                    *p_ub = start + limit * incr;
                }

                if ( pr->ordered ) {
                    pr->u.p.ordered_lower = init;
                    pr->u.p.ordered_upper = limit;
                    #ifdef KMP_DEBUG
                    {
                        const char * buff;
                        // create format specifiers before the debug output
                        buff = __kmp_str_format(
                            "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                            traits_t< UT >::spec, traits_t< UT >::spec );
                        KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                        __kmp_str_free( &buff );
                    }
                    #endif
                } // if
            } // if
        } // case
        break;

        case kmp_sch_dynamic_chunked:
        {
            T chunk = pr->u.p.parm1;

            KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
                           gtid ) );

            init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
            trip = pr->u.p.tc - 1;

            if ( (status = (init <= trip)) == 0 ) {
                *p_lb = 0;
                *p_ub = 0;
                if ( p_st != NULL ) *p_st = 0;
            } else {
                start = pr->u.p.lb;
                limit = chunk + init - 1;
                incr  = pr->u.p.st;

                if ( (last = (limit >= trip)) != 0 )
                    limit = trip;

                if ( p_st != NULL ) *p_st = incr;

                if ( incr == 1 ) {
                    *p_lb = start + init;
                    *p_ub = start + limit;
                } else {
                    *p_lb = start + init * incr;
                    *p_ub = start + limit * incr;
                }

                if ( pr->ordered ) {
                    pr->u.p.ordered_lower = init;
                    pr->u.p.ordered_upper = limit;
                    #ifdef KMP_DEBUG
                    {
                        const char * buff;
                        // create format specifiers before the debug output
                        buff = __kmp_str_format(
                            "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                            traits_t< UT >::spec, traits_t< UT >::spec );
                        KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                        __kmp_str_free( &buff );
                    }
                    #endif
                } // if
            } // if
        } // case
        break;

        case kmp_sch_guided_iterative_chunked:
        {
            T  chunkspec = pr->u.p.parm1;
            KD_TRACE(100,
                ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
            trip = pr->u.p.tc;
            // Start atomic part of calculations
            while(1) {
                ST  remaining;             // signed, because can be < 0
                init = sh->u.s.iteration;  // shared value
                remaining = trip - init;
                if ( remaining <= 0 ) {    // AC: need to compare with 0 first
                    // nothing to do, don't try atomic op
                    status = 0;
                    break;
                }
                if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
                    // use dynamic-style schedule
                    // atomically increment iterations, get old value
                    init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
                    remaining = trip - init;
                    if (remaining <= 0) {
                        status = 0;    // all iterations were taken by other threads
                    } else {
                        // got some iterations to work on
                        status = 1;
                        if ( (T)remaining > chunkspec ) {
                            limit = init + chunkspec - 1;
                        } else {
                            last = 1;   // the last chunk
                            limit = init + remaining - 1;
                        } // if
                    } // if
                    break;
                } // if
                limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
                if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
                    // CAS was successful, chunk obtained
                    status = 1;
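                    // CAS succeeded: this thread owns iterations [init, limit);
                    // sh->u.s.iteration already points at 'limit' (the next
                    // thread's starting point), so convert 'limit' to the
                    // inclusive upper bound of this chunk just below.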
1839 --limit; 1840 break; 1841 } // if 1842 } // while 1843 if ( status != 0 ) { 1844 start = pr->u.p.lb; 1845 incr = pr->u.p.st; 1846 if ( p_st != NULL ) 1847 *p_st = incr; 1848 *p_lb = start + init * incr; 1849 *p_ub = start + limit * incr; 1850 if ( pr->ordered ) { 1851 pr->u.p.ordered_lower = init; 1852 pr->u.p.ordered_upper = limit; 1853 #ifdef KMP_DEBUG 1854 { 1855 const char * buff; 1856 // create format specifiers before the debug output 1857 buff = __kmp_str_format( 1858 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1859 traits_t< UT >::spec, traits_t< UT >::spec ); 1860 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1861 __kmp_str_free( &buff ); 1862 } 1863 #endif 1864 } // if 1865 } else { 1866 *p_lb = 0; 1867 *p_ub = 0; 1868 if ( p_st != NULL ) 1869 *p_st = 0; 1870 } // if 1871 } // case 1872 break; 1873 1874 case kmp_sch_guided_analytical_chunked: 1875 { 1876 T chunkspec = pr->u.p.parm1; 1877 UT chunkIdx; 1878 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1879 /* for storing original FPCW value for Windows* OS on 1880 IA-32 architecture 8-byte version */ 1881 unsigned int oldFpcw; 1882 unsigned int fpcwSet = 0; 1883 #endif 1884 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n", 1885 gtid ) ); 1886 1887 trip = pr->u.p.tc; 1888 1889 KMP_DEBUG_ASSERT(team->t.t_nproc > 1); 1890 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip); 1891 1892 while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */ 1893 chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration ); 1894 if ( chunkIdx >= (UT)pr->u.p.parm2 ) { 1895 --trip; 1896 /* use dynamic-style scheduling */ 1897 init = chunkIdx * chunkspec + pr->u.p.count; 1898 /* need to verify init > 0 in case of overflow in the above calculation */ 1899 if ( (status = (init > 0 && init <= trip)) != 0 ) { 1900 limit = init + chunkspec -1; 1901 1902 if ( (last = (limit >= trip)) != 0 ) 1903 limit = trip; 1904 } 1905 break; 1906 } else { 1907 /* use exponential-style scheduling */ 1908 /* The following check is to workaround the lack of long double precision on Windows* OS. 1909 This check works around the possible effect that init != 0 for chunkIdx == 0. 
1910 */ 1911 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1912 /* If we haven't already done so, save original 1913 FPCW and set precision to 64-bit, as Windows* OS 1914 on IA-32 architecture defaults to 53-bit */ 1915 if ( !fpcwSet ) { 1916 oldFpcw = _control87(0,0); 1917 _control87(_PC_64,_MCW_PC); 1918 fpcwSet = 0x30000; 1919 } 1920 #endif 1921 if ( chunkIdx ) { 1922 init = __kmp_dispatch_guided_remaining< T >( 1923 trip, *( DBL * )&pr->u.p.parm3, chunkIdx ); 1924 KMP_DEBUG_ASSERT(init); 1925 init = trip - init; 1926 } else 1927 init = 0; 1928 limit = trip - __kmp_dispatch_guided_remaining< T >( 1929 trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 ); 1930 KMP_ASSERT(init <= limit); 1931 if ( init < limit ) { 1932 KMP_DEBUG_ASSERT(limit <= trip); 1933 --limit; 1934 status = 1; 1935 break; 1936 } // if 1937 } // if 1938 } // while (1) 1939 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1940 /* restore FPCW if necessary 1941 AC: check fpcwSet flag first because oldFpcw can be uninitialized here 1942 */ 1943 if ( fpcwSet && ( oldFpcw & fpcwSet ) ) 1944 _control87(oldFpcw,_MCW_PC); 1945 #endif 1946 if ( status != 0 ) { 1947 start = pr->u.p.lb; 1948 incr = pr->u.p.st; 1949 if ( p_st != NULL ) 1950 *p_st = incr; 1951 *p_lb = start + init * incr; 1952 *p_ub = start + limit * incr; 1953 if ( pr->ordered ) { 1954 pr->u.p.ordered_lower = init; 1955 pr->u.p.ordered_upper = limit; 1956 #ifdef KMP_DEBUG 1957 { 1958 const char * buff; 1959 // create format specifiers before the debug output 1960 buff = __kmp_str_format( 1961 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1962 traits_t< UT >::spec, traits_t< UT >::spec ); 1963 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1964 __kmp_str_free( &buff ); 1965 } 1966 #endif 1967 } 1968 } else { 1969 *p_lb = 0; 1970 *p_ub = 0; 1971 if ( p_st != NULL ) 1972 *p_st = 0; 1973 } 1974 } // case 1975 break; 1976 1977 case kmp_sch_trapezoidal: 1978 { 1979 UT index; 1980 T parm2 = pr->u.p.parm2; 1981 T parm3 = pr->u.p.parm3; 1982 T parm4 = pr->u.p.parm4; 1983 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n", 1984 gtid ) ); 1985 1986 index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration ); 1987 1988 init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2; 1989 trip = pr->u.p.tc - 1; 1990 1991 if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) { 1992 *p_lb = 0; 1993 *p_ub = 0; 1994 if ( p_st != NULL ) *p_st = 0; 1995 } else { 1996 start = pr->u.p.lb; 1997 limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1; 1998 incr = pr->u.p.st; 1999 2000 if ( (last = (limit >= trip)) != 0 ) 2001 limit = trip; 2002 2003 if ( p_st != NULL ) *p_st = incr; 2004 2005 if ( incr == 1 ) { 2006 *p_lb = start + init; 2007 *p_ub = start + limit; 2008 } else { 2009 *p_lb = start + init * incr; 2010 *p_ub = start + limit * incr; 2011 } 2012 2013 if ( pr->ordered ) { 2014 pr->u.p.ordered_lower = init; 2015 pr->u.p.ordered_upper = limit; 2016 #ifdef KMP_DEBUG 2017 { 2018 const char * buff; 2019 // create format specifiers before the debug output 2020 buff = __kmp_str_format( 2021 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 2022 traits_t< UT >::spec, traits_t< UT >::spec ); 2023 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 2024 __kmp_str_free( &buff ); 2025 } 2026 #endif 2027 } // if 2028 } // if 2029 } // case 2030 break; 2031 default: 2032 { 2033 status = 0; // to avoid complaints on uninitialized variable use 2034 __kmp_msg( 2035 kmp_ms_fatal, // Severity 2036 
KMP_MSG( UnknownSchedTypeDetected ), // Primary message 2037 KMP_HNT( GetNewerLibrary ), // Hint 2038 __kmp_msg_null // Variadic argument list terminator 2039 ); 2040 } 2041 break; 2042 } // switch 2043 } // if tc == 0; 2044 2045 if ( status == 0 ) { 2046 UT num_done; 2047 2048 num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done ); 2049 #ifdef KMP_DEBUG 2050 { 2051 const char * buff; 2052 // create format specifiers before the debug output 2053 buff = __kmp_str_format( 2054 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", 2055 traits_t< UT >::spec ); 2056 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) ); 2057 __kmp_str_free( &buff ); 2058 } 2059 #endif 2060 2061 if ( (ST)num_done == team->t.t_nproc-1 ) { 2062 /* NOTE: release this buffer to be reused */ 2063 2064 KMP_MB(); /* Flush all pending memory write invalidates. */ 2065 2066 sh->u.s.num_done = 0; 2067 sh->u.s.iteration = 0; 2068 2069 /* TODO replace with general release procedure? */ 2070 if ( pr->ordered ) { 2071 sh->u.s.ordered_iteration = 0; 2072 } 2073 2074 KMP_MB(); /* Flush all pending memory write invalidates. */ 2075 2076 sh -> buffer_index += KMP_MAX_DISP_BUF; 2077 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", 2078 gtid, sh->buffer_index) ); 2079 2080 KMP_MB(); /* Flush all pending memory write invalidates. */ 2081 2082 } // if 2083 if ( __kmp_env_consistency_check ) { 2084 if ( pr->pushed_ws != ct_none ) { 2085 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 2086 } 2087 } 2088 2089 th -> th.th_dispatch -> th_deo_fcn = NULL; 2090 th -> th.th_dispatch -> th_dxo_fcn = NULL; 2091 th -> th.th_dispatch -> th_dispatch_sh_current = NULL; 2092 th -> th.th_dispatch -> th_dispatch_pr_current = NULL; 2093 } // if (status == 0) 2094 #if KMP_OS_WINDOWS 2095 else if ( last ) { 2096 pr->u.p.last_upper = pr->u.p.ub; 2097 } 2098 #endif /* KMP_OS_WINDOWS */ 2099 if ( p_last != NULL && status != 0 ) 2100 *p_last = last; 2101 } // if 2102 2103 #ifdef KMP_DEBUG 2104 { 2105 const char * buff; 2106 // create format specifiers before the debug output 2107 buff = __kmp_str_format( 2108 "__kmp_dispatch_next: T#%%d normal case: " \ 2109 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n", 2110 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 2111 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? 
*p_st : 0, p_last, status ) ); 2112 __kmp_str_free( &buff ); 2113 } 2114 #endif 2115 #if INCLUDE_SSC_MARKS 2116 SSC_MARK_DISPATCH_NEXT(); 2117 #endif 2118 return status; 2119 } 2120 2121 template< typename T > 2122 static void 2123 __kmp_dist_get_bounds( 2124 ident_t *loc, 2125 kmp_int32 gtid, 2126 kmp_int32 *plastiter, 2127 T *plower, 2128 T *pupper, 2129 typename traits_t< T >::signed_t incr 2130 ) { 2131 KMP_COUNT_BLOCK(OMP_DISTR_FOR_dynamic); 2132 typedef typename traits_t< T >::unsigned_t UT; 2133 typedef typename traits_t< T >::signed_t ST; 2134 register kmp_uint32 team_id; 2135 register kmp_uint32 nteams; 2136 register UT trip_count; 2137 register kmp_team_t *team; 2138 kmp_info_t * th; 2139 2140 KMP_DEBUG_ASSERT( plastiter && plower && pupper ); 2141 KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid)); 2142 #ifdef KMP_DEBUG 2143 { 2144 const char * buff; 2145 // create format specifiers before the debug output 2146 buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\ 2147 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n", 2148 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec, 2149 traits_t< T >::spec ); 2150 KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) ); 2151 __kmp_str_free( &buff ); 2152 } 2153 #endif 2154 2155 if( __kmp_env_consistency_check ) { 2156 if( incr == 0 ) { 2157 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc ); 2158 } 2159 if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) { 2160 // The loop is illegal. 2161 // Some zero-trip loops maintained by compiler, e.g.: 2162 // for(i=10;i<0;++i) // lower >= upper - run-time check 2163 // for(i=0;i>10;--i) // lower <= upper - run-time check 2164 // for(i=0;i>10;++i) // incr > 0 - compile-time check 2165 // for(i=10;i<0;--i) // incr < 0 - compile-time check 2166 // Compiler does not check the following illegal loops: 2167 // for(i=0;i<10;i+=incr) // where incr<0 2168 // for(i=10;i>0;i-=incr) // where incr<0 2169 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc ); 2170 } 2171 } 2172 th = __kmp_threads[gtid]; 2173 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct 2174 team = th->th.th_team; 2175 #if OMP_40_ENABLED 2176 nteams = th->th.th_teams_size.nteams; 2177 #endif 2178 team_id = team->t.t_master_tid; 2179 KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc); 2180 2181 // compute global trip count 2182 if( incr == 1 ) { 2183 trip_count = *pupper - *plower + 1; 2184 } else if(incr == -1) { 2185 trip_count = *plower - *pupper + 1; 2186 } else { 2187 trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case 2188 } 2189 if( trip_count <= nteams ) { 2190 KMP_DEBUG_ASSERT( 2191 __kmp_static == kmp_sch_static_greedy || \ 2192 __kmp_static == kmp_sch_static_balanced 2193 ); // Unknown static scheduling type. 2194 // only some teams get single iteration, others get nothing 2195 if( team_id < trip_count ) { 2196 *pupper = *plower = *plower + team_id * incr; 2197 } else { 2198 *plower = *pupper + incr; // zero-trip loop 2199 } 2200 if( plastiter != NULL ) 2201 *plastiter = ( team_id == trip_count - 1 ); 2202 } else { 2203 if( __kmp_static == kmp_sch_static_balanced ) { 2204 register UT chunk = trip_count / nteams; 2205 register UT extras = trip_count % nteams; 2206 *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) ); 2207 *pupper = *plower + chunk * incr - ( team_id < extras ? 
0 : incr ); 2208 if( plastiter != NULL ) 2209 *plastiter = ( team_id == nteams - 1 ); 2210 } else { 2211 register T chunk_inc_count = 2212 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr; 2213 register T upper = *pupper; 2214 KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy ); 2215 // Unknown static scheduling type. 2216 *plower += team_id * chunk_inc_count; 2217 *pupper = *plower + chunk_inc_count - incr; 2218 // Check/correct bounds if needed 2219 if( incr > 0 ) { 2220 if( *pupper < *plower ) 2221 *pupper = i_maxmin< T >::mx; 2222 if( plastiter != NULL ) 2223 *plastiter = *plower <= upper && *pupper > upper - incr; 2224 if( *pupper > upper ) 2225 *pupper = upper; // tracker C73258 2226 } else { 2227 if( *pupper > *plower ) 2228 *pupper = i_maxmin< T >::mn; 2229 if( plastiter != NULL ) 2230 *plastiter = *plower >= upper && *pupper < upper - incr; 2231 if( *pupper < upper ) 2232 *pupper = upper; // tracker C73258 2233 } 2234 } 2235 } 2236 } 2237 2238 //----------------------------------------------------------------------------------------- 2239 // Dispatch routines 2240 // Transfer call to template< type T > 2241 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule, 2242 // T lb, T ub, ST st, ST chunk ) 2243 extern "C" { 2244 2245 /*! 2246 @ingroup WORK_SHARING 2247 @{ 2248 @param loc Source location 2249 @param gtid Global thread id 2250 @param schedule Schedule type 2251 @param lb Lower bound 2252 @param ub Upper bound 2253 @param st Step (or increment if you prefer) 2254 @param chunk The chunk size to block with 2255 2256 This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments. 2257 These functions are all identical apart from the types of the arguments. 2258 */ 2259 2260 void 2261 __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2262 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk ) 2263 { 2264 KMP_COUNT_BLOCK(OMP_FOR_dynamic); 2265 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2266 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2267 } 2268 /*! 2269 See @ref __kmpc_dispatch_init_4 2270 */ 2271 void 2272 __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2273 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk ) 2274 { 2275 KMP_COUNT_BLOCK(OMP_FOR_dynamic); 2276 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2277 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2278 } 2279 2280 /*! 2281 See @ref __kmpc_dispatch_init_4 2282 */ 2283 void 2284 __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2285 kmp_int64 lb, kmp_int64 ub, 2286 kmp_int64 st, kmp_int64 chunk ) 2287 { 2288 KMP_COUNT_BLOCK(OMP_FOR_dynamic); 2289 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2290 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2291 } 2292 2293 /*! 2294 See @ref __kmpc_dispatch_init_4 2295 */ 2296 void 2297 __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2298 kmp_uint64 lb, kmp_uint64 ub, 2299 kmp_int64 st, kmp_int64 chunk ) 2300 { 2301 KMP_COUNT_BLOCK(OMP_FOR_dynamic); 2302 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2303 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2304 } 2305 2306 /*! 
2307 See @ref __kmpc_dispatch_init_4 2308 2309 These functions differ from the __kmpc_dispatch_init set in that they are called for the 2310 composite distribute parallel for construct. Thus, before dispatching the regular 2311 iterations, the per-team iteration space must be computed. 2312 2313 These functions are all identical apart from the types of the arguments. 2314 */ 2315 void 2316 __kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2317 kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk ) 2318 { 2319 KMP_COUNT_BLOCK(OMP_FOR_dynamic); 2320 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2321 __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st ); 2322 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2323 } 2324 2325 void 2326 __kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2327 kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk ) 2328 { 2329 KMP_COUNT_BLOCK(OMP_FOR_dynamic); 2330 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2331 __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st ); 2332 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2333 } 2334 2335 void 2336 __kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2337 kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk ) 2338 { 2339 KMP_COUNT_BLOCK(OMP_FOR_dynamic); 2340 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2341 __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st ); 2342 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2343 } 2344 2345 void 2346 __kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2347 kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk ) 2348 { 2349 KMP_COUNT_BLOCK(OMP_FOR_dynamic); 2350 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2351 __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st ); 2352 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2353 } 2354 2355 /*! 2356 @param loc Source code location 2357 @param gtid Global thread id 2358 @param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise 2359 @param p_lb Pointer to the lower bound for the next chunk of work 2360 @param p_ub Pointer to the upper bound for the next chunk of work 2361 @param p_st Pointer to the stride for the next chunk of work 2362 @return one if there is work to be done, zero otherwise 2363 2364 Get the next dynamically allocated chunk of work for this thread. 2365 If there is no more work, then the lb, ub and stride need not be modified. 2366 */ 2367 int 2368 __kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2369 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st ) 2370 { 2371 return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st ); 2372 } 2373 2374 /*! 2375 See @ref __kmpc_dispatch_next_4 2376 */ 2377 int 2378 __kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2379 kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st ) 2380 { 2381 return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st ); 2382 } 2383 2384 /*!
2385 See @ref __kmpc_dispatch_next_4 2386 */ 2387 int 2388 __kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2389 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st ) 2390 { 2391 return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st ); 2392 } 2393 2394 /*! 2395 See @ref __kmpc_dispatch_next_4 2396 */ 2397 int 2398 __kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2399 kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st ) 2400 { 2401 return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st ); 2402 } 2403 2404 /*! 2405 @param loc Source code location 2406 @param gtid Global thread id 2407 2408 Mark the end of a dynamic loop. 2409 */ 2410 void 2411 __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid ) 2412 { 2413 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc ); 2414 } 2415 2416 /*! 2417 See @ref __kmpc_dispatch_fini_4 2418 */ 2419 void 2420 __kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid ) 2421 { 2422 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc ); 2423 } 2424 2425 /*! 2426 See @ref __kmpc_dispatch_fini_4 2427 */ 2428 void 2429 __kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid ) 2430 { 2431 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc ); 2432 } 2433 2434 /*! 2435 See @ref __kmpc_dispatch_fini_4 2436 */ 2437 void 2438 __kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid ) 2439 { 2440 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc ); 2441 } 2442 /*! @} */ 2443 2444 //----------------------------------------------------------------------------------------- 2445 //Non-template routines from kmp_dispatch.c used in other sources 2446 2447 kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) { 2448 return value == checker; 2449 } 2450 2451 kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) { 2452 return value != checker; 2453 } 2454 2455 kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) { 2456 return value < checker; 2457 } 2458 2459 kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) { 2460 return value >= checker; 2461 } 2462 2463 kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) { 2464 return value <= checker; 2465 } 2466 kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) { 2467 return value == checker; 2468 } 2469 2470 kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) { 2471 return value != checker; 2472 } 2473 2474 kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) { 2475 return value < checker; 2476 } 2477 2478 kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) { 2479 return value >= checker; 2480 } 2481 2482 kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) { 2483 return value <= checker; 2484 } 2485 2486 kmp_uint32 2487 __kmp_wait_yield_4(volatile kmp_uint32 * spinner, 2488 kmp_uint32 checker, 2489 kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 ) 2490 , void * obj // Higher-level synchronization object, or NULL. 2491 ) 2492 { 2493 // note: we may not belong to a team at this point 2494 register volatile kmp_uint32 * spin = spinner; 2495 register kmp_uint32 check = checker; 2496 register kmp_uint32 spins; 2497 register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred; 2498 register kmp_uint32 r; 2499 2500 KMP_FSYNC_SPIN_INIT( obj, (void*) spin ); 2501 KMP_INIT_YIELD( spins ); 2502 // main wait spin loop 2503 while(!f(r = TCR_4(*spin), check)) { 2504 KMP_FSYNC_SPIN_PREPARE( obj ); 2505 /* GEH - remove this since it was accidentally introduced when kmp_wait was split. 
2506 It causes problems with infinite recursion because of exit lock */ 2507 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 2508 __kmp_abort_thread(); */ 2509 2510 /* if we have waited a bit, or are oversubscribed, yield */ 2511 /* pause is in the following code */ 2512 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc ); 2513 KMP_YIELD_SPIN( spins ); 2514 } 2515 KMP_FSYNC_SPIN_ACQUIRED( obj ); 2516 return r; 2517 } 2518 2519 kmp_uint64 2520 __kmp_wait_yield_8( volatile kmp_uint64 * spinner, 2521 kmp_uint64 checker, 2522 kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 ) 2523 , void * obj // Higher-level synchronization object, or NULL. 2524 ) 2525 { 2526 // note: we may not belong to a team at this point 2527 register volatile kmp_uint64 * spin = spinner; 2528 register kmp_uint64 check = checker; 2529 register kmp_uint32 spins; 2530 register kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred; 2531 register kmp_uint64 r; 2532 2533 KMP_FSYNC_SPIN_INIT( obj, (void*) spin ); 2534 KMP_INIT_YIELD( spins ); 2535 // main wait spin loop 2536 while(!f(r = *spin, check)) 2537 { 2538 KMP_FSYNC_SPIN_PREPARE( obj ); 2539 /* GEH - remove this since it was accidentally introduced when kmp_wait was split. 2540 It causes problems with infinite recursion because of exit lock */ 2541 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 2542 __kmp_abort_thread(); */ 2543 2544 // if we are oversubscribed, 2545 // or have waited a bit (and KMP_LIBRARY=throughput), then yield 2546 // pause is in the following code 2547 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc ); 2548 KMP_YIELD_SPIN( spins ); 2549 } 2550 KMP_FSYNC_SPIN_ACQUIRED( obj ); 2551 return r; 2552 } 2553 2554 } // extern "C" 2555 2556 #ifdef KMP_GOMP_COMPAT 2557 2558 void 2559 __kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2560 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, 2561 kmp_int32 chunk, int push_ws ) 2562 { 2563 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, 2564 push_ws ); 2565 } 2566 2567 void 2568 __kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2569 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, 2570 kmp_int32 chunk, int push_ws ) 2571 { 2572 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, 2573 push_ws ); 2574 } 2575 2576 void 2577 __kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2578 kmp_int64 lb, kmp_int64 ub, kmp_int64 st, 2579 kmp_int64 chunk, int push_ws ) 2580 { 2581 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, 2582 push_ws ); 2583 } 2584 2585 void 2586 __kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2587 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, 2588 kmp_int64 chunk, int push_ws ) 2589 { 2590 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, 2591 push_ws ); 2592 } 2593 2594 void 2595 __kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid ) 2596 { 2597 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc ); 2598 } 2599 2600 void 2601 __kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid ) 2602 { 2603 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc ); 2604 } 2605 2606 void 2607 __kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid ) 2608 { 2609 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc ); 2610 } 2611 2612 void 2613 __kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid ) 2614 { 2615 __kmp_dispatch_finish_chunk< kmp_uint64 >(
gtid, loc ); 2616 } 2617 2618 #endif /* KMP_GOMP_COMPAT */ 2619 2620 /* ------------------------------------------------------------------------ */ 2621 /* ------------------------------------------------------------------------ */ 2622 2623
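/* ------------------------------------------------------------------------ */
/* Illustrative usage sketch (editorial addition, guarded by #if 0 so it is
   never compiled): roughly how compiler-generated code is expected to drive
   the __kmpc_dispatch_* entry points above for a loop such as
       #pragma omp for schedule(dynamic, 4)
       for ( int i = 0; i < n; ++i ) body( i );
   The helper body() and the chunk size 4 are assumptions made only for this
   example; the real call sequence is emitted by the compiler. */
#if 0
extern void body( kmp_int32 i );   // hypothetical iteration body

static void
example_dynamic_loop( ident_t *loc, kmp_int32 gtid, kmp_int32 n )
{
    kmp_int32 lb, ub, st, last;
    // register the loop with the runtime: bounds 0..n-1, stride 1, chunk 4
    __kmpc_dispatch_init_4( loc, gtid, kmp_sch_dynamic_chunked, 0, n - 1, 1, 4 );
    // repeatedly request chunks; a zero return value means no work is left
    while ( __kmpc_dispatch_next_4( loc, gtid, &last, &lb, &ub, &st ) ) {
        for ( kmp_int32 i = lb; i <= ub; i += st ) {
            body( i );
        }
    }
    // depending on the loop (e.g. ordered), the compiler may also emit
    // __kmpc_dispatch_fini_4(); see its doc comment above
}
#endif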