1 /* 2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch. 3 */ 4 5 6 //===----------------------------------------------------------------------===// 7 // 8 // The LLVM Compiler Infrastructure 9 // 10 // This file is dual licensed under the MIT and the University of Illinois Open 11 // Source Licenses. See LICENSE.txt for details. 12 // 13 //===----------------------------------------------------------------------===// 14 15 16 /* 17 * Dynamic scheduling initialization and dispatch. 18 * 19 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however 20 * it may change values between parallel regions. __kmp_max_nth 21 * is the largest value __kmp_nth may take, 1 is the smallest. 22 * 23 */ 24 25 /* ------------------------------------------------------------------------ */ 26 /* ------------------------------------------------------------------------ */ 27 28 #include "kmp.h" 29 #include "kmp_i18n.h" 30 #include "kmp_itt.h" 31 #include "kmp_str.h" 32 #include "kmp_error.h" 33 #include "kmp_stats.h" 34 #if KMP_OS_WINDOWS && KMP_ARCH_X86 35 #include <float.h> 36 #endif 37 38 /* ------------------------------------------------------------------------ */ 39 /* ------------------------------------------------------------------------ */ 40 41 // template for type limits 42 template< typename T > 43 struct i_maxmin { 44 static const T mx; 45 static const T mn; 46 }; 47 template<> 48 struct i_maxmin< int > { 49 static const int mx = 0x7fffffff; 50 static const int mn = 0x80000000; 51 }; 52 template<> 53 struct i_maxmin< unsigned int > { 54 static const unsigned int mx = 0xffffffff; 55 static const unsigned int mn = 0x00000000; 56 }; 57 template<> 58 struct i_maxmin< long long > { 59 static const long long mx = 0x7fffffffffffffffLL; 60 static const long long mn = 0x8000000000000000LL; 61 }; 62 template<> 63 struct i_maxmin< unsigned long long > { 64 static const unsigned long long mx = 0xffffffffffffffffLL; 65 static const unsigned long long mn = 0x0000000000000000LL; 66 }; 67 //------------------------------------------------------------------------- 68 69 #ifdef KMP_STATIC_STEAL_ENABLED 70 71 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types 72 template< typename T > 73 struct dispatch_private_infoXX_template { 74 typedef typename traits_t< T >::unsigned_t UT; 75 typedef typename traits_t< T >::signed_t ST; 76 UT count; // unsigned 77 T ub; 78 /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */ 79 T lb; 80 ST st; // signed 81 UT tc; // unsigned 82 T static_steal_counter; // for static_steal only; maybe better to put after ub 83 84 /* parm[1-4] are used in different ways by different scheduling algorithms */ 85 86 // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on ) 87 // a) parm3 is properly aligned and 88 // b) all parm1-4 are in the same cache line. 89 // Because of parm1-4 are used together, performance seems to be better 90 // if they are in the same line (not measured though). 
91 92 struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4 93 T parm1; 94 T parm2; 95 T parm3; 96 T parm4; 97 }; 98 99 UT ordered_lower; // unsigned 100 UT ordered_upper; // unsigned 101 #if KMP_OS_WINDOWS 102 T last_upper; 103 #endif /* KMP_OS_WINDOWS */ 104 }; 105 106 #else /* KMP_STATIC_STEAL_ENABLED */ 107 108 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types 109 template< typename T > 110 struct dispatch_private_infoXX_template { 111 typedef typename traits_t< T >::unsigned_t UT; 112 typedef typename traits_t< T >::signed_t ST; 113 T lb; 114 T ub; 115 ST st; // signed 116 UT tc; // unsigned 117 118 T parm1; 119 T parm2; 120 T parm3; 121 T parm4; 122 123 UT count; // unsigned 124 125 UT ordered_lower; // unsigned 126 UT ordered_upper; // unsigned 127 #if KMP_OS_WINDOWS 128 T last_upper; 129 #endif /* KMP_OS_WINDOWS */ 130 }; 131 132 #endif /* KMP_STATIC_STEAL_ENABLED */ 133 134 // replaces dispatch_private_info structure and dispatch_private_info_t type 135 template< typename T > 136 struct KMP_ALIGN_CACHE dispatch_private_info_template { 137 // duplicate alignment here, otherwise size of structure is not correct in our compiler 138 union KMP_ALIGN_CACHE private_info_tmpl { 139 dispatch_private_infoXX_template< T > p; 140 dispatch_private_info64_t p64; 141 } u; 142 enum sched_type schedule; /* scheduling algorithm */ 143 kmp_uint32 ordered; /* ordered clause specified */ 144 kmp_uint32 ordered_bumped; 145 kmp_int32 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making order 146 dispatch_private_info * next; /* stack of buffers for nest of serial regions */ 147 kmp_uint32 nomerge; /* don't merge iters if serialized */ 148 kmp_uint32 type_size; 149 enum cons_type pushed_ws; 150 }; 151 152 153 // replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types 154 template< typename UT > 155 struct dispatch_shared_infoXX_template { 156 /* chunk index under dynamic, number of idle threads under static-steal; 157 iteration index otherwise */ 158 volatile UT iteration; 159 volatile UT num_done; 160 volatile UT ordered_iteration; 161 UT ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar 162 }; 163 164 // replaces dispatch_shared_info structure and dispatch_shared_info_t type 165 template< typename UT > 166 struct dispatch_shared_info_template { 167 // we need union here to keep the structure size 168 union shared_info_tmpl { 169 dispatch_shared_infoXX_template< UT > s; 170 dispatch_shared_info64_t s64; 171 } u; 172 volatile kmp_uint32 buffer_index; 173 }; 174 175 /* ------------------------------------------------------------------------ */ 176 /* ------------------------------------------------------------------------ */ 177 178 #undef USE_TEST_LOCKS 179 180 // test_then_add template (general template should NOT be used) 181 template< typename T > 182 static __forceinline T 183 test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); }; 184 185 template<> 186 __forceinline kmp_int32 187 test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d ) 188 { 189 kmp_int32 r; 190 r = KMP_TEST_THEN_ADD32( p, d ); 191 return r; 192 } 193 194 template<> 195 __forceinline kmp_int64 196 test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d ) 197 { 198 kmp_int64 r; 199 r = KMP_TEST_THEN_ADD64( p, d ); 200 return r; 201 } 202 203 // test_then_inc_acq template (general template should NOT be used) 204 template< typename T > 205 static __forceinline T 
206 test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); }; 207 208 template<> 209 __forceinline kmp_int32 210 test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p ) 211 { 212 kmp_int32 r; 213 r = KMP_TEST_THEN_INC_ACQ32( p ); 214 return r; 215 } 216 217 template<> 218 __forceinline kmp_int64 219 test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p ) 220 { 221 kmp_int64 r; 222 r = KMP_TEST_THEN_INC_ACQ64( p ); 223 return r; 224 } 225 226 // test_then_inc template (general template should NOT be used) 227 template< typename T > 228 static __forceinline T 229 test_then_inc( volatile T *p ) { KMP_ASSERT(0); }; 230 231 template<> 232 __forceinline kmp_int32 233 test_then_inc< kmp_int32 >( volatile kmp_int32 *p ) 234 { 235 kmp_int32 r; 236 r = KMP_TEST_THEN_INC32( p ); 237 return r; 238 } 239 240 template<> 241 __forceinline kmp_int64 242 test_then_inc< kmp_int64 >( volatile kmp_int64 *p ) 243 { 244 kmp_int64 r; 245 r = KMP_TEST_THEN_INC64( p ); 246 return r; 247 } 248 249 // compare_and_swap template (general template should NOT be used) 250 template< typename T > 251 static __forceinline kmp_int32 252 compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); }; 253 254 template<> 255 __forceinline kmp_int32 256 compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s ) 257 { 258 return KMP_COMPARE_AND_STORE_REL32( p, c, s ); 259 } 260 261 template<> 262 __forceinline kmp_int32 263 compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s ) 264 { 265 return KMP_COMPARE_AND_STORE_REL64( p, c, s ); 266 } 267 268 /* 269 Spin wait loop that first does pause, then yield. 270 Waits until function returns non-zero when called with *spinner and check. 271 Does NOT put threads to sleep. 272 #if USE_ITT_BUILD 273 Arguments: 274 obj -- is higher-level synchronization object to report to ittnotify. It is used to report 275 locks consistently. For example, if lock is acquired immediately, its address is 276 reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, it lock cannot be acquired 277 immediately and lock routine calls to KMP_WAIT_YIELD(), the later should report the same 278 address, not an address of low-level spinner. 279 #endif // USE_ITT_BUILD 280 */ 281 template< typename UT > 282 // ToDo: make inline function (move to header file for icl) 283 static UT // unsigned 4- or 8-byte type 284 __kmp_wait_yield( volatile UT * spinner, 285 UT checker, 286 kmp_uint32 (* pred)( UT, UT ) 287 USE_ITT_BUILD_ARG(void * obj) // Higher-level synchronization object, or NULL. 288 ) 289 { 290 // note: we may not belong to a team at this point 291 register volatile UT * spin = spinner; 292 register UT check = checker; 293 register kmp_uint32 spins; 294 register kmp_uint32 (*f) ( UT, UT ) = pred; 295 register UT r; 296 297 KMP_FSYNC_SPIN_INIT( obj, (void*) spin ); 298 KMP_INIT_YIELD( spins ); 299 // main wait spin loop 300 while(!f(r = *spin, check)) 301 { 302 KMP_FSYNC_SPIN_PREPARE( obj ); 303 /* GEH - remove this since it was accidentally introduced when kmp_wait was split. 
304 It causes problems with infinite recursion because of exit lock */ 305 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 306 __kmp_abort_thread(); */ 307 308 // if we are oversubscribed, 309 // or have waited a bit (and KMP_LIBRARY=throughput, then yield 310 // pause is in the following code 311 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc ); 312 KMP_YIELD_SPIN( spins ); 313 } 314 KMP_FSYNC_SPIN_ACQUIRED( obj ); 315 return r; 316 } 317 318 template< typename UT > 319 static kmp_uint32 __kmp_eq( UT value, UT checker) { 320 return value == checker; 321 } 322 323 template< typename UT > 324 static kmp_uint32 __kmp_neq( UT value, UT checker) { 325 return value != checker; 326 } 327 328 template< typename UT > 329 static kmp_uint32 __kmp_lt( UT value, UT checker) { 330 return value < checker; 331 } 332 333 template< typename UT > 334 static kmp_uint32 __kmp_ge( UT value, UT checker) { 335 return value >= checker; 336 } 337 338 template< typename UT > 339 static kmp_uint32 __kmp_le( UT value, UT checker) { 340 return value <= checker; 341 } 342 343 344 /* ------------------------------------------------------------------------ */ 345 /* ------------------------------------------------------------------------ */ 346 347 static void 348 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 349 { 350 kmp_info_t *th; 351 352 KMP_DEBUG_ASSERT( gtid_ref ); 353 354 if ( __kmp_env_consistency_check ) { 355 th = __kmp_threads[*gtid_ref]; 356 if ( th -> th.th_root -> r.r_active 357 && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) { 358 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL ); 359 } 360 } 361 } 362 363 template< typename UT > 364 static void 365 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 366 { 367 typedef typename traits_t< UT >::signed_t ST; 368 dispatch_private_info_template< UT > * pr; 369 370 int gtid = *gtid_ref; 371 // int cid = *cid_ref; 372 kmp_info_t *th = __kmp_threads[ gtid ]; 373 KMP_DEBUG_ASSERT( th -> th.th_dispatch ); 374 375 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) ); 376 if ( __kmp_env_consistency_check ) { 377 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 378 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 379 if ( pr -> pushed_ws != ct_none ) { 380 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL ); 381 } 382 } 383 384 if ( ! th -> th.th_team -> t.t_serialized ) { 385 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 386 ( th -> th.th_dispatch -> th_dispatch_sh_current ); 387 UT lower; 388 389 if ( ! __kmp_env_consistency_check ) { 390 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 391 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 392 } 393 lower = pr->u.p.ordered_lower; 394 395 #if ! 
defined( KMP_GOMP_COMPAT ) 396 if ( __kmp_env_consistency_check ) { 397 if ( pr->ordered_bumped ) { 398 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons; 399 __kmp_error_construct2( 400 kmp_i18n_msg_CnsMultipleNesting, 401 ct_ordered_in_pdo, loc_ref, 402 & p->stack_data[ p->w_top ] 403 ); 404 } 405 } 406 #endif /* !defined(KMP_GOMP_COMPAT) */ 407 408 KMP_MB(); 409 #ifdef KMP_DEBUG 410 { 411 const char * buff; 412 // create format specifiers before the debug output 413 buff = __kmp_str_format( 414 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n", 415 traits_t< UT >::spec, traits_t< UT >::spec ); 416 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 417 __kmp_str_free( &buff ); 418 } 419 #endif 420 421 __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 422 USE_ITT_BUILD_ARG( NULL ) 423 ); 424 KMP_MB(); /* is this necessary? */ 425 #ifdef KMP_DEBUG 426 { 427 const char * buff; 428 // create format specifiers before the debug output 429 buff = __kmp_str_format( 430 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n", 431 traits_t< UT >::spec, traits_t< UT >::spec ); 432 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 433 __kmp_str_free( &buff ); 434 } 435 #endif 436 } 437 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) ); 438 } 439 440 static void 441 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 442 { 443 kmp_info_t *th; 444 445 if ( __kmp_env_consistency_check ) { 446 th = __kmp_threads[*gtid_ref]; 447 if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) { 448 __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref ); 449 } 450 } 451 } 452 453 template< typename UT > 454 static void 455 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 456 { 457 typedef typename traits_t< UT >::signed_t ST; 458 dispatch_private_info_template< UT > * pr; 459 460 int gtid = *gtid_ref; 461 // int cid = *cid_ref; 462 kmp_info_t *th = __kmp_threads[ gtid ]; 463 KMP_DEBUG_ASSERT( th -> th.th_dispatch ); 464 465 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) ); 466 if ( __kmp_env_consistency_check ) { 467 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 468 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 469 if ( pr -> pushed_ws != ct_none ) { 470 __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref ); 471 } 472 } 473 474 if ( ! th -> th.th_team -> t.t_serialized ) { 475 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 476 ( th -> th.th_dispatch -> th_dispatch_sh_current ); 477 478 if ( ! __kmp_env_consistency_check ) { 479 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 480 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 481 } 482 483 KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration ); 484 #if ! defined( KMP_GOMP_COMPAT ) 485 if ( __kmp_env_consistency_check ) { 486 if ( pr->ordered_bumped != 0 ) { 487 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons; 488 /* How to test it? - OM */ 489 __kmp_error_construct2( 490 kmp_i18n_msg_CnsMultipleNesting, 491 ct_ordered_in_pdo, loc_ref, 492 & p->stack_data[ p->w_top ] 493 ); 494 } 495 } 496 #endif /* !defined(KMP_GOMP_COMPAT) */ 497 498 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 499 500 pr->ordered_bumped += 1; 501 502 KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n", 503 gtid, pr->ordered_bumped ) ); 504 505 KMP_MB(); /* Flush all pending memory write invalidates. */ 506 507 /* TODO use general release procedure? */ 508 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration ); 509 510 KMP_MB(); /* Flush all pending memory write invalidates. */ 511 } 512 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) ); 513 } 514 515 /* Computes and returns x to the power of y, where y must a non-negative integer */ 516 template< typename UT > 517 static __forceinline long double 518 __kmp_pow(long double x, UT y) { 519 long double s=1.0L; 520 521 KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0); 522 //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned 523 while(y) { 524 if ( y & 1 ) 525 s *= x; 526 x *= x; 527 y >>= 1; 528 } 529 return s; 530 } 531 532 /* Computes and returns the number of unassigned iterations after idx chunks have been assigned 533 (the total number of unassigned iterations in chunks with index greater than or equal to idx). 534 __forceinline seems to be broken so that if we __forceinline this function, the behavior is wrong 535 (one of the unit tests, sch_guided_analytical_basic.cpp, fails) 536 */ 537 template< typename T > 538 static __inline typename traits_t< T >::unsigned_t 539 __kmp_dispatch_guided_remaining( 540 T tc, 541 typename traits_t< T >::floating_t base, 542 typename traits_t< T >::unsigned_t idx 543 ) { 544 /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at 545 least for ICL 8.1, long double arithmetic may not really have 546 long double precision, even with /Qlong_double. Currently, we 547 workaround that in the caller code, by manipulating the FPCW for 548 Windows* OS on IA-32 architecture. The lack of precision is not 549 expected to be a correctness issue, though. 550 */ 551 typedef typename traits_t< T >::unsigned_t UT; 552 553 long double x = tc * __kmp_pow< UT >(base, idx); 554 UT r = (UT) x; 555 if ( x == r ) 556 return r; 557 return r + 1; 558 } 559 560 // Parameters of the guided-iterative algorithm: 561 // p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic 562 // p3 = 1 / ( n * nproc ) // remaining iterations multiplier 563 // by default n = 2. For example with n = 3 the chunks distribution will be more flat. 564 // With n = 1 first chunk is the same as for static schedule, e.g. trip / nproc. 
565 static int guided_int_param = 2; 566 static double guided_flt_param = 0.5;// = 1.0 / guided_int_param; 567 568 // UT - unsigned flavor of T, ST - signed flavor of T, 569 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8 570 template< typename T > 571 static void 572 __kmp_dispatch_init( 573 ident_t * loc, 574 int gtid, 575 enum sched_type schedule, 576 T lb, 577 T ub, 578 typename traits_t< T >::signed_t st, 579 typename traits_t< T >::signed_t chunk, 580 int push_ws 581 ) { 582 typedef typename traits_t< T >::unsigned_t UT; 583 typedef typename traits_t< T >::signed_t ST; 584 typedef typename traits_t< T >::floating_t DBL; 585 static const int ___kmp_size_type = sizeof( UT ); 586 587 int active; 588 T tc; 589 kmp_info_t * th; 590 kmp_team_t * team; 591 kmp_uint32 my_buffer_index; 592 dispatch_private_info_template< T > * pr; 593 dispatch_shared_info_template< UT > volatile * sh; 594 595 KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) ); 596 KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) ); 597 598 if ( ! TCR_4( __kmp_init_parallel ) ) 599 __kmp_parallel_initialize(); 600 601 #if INCLUDE_SSC_MARKS 602 SSC_MARK_DISPATCH_INIT(); 603 #endif 604 #ifdef KMP_DEBUG 605 { 606 const char * buff; 607 // create format specifiers before the debug output 608 buff = __kmp_str_format( 609 "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n", 610 traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 611 KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) ); 612 __kmp_str_free( &buff ); 613 } 614 #endif 615 /* setup data */ 616 th = __kmp_threads[ gtid ]; 617 team = th -> th.th_team; 618 active = ! team -> t.t_serialized; 619 th->th.th_ident = loc; 620 621 #if USE_ITT_BUILD 622 kmp_uint64 cur_chunk = chunk; 623 #endif 624 if ( ! active ) { 625 pr = reinterpret_cast< dispatch_private_info_template< T >* > 626 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */ 627 } else { 628 KMP_DEBUG_ASSERT( th->th.th_dispatch == 629 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 630 631 my_buffer_index = th->th.th_dispatch->th_disp_index ++; 632 633 /* What happens when number of threads changes, need to resize buffer? 
*/ 634 pr = reinterpret_cast< dispatch_private_info_template< T > * > 635 ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] ); 636 sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * > 637 ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] ); 638 } 639 640 /* Pick up the nomerge/ordered bits from the scheduling type */ 641 if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) { 642 pr->nomerge = TRUE; 643 schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); 644 } else { 645 pr->nomerge = FALSE; 646 } 647 pr->type_size = ___kmp_size_type; // remember the size of variables 648 if ( kmp_ord_lower & schedule ) { 649 pr->ordered = TRUE; 650 schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); 651 } else { 652 pr->ordered = FALSE; 653 } 654 if ( schedule == kmp_sch_static ) { 655 schedule = __kmp_static; 656 } else { 657 if ( schedule == kmp_sch_runtime ) { 658 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified) 659 schedule = team -> t.t_sched.r_sched_type; 660 // Detail the schedule if needed (global controls are differentiated appropriately) 661 if ( schedule == kmp_sch_guided_chunked ) { 662 schedule = __kmp_guided; 663 } else if ( schedule == kmp_sch_static ) { 664 schedule = __kmp_static; 665 } 666 // Use the chunk size specified by OMP_SCHEDULE (or default if not specified) 667 chunk = team -> t.t_sched.chunk; 668 669 #ifdef KMP_DEBUG 670 { 671 const char * buff; 672 // create format specifiers before the debug output 673 buff = __kmp_str_format( 674 "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n", 675 traits_t< ST >::spec ); 676 KD_TRACE(10, ( buff, gtid, schedule, chunk ) ); 677 __kmp_str_free( &buff ); 678 } 679 #endif 680 } else { 681 if ( schedule == kmp_sch_guided_chunked ) { 682 schedule = __kmp_guided; 683 } 684 if ( chunk <= 0 ) { 685 chunk = KMP_DEFAULT_CHUNK; 686 } 687 } 688 689 if ( schedule == kmp_sch_auto ) { 690 // mapping and differentiation: in the __kmp_do_serial_initialize() 691 schedule = __kmp_auto; 692 #ifdef KMP_DEBUG 693 { 694 const char * buff; 695 // create format specifiers before the debug output 696 buff = __kmp_str_format( 697 "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n", 698 traits_t< ST >::spec ); 699 KD_TRACE(10, ( buff, gtid, schedule, chunk ) ); 700 __kmp_str_free( &buff ); 701 } 702 #endif 703 } 704 705 /* guided analytical not safe for too many threads */ 706 if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) { 707 schedule = kmp_sch_guided_iterative_chunked; 708 KMP_WARNING( DispatchManyThreads ); 709 } 710 pr->u.p.parm1 = chunk; 711 } 712 KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper), 713 "unknown scheduling type" ); 714 715 pr->u.p.count = 0; 716 717 if ( __kmp_env_consistency_check ) { 718 if ( st == 0 ) { 719 __kmp_error_construct( 720 kmp_i18n_msg_CnsLoopIncrZeroProhibited, 721 ( pr->ordered ? 
ct_pdo_ordered : ct_pdo ), loc 722 ); 723 } 724 } 725 726 tc = ( ub - lb + st ); 727 if ( st != 1 ) { 728 if ( st < 0 ) { 729 if ( lb < ub ) { 730 tc = 0; // zero-trip 731 } else { // lb >= ub 732 tc = (ST)tc / st; // convert to signed division 733 } 734 } else { // st > 0 735 if ( ub < lb ) { 736 tc = 0; // zero-trip 737 } else { // lb >= ub 738 tc /= st; 739 } 740 } 741 } else if ( ub < lb ) { // st == 1 742 tc = 0; // zero-trip 743 } 744 745 pr->u.p.lb = lb; 746 pr->u.p.ub = ub; 747 pr->u.p.st = st; 748 pr->u.p.tc = tc; 749 750 #if KMP_OS_WINDOWS 751 pr->u.p.last_upper = ub + st; 752 #endif /* KMP_OS_WINDOWS */ 753 754 /* NOTE: only the active parallel region(s) has active ordered sections */ 755 756 if ( active ) { 757 if ( pr->ordered == 0 ) { 758 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error; 759 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error; 760 } else { 761 pr->ordered_bumped = 0; 762 763 pr->u.p.ordered_lower = 1; 764 pr->u.p.ordered_upper = 0; 765 766 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >; 767 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >; 768 } 769 } 770 771 if ( __kmp_env_consistency_check ) { 772 enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo; 773 if ( push_ws ) { 774 __kmp_push_workshare( gtid, ws, loc ); 775 pr->pushed_ws = ws; 776 } else { 777 __kmp_check_workshare( gtid, ws, loc ); 778 pr->pushed_ws = ct_none; 779 } 780 } 781 782 switch ( schedule ) { 783 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 784 case kmp_sch_static_steal: 785 { 786 T nproc = team->t.t_nproc; 787 T ntc, init; 788 789 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) ); 790 791 ntc = (tc % chunk ? 1 : 0) + tc / chunk; 792 if ( nproc > 1 && ntc >= nproc ) { 793 T id = __kmp_tid_from_gtid(gtid); 794 T small_chunk, extras; 795 796 small_chunk = ntc / nproc; 797 extras = ntc % nproc; 798 799 init = id * small_chunk + ( id < extras ? id : extras ); 800 pr->u.p.count = init; 801 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 ); 802 803 pr->u.p.parm2 = lb; 804 //pr->pfields.parm3 = 0; // it's not used in static_steal 805 pr->u.p.parm4 = id; 806 pr->u.p.st = st; 807 break; 808 } else { 809 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n", 810 gtid ) ); 811 schedule = kmp_sch_static_balanced; 812 /* too few iterations: fall-through to kmp_sch_static_balanced */ 813 } // if 814 /* FALL-THROUGH to static balanced */ 815 } // case 816 #endif 817 case kmp_sch_static_balanced: 818 { 819 T nproc = team->t.t_nproc; 820 T init, limit; 821 822 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n", 823 gtid ) ); 824 825 if ( nproc > 1 ) { 826 T id = __kmp_tid_from_gtid(gtid); 827 828 if ( tc < nproc ) { 829 if ( id < tc ) { 830 init = id; 831 limit = id; 832 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ 833 } else { 834 pr->u.p.count = 1; /* means no more chunks to execute */ 835 pr->u.p.parm1 = FALSE; 836 break; 837 } 838 } else { 839 T small_chunk = tc / nproc; 840 T extras = tc % nproc; 841 init = id * small_chunk + (id < extras ? id : extras); 842 limit = init + small_chunk - (id < extras ? 
0 : 1); 843 pr->u.p.parm1 = (id == nproc - 1); 844 } 845 } else { 846 if ( tc > 0 ) { 847 init = 0; 848 limit = tc - 1; 849 pr->u.p.parm1 = TRUE; 850 } else { 851 // zero trip count 852 pr->u.p.count = 1; /* means no more chunks to execute */ 853 pr->u.p.parm1 = FALSE; 854 break; 855 } 856 } 857 #if USE_ITT_BUILD 858 // Calculate chunk for metadata report 859 if( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 ) { 860 cur_chunk = limit - init + 1; 861 } 862 #endif 863 if ( st == 1 ) { 864 pr->u.p.lb = lb + init; 865 pr->u.p.ub = lb + limit; 866 } else { 867 T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound 868 pr->u.p.lb = lb + init * st; 869 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly 870 if ( st > 0 ) { 871 pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp ); 872 } else { 873 pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp ); 874 } 875 } 876 if ( pr->ordered ) { 877 pr->u.p.ordered_lower = init; 878 pr->u.p.ordered_upper = limit; 879 } 880 break; 881 } // case 882 case kmp_sch_guided_iterative_chunked : 883 { 884 T nproc = team->t.t_nproc; 885 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid)); 886 887 if ( nproc > 1 ) { 888 if ( (2L * chunk + 1 ) * nproc >= tc ) { 889 /* chunk size too large, switch to dynamic */ 890 schedule = kmp_sch_dynamic_chunked; 891 } else { 892 // when remaining iters become less than parm2 - switch to dynamic 893 pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 ); 894 *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4 895 } 896 } else { 897 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid)); 898 schedule = kmp_sch_static_greedy; 899 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 900 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid)); 901 pr->u.p.parm1 = tc; 902 } // if 903 } // case 904 break; 905 case kmp_sch_guided_analytical_chunked: 906 { 907 T nproc = team->t.t_nproc; 908 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid)); 909 910 if ( nproc > 1 ) { 911 if ( (2L * chunk + 1 ) * nproc >= tc ) { 912 /* chunk size too large, switch to dynamic */ 913 schedule = kmp_sch_dynamic_chunked; 914 } else { 915 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 916 DBL x; 917 918 #if KMP_OS_WINDOWS && KMP_ARCH_X86 919 /* Linux* OS already has 64-bit computation by default for 920 long double, and on Windows* OS on Intel(R) 64, 921 /Qlong_double doesn't work. On Windows* OS 922 on IA-32 architecture, we need to set precision to 923 64-bit instead of the default 53-bit. Even though long 924 double doesn't work on Windows* OS on Intel(R) 64, the 925 resulting lack of precision is not expected to impact 926 the correctness of the algorithm, but this has not been 927 mathematically proven. 
928 */ 929 // save original FPCW and set precision to 64-bit, as 930 // Windows* OS on IA-32 architecture defaults to 53-bit 931 unsigned int oldFpcw = _control87(0,0); 932 _control87(_PC_64,_MCW_PC); // 0,0x30000 933 #endif 934 /* value used for comparison in solver for cross-over point */ 935 long double target = ((long double)chunk * 2 + 1) * nproc / tc; 936 937 /* crossover point--chunk indexes equal to or greater than 938 this point switch to dynamic-style scheduling */ 939 UT cross; 940 941 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 942 x = (long double)1.0 - (long double)0.5 / nproc; 943 944 #ifdef KMP_DEBUG 945 { // test natural alignment 946 struct _test_a { 947 char a; 948 union { 949 char b; 950 DBL d; 951 }; 952 } t; 953 ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; 954 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment ); 955 KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 ); 956 } 957 #endif // KMP_DEBUG 958 959 /* save the term in thread private dispatch structure */ 960 *(DBL*)&pr->u.p.parm3 = x; 961 962 /* solve for the crossover point to the nearest integer i for which C_i <= chunk */ 963 { 964 UT left, right, mid; 965 long double p; 966 967 /* estimate initial upper and lower bound */ 968 969 /* doesn't matter what value right is as long as it is positive, but 970 it affects performance of the solver 971 */ 972 right = 229; 973 p = __kmp_pow< UT >(x,right); 974 if ( p > target ) { 975 do{ 976 p *= p; 977 right <<= 1; 978 } while(p>target && right < (1<<27)); 979 left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */ 980 } else { 981 left = 0; 982 } 983 984 /* bisection root-finding method */ 985 while ( left + 1 < right ) { 986 mid = (left + right) / 2; 987 if ( __kmp_pow< UT >(x,mid) > target ) { 988 left = mid; 989 } else { 990 right = mid; 991 } 992 } // while 993 cross = right; 994 } 995 /* assert sanity of computed crossover point */ 996 KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target); 997 998 /* save the crossover point in thread private dispatch structure */ 999 pr->u.p.parm2 = cross; 1000 1001 // C75803 1002 #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) ) 1003 #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3) 1004 #else 1005 #define GUIDED_ANALYTICAL_WORKAROUND (x) 1006 #endif 1007 /* dynamic-style scheduling offset */ 1008 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk; 1009 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1010 // restore FPCW 1011 _control87(oldFpcw,_MCW_PC); 1012 #endif 1013 } // if 1014 } else { 1015 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n", 1016 gtid ) ); 1017 schedule = kmp_sch_static_greedy; 1018 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 1019 pr->u.p.parm1 = tc; 1020 } // if 1021 } // case 1022 break; 1023 case kmp_sch_static_greedy: 1024 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid)); 1025 pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ? 
1026 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc : 1027 tc; 1028 break; 1029 case kmp_sch_static_chunked : 1030 case kmp_sch_dynamic_chunked : 1031 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid)); 1032 break; 1033 case kmp_sch_trapezoidal : 1034 { 1035 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ 1036 1037 T parm1, parm2, parm3, parm4; 1038 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) ); 1039 1040 parm1 = chunk; 1041 1042 /* F : size of the first cycle */ 1043 parm2 = ( tc / (2 * team->t.t_nproc) ); 1044 1045 if ( parm2 < 1 ) { 1046 parm2 = 1; 1047 } 1048 1049 /* L : size of the last cycle. Make sure the last cycle 1050 * is not larger than the first cycle. 1051 */ 1052 if ( parm1 < 1 ) { 1053 parm1 = 1; 1054 } else if ( parm1 > parm2 ) { 1055 parm1 = parm2; 1056 } 1057 1058 /* N : number of cycles */ 1059 parm3 = ( parm2 + parm1 ); 1060 parm3 = ( 2 * tc + parm3 - 1) / parm3; 1061 1062 if ( parm3 < 2 ) { 1063 parm3 = 2; 1064 } 1065 1066 /* sigma : decreasing incr of the trapezoid */ 1067 parm4 = ( parm3 - 1 ); 1068 parm4 = ( parm2 - parm1 ) / parm4; 1069 1070 // pointless check, because parm4 >= 0 always 1071 //if ( parm4 < 0 ) { 1072 // parm4 = 0; 1073 //} 1074 1075 pr->u.p.parm1 = parm1; 1076 pr->u.p.parm2 = parm2; 1077 pr->u.p.parm3 = parm3; 1078 pr->u.p.parm4 = parm4; 1079 } // case 1080 break; 1081 1082 default: 1083 { 1084 __kmp_msg( 1085 kmp_ms_fatal, // Severity 1086 KMP_MSG( UnknownSchedTypeDetected ), // Primary message 1087 KMP_HNT( GetNewerLibrary ), // Hint 1088 __kmp_msg_null // Variadic argument list terminator 1089 ); 1090 } 1091 break; 1092 } // switch 1093 pr->schedule = schedule; 1094 if ( active ) { 1095 /* The name of this buffer should be my_buffer_index when it's free to use it */ 1096 1097 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n", 1098 gtid, my_buffer_index, sh->buffer_index) ); 1099 __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 > 1100 USE_ITT_BUILD_ARG( NULL ) 1101 ); 1102 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are 1103 // *always* 32-bit integers. 1104 KMP_MB(); /* is this necessary? */ 1105 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n", 1106 gtid, my_buffer_index, sh->buffer_index) ); 1107 1108 th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr; 1109 th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh; 1110 #if USE_ITT_BUILD 1111 if ( pr->ordered ) { 1112 __kmp_itt_ordered_init( gtid ); 1113 }; // if 1114 #endif /* USE_ITT_BUILD */ 1115 }; // if 1116 1117 #if USE_ITT_BUILD 1118 // Report loop metadata 1119 if( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 ) { 1120 kmp_uint32 tid = __kmp_tid_from_gtid( gtid ); 1121 if (KMP_MASTER_TID(tid)) { 1122 kmp_uint64 schedtype = 0; 1123 1124 switch ( schedule ) { 1125 case kmp_sch_static_chunked: 1126 case kmp_sch_static_balanced:// Chunk is calculated in the switch above 1127 break; 1128 case kmp_sch_static_greedy: 1129 cur_chunk = pr->u.p.parm1; 1130 break; 1131 case kmp_sch_dynamic_chunked: 1132 schedtype = 1; 1133 break; 1134 case kmp_sch_guided_iterative_chunked: 1135 case kmp_sch_guided_analytical_chunked: 1136 schedtype = 2; 1137 break; 1138 default: 1139 // Should we put this case under "static"? 
1140 // case kmp_sch_static_steal: 1141 schedtype = 3; 1142 break; 1143 } 1144 __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk); 1145 } 1146 } 1147 #endif /* USE_ITT_BUILD */ 1148 1149 #ifdef KMP_DEBUG 1150 { 1151 const char * buff; 1152 // create format specifiers before the debug output 1153 buff = __kmp_str_format( 1154 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \ 1155 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \ 1156 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n", 1157 traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec, 1158 traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec, 1159 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec, 1160 traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec ); 1161 KD_TRACE(10, ( buff, 1162 gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub, 1163 pr->u.p.st, pr->u.p.tc, pr->u.p.count, 1164 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1, 1165 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) ); 1166 __kmp_str_free( &buff ); 1167 } 1168 #endif 1169 #if ( KMP_STATIC_STEAL_ENABLED ) 1170 if ( ___kmp_size_type < 8 ) { 1171 // It cannot be guaranteed that after execution of a loop with some other schedule kind 1172 // all the parm3 variables will contain the same value. 1173 // Even if all parm3 will be the same, it still exists a bad case like using 0 and 1 1174 // rather than program life-time increment. 1175 // So the dedicated variable is required. The 'static_steal_counter' is used. 1176 if( schedule == kmp_sch_static_steal ) { 1177 // Other threads will inspect this variable when searching for a victim. 1178 // This is a flag showing that other threads may steal from this thread since then. 1179 volatile T * p = &pr->u.p.static_steal_counter; 1180 *p = *p + 1; 1181 } 1182 } 1183 #endif // ( KMP_STATIC_STEAL_ENABLED && USE_STEALING ) 1184 } 1185 1186 /* 1187 * For ordered loops, either __kmp_dispatch_finish() should be called after 1188 * every iteration, or __kmp_dispatch_finish_chunk() should be called after 1189 * every chunk of iterations. If the ordered section(s) were not executed 1190 * for this iteration (or every iteration in this chunk), we need to set the 1191 * ordered iteration counters so that the next thread can proceed. 1192 */ 1193 template< typename UT > 1194 static void 1195 __kmp_dispatch_finish( int gtid, ident_t *loc ) 1196 { 1197 typedef typename traits_t< UT >::signed_t ST; 1198 kmp_info_t *th = __kmp_threads[ gtid ]; 1199 1200 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) ); 1201 if ( ! 
th -> th.th_team -> t.t_serialized ) { 1202 1203 dispatch_private_info_template< UT > * pr = 1204 reinterpret_cast< dispatch_private_info_template< UT >* > 1205 ( th->th.th_dispatch->th_dispatch_pr_current ); 1206 dispatch_shared_info_template< UT > volatile * sh = 1207 reinterpret_cast< dispatch_shared_info_template< UT >volatile* > 1208 ( th->th.th_dispatch->th_dispatch_sh_current ); 1209 KMP_DEBUG_ASSERT( pr ); 1210 KMP_DEBUG_ASSERT( sh ); 1211 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1212 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1213 1214 if ( pr->ordered_bumped ) { 1215 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1216 gtid ) ); 1217 pr->ordered_bumped = 0; 1218 } else { 1219 UT lower = pr->u.p.ordered_lower; 1220 1221 #ifdef KMP_DEBUG 1222 { 1223 const char * buff; 1224 // create format specifiers before the debug output 1225 buff = __kmp_str_format( 1226 "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n", 1227 traits_t< UT >::spec, traits_t< UT >::spec ); 1228 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 1229 __kmp_str_free( &buff ); 1230 } 1231 #endif 1232 1233 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 1234 USE_ITT_BUILD_ARG(NULL) 1235 ); 1236 KMP_MB(); /* is this necessary? */ 1237 #ifdef KMP_DEBUG 1238 { 1239 const char * buff; 1240 // create format specifiers before the debug output 1241 buff = __kmp_str_format( 1242 "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n", 1243 traits_t< UT >::spec, traits_t< UT >::spec ); 1244 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 1245 __kmp_str_free( &buff ); 1246 } 1247 #endif 1248 1249 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration ); 1250 } // if 1251 } // if 1252 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) ); 1253 } 1254 1255 #ifdef KMP_GOMP_COMPAT 1256 1257 template< typename UT > 1258 static void 1259 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc ) 1260 { 1261 typedef typename traits_t< UT >::signed_t ST; 1262 kmp_info_t *th = __kmp_threads[ gtid ]; 1263 1264 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) ); 1265 if ( ! 
th -> th.th_team -> t.t_serialized ) { 1266 // int cid; 1267 dispatch_private_info_template< UT > * pr = 1268 reinterpret_cast< dispatch_private_info_template< UT >* > 1269 ( th->th.th_dispatch->th_dispatch_pr_current ); 1270 dispatch_shared_info_template< UT > volatile * sh = 1271 reinterpret_cast< dispatch_shared_info_template< UT >volatile* > 1272 ( th->th.th_dispatch->th_dispatch_sh_current ); 1273 KMP_DEBUG_ASSERT( pr ); 1274 KMP_DEBUG_ASSERT( sh ); 1275 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1276 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1277 1278 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) { 1279 UT lower = pr->u.p.ordered_lower; 1280 UT upper = pr->u.p.ordered_upper; 1281 UT inc = upper - lower + 1; 1282 1283 if ( pr->ordered_bumped == inc ) { 1284 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1285 gtid ) ); 1286 pr->ordered_bumped = 0; 1287 } else { 1288 inc -= pr->ordered_bumped; 1289 1290 #ifdef KMP_DEBUG 1291 { 1292 const char * buff; 1293 // create format specifiers before the debug output 1294 buff = __kmp_str_format( 1295 "__kmp_dispatch_finish_chunk: T#%%d before wait: " \ 1296 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n", 1297 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec ); 1298 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) ); 1299 __kmp_str_free( &buff ); 1300 } 1301 #endif 1302 1303 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 1304 USE_ITT_BUILD_ARG(NULL) 1305 ); 1306 1307 KMP_MB(); /* is this necessary? */ 1308 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n", 1309 gtid ) ); 1310 pr->ordered_bumped = 0; 1311 //!!!!! TODO check if the inc should be unsigned, or signed??? 1312 #ifdef KMP_DEBUG 1313 { 1314 const char * buff; 1315 // create format specifiers before the debug output 1316 buff = __kmp_str_format( 1317 "__kmp_dispatch_finish_chunk: T#%%d after wait: " \ 1318 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n", 1319 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec ); 1320 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) ); 1321 __kmp_str_free( &buff ); 1322 } 1323 #endif 1324 1325 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc); 1326 } 1327 // } 1328 } 1329 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) ); 1330 } 1331 1332 #endif /* KMP_GOMP_COMPAT */ 1333 1334 template< typename T > 1335 static int 1336 __kmp_dispatch_next( 1337 ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st 1338 ) { 1339 1340 typedef typename traits_t< T >::unsigned_t UT; 1341 typedef typename traits_t< T >::signed_t ST; 1342 typedef typename traits_t< T >::floating_t DBL; 1343 static const int ___kmp_size_type = sizeof( UT ); 1344 1345 int status; 1346 dispatch_private_info_template< T > * pr; 1347 kmp_info_t * th = __kmp_threads[ gtid ]; 1348 kmp_team_t * team = th -> th.th_team; 1349 1350 KMP_DEBUG_ASSERT( p_last && p_lb && p_ub && p_st ); // AC: these cannot be NULL 1351 #ifdef KMP_DEBUG 1352 { 1353 const char * buff; 1354 // create format specifiers before the debug output 1355 buff = __kmp_str_format( 1356 "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n", 1357 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 1358 KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? 
*p_st : 0, p_last ) ); 1359 __kmp_str_free( &buff ); 1360 } 1361 #endif 1362 1363 if ( team -> t.t_serialized ) { 1364 /* NOTE: serialize this dispatch becase we are not at the active level */ 1365 pr = reinterpret_cast< dispatch_private_info_template< T >* > 1366 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */ 1367 KMP_DEBUG_ASSERT( pr ); 1368 1369 if ( (status = (pr->u.p.tc != 0)) == 0 ) { 1370 *p_lb = 0; 1371 *p_ub = 0; 1372 // if ( p_last != NULL ) 1373 // *p_last = 0; 1374 if ( p_st != NULL ) 1375 *p_st = 0; 1376 if ( __kmp_env_consistency_check ) { 1377 if ( pr->pushed_ws != ct_none ) { 1378 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 1379 } 1380 } 1381 } else if ( pr->nomerge ) { 1382 kmp_int32 last; 1383 T start; 1384 UT limit, trip, init; 1385 ST incr; 1386 T chunk = pr->u.p.parm1; 1387 1388 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) ); 1389 1390 init = chunk * pr->u.p.count++; 1391 trip = pr->u.p.tc - 1; 1392 1393 if ( (status = (init <= trip)) == 0 ) { 1394 *p_lb = 0; 1395 *p_ub = 0; 1396 // if ( p_last != NULL ) 1397 // *p_last = 0; 1398 if ( p_st != NULL ) 1399 *p_st = 0; 1400 if ( __kmp_env_consistency_check ) { 1401 if ( pr->pushed_ws != ct_none ) { 1402 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 1403 } 1404 } 1405 } else { 1406 start = pr->u.p.lb; 1407 limit = chunk + init - 1; 1408 incr = pr->u.p.st; 1409 1410 if ( (last = (limit >= trip)) != 0 ) { 1411 limit = trip; 1412 #if KMP_OS_WINDOWS 1413 pr->u.p.last_upper = pr->u.p.ub; 1414 #endif /* KMP_OS_WINDOWS */ 1415 } 1416 if ( p_last != NULL ) 1417 *p_last = last; 1418 if ( p_st != NULL ) 1419 *p_st = incr; 1420 if ( incr == 1 ) { 1421 *p_lb = start + init; 1422 *p_ub = start + limit; 1423 } else { 1424 *p_lb = start + init * incr; 1425 *p_ub = start + limit * incr; 1426 } 1427 1428 if ( pr->ordered ) { 1429 pr->u.p.ordered_lower = init; 1430 pr->u.p.ordered_upper = limit; 1431 #ifdef KMP_DEBUG 1432 { 1433 const char * buff; 1434 // create format specifiers before the debug output 1435 buff = __kmp_str_format( 1436 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1437 traits_t< UT >::spec, traits_t< UT >::spec ); 1438 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1439 __kmp_str_free( &buff ); 1440 } 1441 #endif 1442 } // if 1443 } // if 1444 } else { 1445 pr->u.p.tc = 0; 1446 *p_lb = pr->u.p.lb; 1447 *p_ub = pr->u.p.ub; 1448 #if KMP_OS_WINDOWS 1449 pr->u.p.last_upper = *p_ub; 1450 #endif /* KMP_OS_WINDOWS */ 1451 if ( p_last != NULL ) 1452 *p_last = TRUE; 1453 if ( p_st != NULL ) 1454 *p_st = pr->u.p.st; 1455 } // if 1456 #ifdef KMP_DEBUG 1457 { 1458 const char * buff; 1459 // create format specifiers before the debug output 1460 buff = __kmp_str_format( 1461 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \ 1462 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", 1463 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 1464 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) ); 1465 __kmp_str_free( &buff ); 1466 } 1467 #endif 1468 #if INCLUDE_SSC_MARKS 1469 SSC_MARK_DISPATCH_NEXT(); 1470 #endif 1471 return status; 1472 } else { 1473 kmp_int32 last = 0; 1474 dispatch_shared_info_template< UT > *sh; 1475 T start; 1476 ST incr; 1477 UT limit, trip, init; 1478 1479 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1480 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1481 1482 pr = reinterpret_cast< 
dispatch_private_info_template< T >* > 1483 ( th->th.th_dispatch->th_dispatch_pr_current ); 1484 KMP_DEBUG_ASSERT( pr ); 1485 sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 1486 ( th->th.th_dispatch->th_dispatch_sh_current ); 1487 KMP_DEBUG_ASSERT( sh ); 1488 1489 if ( pr->u.p.tc == 0 ) { 1490 // zero trip count 1491 status = 0; 1492 } else { 1493 switch (pr->schedule) { 1494 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 1495 case kmp_sch_static_steal: 1496 { 1497 T chunk = pr->u.p.parm1; 1498 1499 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) ); 1500 1501 trip = pr->u.p.tc - 1; 1502 1503 if ( ___kmp_size_type > 4 ) { 1504 // Other threads do not look into the data of this thread, 1505 // so it's not necessary to make volatile casting. 1506 init = ( pr->u.p.count )++; 1507 status = ( init < (UT)pr->u.p.ub ); 1508 } else { 1509 typedef union { 1510 struct { 1511 UT count; 1512 T ub; 1513 } p; 1514 kmp_int64 b; 1515 } union_i4; 1516 // All operations on 'count' or 'ub' must be combined atomically together. 1517 // stealing implemented only for 4-byte indexes 1518 { 1519 union_i4 vold, vnew; 1520 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count); 1521 vnew = vold; 1522 vnew.p.count++; 1523 while( ! KMP_COMPARE_AND_STORE_ACQ64( 1524 ( volatile kmp_int64* )&pr->u.p.count, 1525 *VOLATILE_CAST(kmp_int64 *)&vold.b, 1526 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) { 1527 KMP_CPU_PAUSE(); 1528 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count); 1529 vnew = vold; 1530 vnew.p.count++; 1531 } 1532 vnew = vold; 1533 init = vnew.p.count; 1534 status = ( init < (UT)vnew.p.ub ) ; 1535 } 1536 1537 if( !status ) { 1538 kmp_info_t **other_threads = team->t.t_threads; 1539 int while_limit = 10; 1540 int while_index = 0; 1541 1542 // TODO: algorithm of searching for a victim 1543 // should be cleaned up and measured 1544 while ( ( !status ) && ( while_limit != ++while_index ) ) { 1545 union_i4 vold, vnew; 1546 kmp_int32 remaining; // kmp_int32 because KMP_I4 only 1547 T victimIdx = pr->u.p.parm4; 1548 T oldVictimIdx = victimIdx; 1549 dispatch_private_info_template< T > * victim; 1550 1551 do { 1552 if( !victimIdx ) { 1553 victimIdx = team->t.t_nproc - 1; 1554 } else { 1555 --victimIdx; 1556 } 1557 victim = reinterpret_cast< dispatch_private_info_template< T >* > 1558 ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current ); 1559 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx ); 1560 // TODO: think about a proper place of this test 1561 if ( ( !victim ) || 1562 ( (*( volatile T * )&victim->u.p.static_steal_counter) != 1563 (*( volatile T * )&pr->u.p.static_steal_counter) ) ) { 1564 // TODO: delay would be nice 1565 continue; 1566 // the victim is not ready yet to participate in stealing 1567 // because the victim is still in kmp_init_dispatch 1568 } 1569 if ( oldVictimIdx == victimIdx ) { 1570 break; 1571 } 1572 pr->u.p.parm4 = victimIdx; 1573 1574 while( 1 ) { 1575 vold.b = *( volatile kmp_int64 * )( &victim->u.p.count ); 1576 vnew = vold; 1577 1578 KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip ); 1579 if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) { 1580 break; 1581 } 1582 vnew.p.ub -= (remaining >> 2); 1583 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1584 #pragma warning( push ) 1585 // disable warning on pointless comparison of unsigned with 0 1586 #pragma warning( disable: 186 ) 1587 KMP_DEBUG_ASSERT(vnew.p.ub >= 0); 1588 #pragma warning( pop ) 1589 // TODO: Should 
this be acquire or release? 1590 if ( KMP_COMPARE_AND_STORE_ACQ64( 1591 ( volatile kmp_int64 * )&victim->u.p.count, 1592 *VOLATILE_CAST(kmp_int64 *)&vold.b, 1593 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) { 1594 status = 1; 1595 while_index = 0; 1596 // now update own count and ub 1597 #if KMP_ARCH_X86 1598 // stealing executed on non-KMP_ARCH_X86 only 1599 // Atomic 64-bit write on ia32 is 1600 // unavailable, so we do this in steps. 1601 // This code is not tested. 1602 init = vold.p.count; 1603 pr->u.p.ub = 0; 1604 pr->u.p.count = init + 1; 1605 pr->u.p.ub = vnew.p.count; 1606 #else 1607 init = vnew.p.ub; 1608 vold.p.count = init + 1; 1609 // TODO: is it safe and enough? 1610 *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b; 1611 #endif // KMP_ARCH_X86 1612 break; 1613 } // if 1614 KMP_CPU_PAUSE(); 1615 } // while (1) 1616 } // while 1617 } // if 1618 } // if 1619 if ( !status ) { 1620 *p_lb = 0; 1621 *p_ub = 0; 1622 if ( p_st != NULL ) *p_st = 0; 1623 } else { 1624 start = pr->u.p.parm2; 1625 init *= chunk; 1626 limit = chunk + init - 1; 1627 incr = pr->u.p.st; 1628 1629 KMP_DEBUG_ASSERT(init <= trip); 1630 if ( (last = (limit >= trip)) != 0 ) 1631 limit = trip; 1632 if ( p_st != NULL ) *p_st = incr; 1633 1634 if ( incr == 1 ) { 1635 *p_lb = start + init; 1636 *p_ub = start + limit; 1637 } else { 1638 *p_lb = start + init * incr; 1639 *p_ub = start + limit * incr; 1640 } 1641 1642 if ( pr->ordered ) { 1643 pr->u.p.ordered_lower = init; 1644 pr->u.p.ordered_upper = limit; 1645 #ifdef KMP_DEBUG 1646 { 1647 const char * buff; 1648 // create format specifiers before the debug output 1649 buff = __kmp_str_format( 1650 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1651 traits_t< UT >::spec, traits_t< UT >::spec ); 1652 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1653 __kmp_str_free( &buff ); 1654 } 1655 #endif 1656 } // if 1657 } // if 1658 break; 1659 } // case 1660 #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 1661 case kmp_sch_static_balanced: 1662 { 1663 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) ); 1664 if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */ 1665 pr->u.p.count = 1; 1666 *p_lb = pr->u.p.lb; 1667 *p_ub = pr->u.p.ub; 1668 last = pr->u.p.parm1; 1669 if ( p_st != NULL ) 1670 *p_st = pr->u.p.st; 1671 } else { /* no iterations to do */ 1672 pr->u.p.lb = pr->u.p.ub + pr->u.p.st; 1673 } 1674 if ( pr->ordered ) { 1675 #ifdef KMP_DEBUG 1676 { 1677 const char * buff; 1678 // create format specifiers before the debug output 1679 buff = __kmp_str_format( 1680 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1681 traits_t< UT >::spec, traits_t< UT >::spec ); 1682 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1683 __kmp_str_free( &buff ); 1684 } 1685 #endif 1686 } // if 1687 } // case 1688 break; 1689 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */ 1690 case kmp_sch_static_chunked: 1691 { 1692 T parm1; 1693 1694 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n", 1695 gtid ) ); 1696 parm1 = pr->u.p.parm1; 1697 1698 trip = pr->u.p.tc - 1; 1699 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid)); 1700 1701 if ( (status = (init <= trip)) != 0 ) { 1702 start = pr->u.p.lb; 1703 incr = pr->u.p.st; 1704 limit = parm1 + init - 1; 1705 1706 if ( (last = (limit >= trip)) != 0 ) 1707 limit = trip; 1708 1709 if ( p_st != NULL ) 
*p_st = incr; 1710 1711 pr->u.p.count += team->t.t_nproc; 1712 1713 if ( incr == 1 ) { 1714 *p_lb = start + init; 1715 *p_ub = start + limit; 1716 } 1717 else { 1718 *p_lb = start + init * incr; 1719 *p_ub = start + limit * incr; 1720 } 1721 1722 if ( pr->ordered ) { 1723 pr->u.p.ordered_lower = init; 1724 pr->u.p.ordered_upper = limit; 1725 #ifdef KMP_DEBUG 1726 { 1727 const char * buff; 1728 // create format specifiers before the debug output 1729 buff = __kmp_str_format( 1730 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1731 traits_t< UT >::spec, traits_t< UT >::spec ); 1732 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1733 __kmp_str_free( &buff ); 1734 } 1735 #endif 1736 } // if 1737 } // if 1738 } // case 1739 break; 1740 1741 case kmp_sch_dynamic_chunked: 1742 { 1743 T chunk = pr->u.p.parm1; 1744 1745 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", 1746 gtid ) ); 1747 1748 init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration ); 1749 trip = pr->u.p.tc - 1; 1750 1751 if ( (status = (init <= trip)) == 0 ) { 1752 *p_lb = 0; 1753 *p_ub = 0; 1754 if ( p_st != NULL ) *p_st = 0; 1755 } else { 1756 start = pr->u.p.lb; 1757 limit = chunk + init - 1; 1758 incr = pr->u.p.st; 1759 1760 if ( (last = (limit >= trip)) != 0 ) 1761 limit = trip; 1762 1763 if ( p_st != NULL ) *p_st = incr; 1764 1765 if ( incr == 1 ) { 1766 *p_lb = start + init; 1767 *p_ub = start + limit; 1768 } else { 1769 *p_lb = start + init * incr; 1770 *p_ub = start + limit * incr; 1771 } 1772 1773 if ( pr->ordered ) { 1774 pr->u.p.ordered_lower = init; 1775 pr->u.p.ordered_upper = limit; 1776 #ifdef KMP_DEBUG 1777 { 1778 const char * buff; 1779 // create format specifiers before the debug output 1780 buff = __kmp_str_format( 1781 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1782 traits_t< UT >::spec, traits_t< UT >::spec ); 1783 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1784 __kmp_str_free( &buff ); 1785 } 1786 #endif 1787 } // if 1788 } // if 1789 } // case 1790 break; 1791 1792 case kmp_sch_guided_iterative_chunked: 1793 { 1794 T chunkspec = pr->u.p.parm1; 1795 KD_TRACE(100, 1796 ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid)); 1797 trip = pr->u.p.tc; 1798 // Start atomic part of calculations 1799 while(1) { 1800 ST remaining; // signed, because can be < 0 1801 init = sh->u.s.iteration; // shared value 1802 remaining = trip - init; 1803 if ( remaining <= 0 ) { // AC: need to compare with 0 first 1804 // nothing to do, don't try atomic op 1805 status = 0; 1806 break; 1807 } 1808 if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default 1809 // use dynamic-style shcedule 1810 // atomically inrement iterations, get old value 1811 init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec ); 1812 remaining = trip - init; 1813 if (remaining <= 0) { 1814 status = 0; // all iterations got by other threads 1815 } else { 1816 // got some iterations to work on 1817 status = 1; 1818 if ( (T)remaining > chunkspec ) { 1819 limit = init + chunkspec - 1; 1820 } else { 1821 last = 1; // the last chunk 1822 limit = init + remaining - 1; 1823 } // if 1824 } // if 1825 break; 1826 } // if 1827 limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc 1828 if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) { 1829 // CAS was successful, chunk obtained 1830 status = 1; 

        case kmp_sch_guided_analytical_chunked:
        {
            T  chunkspec = pr->u.p.parm1;
            UT chunkIdx;
            #if KMP_OS_WINDOWS && KMP_ARCH_X86
            /* for storing original FPCW value for Windows* OS on
               IA-32 architecture 8-byte version */
            unsigned int oldFpcw;
            unsigned int fpcwSet = 0;
            #endif
            KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
                           gtid ) );

            trip = pr->u.p.tc;

            KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
            KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);

            while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
                chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
                if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
                    --trip;
                    /* use dynamic-style scheduling */
                    init = chunkIdx * chunkspec + pr->u.p.count;
                    /* need to verify init > 0 in case of overflow in the above calculation */
                    if ( (status = (init > 0 && init <= trip)) != 0 ) {
                        limit = init + chunkspec - 1;

                        if ( (last = (limit >= trip)) != 0 )
                            limit = trip;
                    }
                    break;
                } else {
                    /* use exponential-style scheduling */
                    /* The following check is to work around the lack of long double precision on Windows* OS.
                       This check works around the possible effect that init != 0 for chunkIdx == 0.
                    */
                    #if KMP_OS_WINDOWS && KMP_ARCH_X86
                    /* If we haven't already done so, save original
                       FPCW and set precision to 64-bit, as Windows* OS
                       on IA-32 architecture defaults to 53-bit */
                    if ( !fpcwSet ) {
                        oldFpcw = _control87(0,0);
                        _control87(_PC_64,_MCW_PC);
                        fpcwSet = 0x30000;
                    }
                    #endif
                    if ( chunkIdx ) {
                        init = __kmp_dispatch_guided_remaining< T >(
                                   trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
                        KMP_DEBUG_ASSERT(init);
                        init = trip - init;
                    } else
                        init = 0;
                    limit = trip - __kmp_dispatch_guided_remaining< T >(
                                       trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
                    KMP_ASSERT(init <= limit);
                    if ( init < limit ) {
                        KMP_DEBUG_ASSERT(limit <= trip);
                        --limit;
                        status = 1;
                        break;
                    } // if
                } // if
            } // while (1)
            #if KMP_OS_WINDOWS && KMP_ARCH_X86
            /* restore FPCW if necessary
               AC: check fpcwSet flag first because oldFpcw can be uninitialized here
            */
            if ( fpcwSet && ( oldFpcw & fpcwSet ) )
                _control87(oldFpcw,_MCW_PC);
            #endif
            if ( status != 0 ) {
                start = pr->u.p.lb;
                incr = pr->u.p.st;
                if ( p_st != NULL )
                    *p_st = incr;
                *p_lb = start + init * incr;
                *p_ub = start + limit * incr;
                if ( pr->ordered ) {
                    pr->u.p.ordered_lower = init;
                    pr->u.p.ordered_upper = limit;
                    #ifdef KMP_DEBUG
                    {
                        const char * buff;
                        // create format specifiers before the debug output
                        buff = __kmp_str_format(
                            "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                            traits_t< UT >::spec, traits_t< UT >::spec );
                        KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                        __kmp_str_free( &buff );
                    }
                    #endif
                }
            } else {
                *p_lb = 0;
                *p_ub = 0;
                if ( p_st != NULL )
                    *p_st = 0;
            }
        } // case
        break;

        case kmp_sch_trapezoidal:
        {
            UT index;
            T  parm2 = pr->u.p.parm2;
            T  parm3 = pr->u.p.parm3;
            T  parm4 = pr->u.p.parm4;
            KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
                           gtid ) );

            index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );

            init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
            trip = pr->u.p.tc - 1;

            if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
                *p_lb = 0;
                *p_ub = 0;
                if ( p_st != NULL ) *p_st = 0;
            } else {
                start = pr->u.p.lb;
                limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
                incr  = pr->u.p.st;

                if ( (last = (limit >= trip)) != 0 )
                    limit = trip;

                if ( p_st != NULL ) *p_st = incr;

                if ( incr == 1 ) {
                    *p_lb = start + init;
                    *p_ub = start + limit;
                } else {
                    *p_lb = start + init * incr;
                    *p_ub = start + limit * incr;
                }

                if ( pr->ordered ) {
                    pr->u.p.ordered_lower = init;
                    pr->u.p.ordered_upper = limit;
                    #ifdef KMP_DEBUG
                    {
                        const char * buff;
                        // create format specifiers before the debug output
                        buff = __kmp_str_format(
                            "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                            traits_t< UT >::spec, traits_t< UT >::spec );
                        KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                        __kmp_str_free( &buff );
                    }
                    #endif
                } // if
            } // if
        } // case
        break;
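
        /*
        Illustrative note (not part of the library): the closed forms used above are prefix
        sums of a decreasing arithmetic sequence of chunk sizes
        parm2, parm2 - parm4, parm2 - 2*parm4, ..., so chunk number "index" covers

            init  = sum of the first  index    chunk sizes = ( index * (2*parm2 - (index-1)*parm4) ) / 2
            limit = sum of the first (index+1) chunk sizes - 1

        For example, with parm2 = 10 and parm4 = 2 the chunk sizes are 10, 8, 6, ... and
        chunk index 2 covers iteration offsets 18 .. 23.
        */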
        default:
        {
            status = 0; // to avoid complaints on uninitialized variable use
            __kmp_msg(
                kmp_ms_fatal,                        // Severity
                KMP_MSG( UnknownSchedTypeDetected ), // Primary message
                KMP_HNT( GetNewerLibrary ),          // Hint
                __kmp_msg_null                       // Variadic argument list terminator
            );
        }
        break;
        } // switch
        } // if tc == 0;

        if ( status == 0 ) {
            UT num_done;

            num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
            #ifdef KMP_DEBUG
            {
                const char * buff;
                // create format specifiers before the debug output
                buff = __kmp_str_format(
                    "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
                    traits_t< UT >::spec );
                KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
                __kmp_str_free( &buff );
            }
            #endif

            if ( (ST)num_done == team->t.t_nproc-1 ) {
                /* NOTE: release this buffer to be reused */

                KMP_MB();  /* Flush all pending memory write invalidates.  */

                sh->u.s.num_done = 0;
                sh->u.s.iteration = 0;

                /* TODO replace with general release procedure? */
                if ( pr->ordered ) {
                    sh->u.s.ordered_iteration = 0;
                }

                KMP_MB();  /* Flush all pending memory write invalidates.  */

                sh -> buffer_index += KMP_MAX_DISP_BUF;
                KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
                               gtid, sh->buffer_index) );

                KMP_MB();  /* Flush all pending memory write invalidates.  */

            } // if
            if ( __kmp_env_consistency_check ) {
                if ( pr->pushed_ws != ct_none ) {
                    pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
                }
            }

            th -> th.th_dispatch -> th_deo_fcn = NULL;
            th -> th.th_dispatch -> th_dxo_fcn = NULL;
            th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
            th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
        } // if (status == 0)
        #if KMP_OS_WINDOWS
        else if ( last ) {
            pr->u.p.last_upper = pr->u.p.ub;
        }
        #endif /* KMP_OS_WINDOWS */
        if ( p_last != NULL && status != 0 )
            *p_last = last;
    } // if

    #ifdef KMP_DEBUG
    {
        const char * buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_next: T#%%d normal case: " \
            "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
            traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
        KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
        __kmp_str_free( &buff );
    }
    #endif
    #if INCLUDE_SSC_MARKS
    SSC_MARK_DISPATCH_NEXT();
    #endif
    return status;
}

template< typename T >
static void
__kmp_dist_get_bounds(
    ident_t                          *loc,
    kmp_int32                         gtid,
    kmp_int32                        *plastiter,
    T                                *plower,
    T                                *pupper,
    typename traits_t< T >::signed_t  incr
) {
    KMP_COUNT_BLOCK(OMP_DISTR_FOR_dynamic);
    typedef typename traits_t< T >::unsigned_t  UT;
    typedef typename traits_t< T >::signed_t    ST;
    register kmp_uint32  team_id;
    register kmp_uint32  nteams;
    register UT          trip_count;
    register kmp_team_t *team;
    kmp_info_t * th;

    KMP_DEBUG_ASSERT( plastiter && plower && pupper );
    KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
    #ifdef KMP_DEBUG
    {
        const char * buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
            "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
            traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
            traits_t< T >::spec );
        KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
        __kmp_str_free( &buff );
    }
    #endif

    if( __kmp_env_consistency_check ) {
        if( incr == 0 ) {
            __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
        }
        if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
            // The loop is illegal.
            // Some zero-trip loops are maintained by the compiler, e.g.:
            //   for(i=10;i<0;++i)  // lower >= upper - run-time check
            //   for(i=0;i>10;--i)  // lower <= upper - run-time check
            //   for(i=0;i>10;++i)  // incr > 0       - compile-time check
            //   for(i=10;i<0;--i)  // incr < 0       - compile-time check
            // The compiler does not check the following illegal loops:
            //   for(i=0;i<10;i+=incr)  // where incr<0
            //   for(i=10;i>0;i-=incr)  // where incr<0
            __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
        }
    }
    th = __kmp_threads[gtid];
    KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
    team = th->th.th_team;
    #if OMP_40_ENABLED
    nteams = th->th.th_teams_size.nteams;
    #endif
    team_id = team->t.t_master_tid;
    KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);

    // compute global trip count
    if( incr == 1 ) {
        trip_count = *pupper - *plower + 1;
    } else if(incr == -1) {
        trip_count = *plower - *pupper + 1;
    } else {
        trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
    }
    if( trip_count <= nteams ) {
        KMP_DEBUG_ASSERT(
            __kmp_static == kmp_sch_static_greedy || \
            __kmp_static == kmp_sch_static_balanced
        ); // Unknown static scheduling type.
        // only some teams get single iteration, others get nothing
        if( team_id < trip_count ) {
            *pupper = *plower = *plower + team_id * incr;
        } else {
            *plower = *pupper + incr; // zero-trip loop
        }
        if( plastiter != NULL )
            *plastiter = ( team_id == trip_count - 1 );
    } else {
        if( __kmp_static == kmp_sch_static_balanced ) {
            register UT chunk  = trip_count / nteams;
            register UT extras = trip_count % nteams;
            *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
            *pupper  = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
            if( plastiter != NULL )
                *plastiter = ( team_id == nteams - 1 );
        } else {
            register T chunk_inc_count =
                ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
            register T upper = *pupper;
            KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
                // Unknown static scheduling type.
            *plower += team_id * chunk_inc_count;
            *pupper  = *plower + chunk_inc_count - incr;
            // Check/correct bounds if needed
            if( incr > 0 ) {
                if( *pupper < *plower )
                    *pupper = i_maxmin< T >::mx;
                if( plastiter != NULL )
                    *plastiter = *plower <= upper && *pupper > upper - incr;
                if( *pupper > upper )
                    *pupper = upper; // tracker C73258
            } else {
                if( *pupper > *plower )
                    *pupper = i_maxmin< T >::mn;
                if( plastiter != NULL )
                    *plastiter = *plower >= upper && *pupper < upper - incr;
                if( *pupper < upper )
                    *pupper = upper; // tracker C73258
            }
        }
    }
}
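
/*
Illustrative sketch (not part of the library): how the kmp_sch_static_balanced branch
above splits iterations among teams. Assume trip_count = 10, nteams = 4, incr = 1 and
*plower = 0 on entry. Then chunk = 10/4 = 2 and extras = 10%4 = 2, so the teams receive
3, 3, 2 and 2 iterations respectively:

    team_id 0: lower = 0, upper = 2
    team_id 1: lower = 3, upper = 5
    team_id 2: lower = 6, upper = 7
    team_id 3: lower = 8, upper = 9

and *plastiter is set only for team_id == nteams - 1 == 3.
*/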

//-----------------------------------------------------------------------------------------
// Dispatch routines
// Transfer call to template< typename T >
// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
//                      T lb, T ub, ST st, ST chunk )
extern "C" {

/*!
@ingroup WORK_SHARING
@{
@param loc Source location
@param gtid Global thread id
@param schedule Schedule type
@param lb Lower bound
@param ub Upper bound
@param st Step (or increment if you prefer)
@param chunk The chunk size to block with

This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
These functions are all identical apart from the types of the arguments.
*/

void
__kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                        kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
{
    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}
/*!
See @ref __kmpc_dispatch_init_4
*/
void
__kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                         kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
{
    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void
__kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                        kmp_int64 lb, kmp_int64 ub,
                        kmp_int64 st, kmp_int64 chunk )
{
    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void
__kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                         kmp_uint64 lb, kmp_uint64 ub,
                         kmp_int64 st, kmp_int64 chunk )
{
    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

/*!
See @ref __kmpc_dispatch_init_4

The difference from the __kmpc_dispatch_init set of functions is that these functions
are called for the composite distribute parallel for construct. Thus, before dispatching
the regular iterations, we need to calculate the per-team iteration space.

These functions are all identical apart from the types of the arguments.
*/
void
__kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
    kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
{
    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
    __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

void
__kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
    kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
{
    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
    __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

void
__kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
    kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
{
    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
    __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

void
__kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
    kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
{
    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
    __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

/*!
@param loc Source code location
@param gtid Global thread id
@param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
@param p_lb Pointer to the lower bound for the next chunk of work
@param p_ub Pointer to the upper bound for the next chunk of work
@param p_st Pointer to the stride for the next chunk of work
@return one if there is work to be done, zero otherwise

Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then the lb, ub and stride need not be modified.
*/
int
__kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                        kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
{
    return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int
__kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                         kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
{
    return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

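/*
Illustrative sketch (not part of the library, and not compiled): roughly the call sequence
a compiler might emit for a loop such as "#pragma omp for schedule(dynamic, 4)" over i in
[lo, hi] with step 1, using the 32-bit signed entry points above. The names lo, hi and
body() are hypothetical; loc and gtid are supplied by the compiler/runtime.

    __kmpc_dispatch_init_4( loc, gtid, kmp_sch_dynamic_chunked, lo, hi, 1, 4 );
    {
        kmp_int32 lb, ub, st, last;
        while ( __kmpc_dispatch_next_4( loc, gtid, &last, &lb, &ub, &st ) ) {
            for ( kmp_int32 i = lb; i <= ub; i += st )
                body( i );
        }
    }
*/
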
/*!
See @ref __kmpc_dispatch_next_4
*/
int
__kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                        kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
{
    return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int
__kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                         kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
{
    return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

/*!
@param loc Source code location
@param gtid Global thread id

Mark the end of a dynamic loop.
*/
void
__kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void
__kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void
__kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void
__kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
}
/*! @} */

//-----------------------------------------------------------------------------------------
// Non-template routines from kmp_dispatch.c used in other sources

kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
    return value == checker;
}

kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
    return value != checker;
}

kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
    return value < checker;
}

kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
    return value >= checker;
}

kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
    return value <= checker;
}
kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
    return value == checker;
}

kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
    return value != checker;
}

kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
    return value < checker;
}

kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
    return value >= checker;
}

kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
    return value <= checker;
}

kmp_uint32
__kmp_wait_yield_4(volatile kmp_uint32 * spinner,
                   kmp_uint32            checker,
                   kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
                   , void * obj    // Higher-level synchronization object, or NULL.
                   )
{
    // note: we may not belong to a team at this point
    register volatile kmp_uint32 * spin  = spinner;
    register kmp_uint32            check = checker;
    register kmp_uint32            spins;
    register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
    register kmp_uint32            r;

    KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
    KMP_INIT_YIELD( spins );
    // main wait spin loop
    while(!f(r = TCR_4(*spin), check)) {
        KMP_FSYNC_SPIN_PREPARE( obj );
        /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
           It causes problems with infinite recursion because of exit lock */
        /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
            __kmp_abort_thread(); */

        /* if we have waited a bit, or are oversubscribed, yield */
        /* pause is in the following code */
        KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
        KMP_YIELD_SPIN( spins );
    }
    KMP_FSYNC_SPIN_ACQUIRED( obj );
    return r;
}
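
/*
Illustrative sketch (not part of the library, and not compiled): a typical spin-wait on a
32-bit flag using the helper above, passing one of the predicate routines defined in this
file. The variable "flag" is hypothetical; NULL is passed because no higher-level
synchronization object is being traced.

    volatile kmp_uint32 flag = 0;
    // ... some other thread eventually stores 1 into flag ...
    kmp_uint32 seen = __kmp_wait_yield_4( &flag, 1, __kmp_eq_4, NULL );
    // seen now holds the observed value for which __kmp_eq_4( seen, 1 ) was true
*/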

kmp_uint64
__kmp_wait_yield_8( volatile kmp_uint64 * spinner,
                    kmp_uint64            checker,
                    kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
                    , void * obj    // Higher-level synchronization object, or NULL.
                    )
{
    // note: we may not belong to a team at this point
    register volatile kmp_uint64 * spin  = spinner;
    register kmp_uint64            check = checker;
    register kmp_uint32            spins;
    register kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
    register kmp_uint64            r;

    KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
    KMP_INIT_YIELD( spins );
    // main wait spin loop
    while(!f(r = *spin, check))
    {
        KMP_FSYNC_SPIN_PREPARE( obj );
        /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
           It causes problems with infinite recursion because of exit lock */
        /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
            __kmp_abort_thread(); */

        // if we are oversubscribed,
        // or have waited a bit (and KMP_LIBRARY=throughput), then yield
        // pause is in the following code
        KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
        KMP_YIELD_SPIN( spins );
    }
    KMP_FSYNC_SPIN_ACQUIRED( obj );
    return r;
}

} // extern "C"

#ifdef KMP_GOMP_COMPAT

void
__kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                           kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
                           kmp_int32 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
                                      push_ws );
}

void
__kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                            kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
                            kmp_int32 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
                                       push_ws );
}

void
__kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                           kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
                           kmp_int64 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
                                      push_ws );
}

void
__kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                            kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
                            kmp_int64 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
                                       push_ws );
}

void
__kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
}

void
__kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
}

void
__kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
}

void
__kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
}

#endif /* KMP_GOMP_COMPAT */

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
