/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 * $Revision: 42624 $
 * $Date: 2013-08-27 10:53:11 -0500 (Tue, 27 Aug 2013) $
 */


//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


/*
 * Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
 *       it may change values between parallel regions.  __kmp_max_nth
 *       is the largest value __kmp_nth may take, 1 is the smallest.
 */

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_str.h"
#include "kmp_error.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
    #include <float.h>
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#ifdef KMP_STATIC_STEAL_ENABLED

// replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
template< typename T >
struct dispatch_private_infoXX_template {
    typedef typename traits_t< T >::unsigned_t  UT;
    typedef typename traits_t< T >::signed_t    ST;
    UT count;   // unsigned
    T  ub;
    /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
    T  lb;
    ST st;      // signed
    UT tc;      // unsigned
    T  static_steal_counter;  // for static_steal only; maybe better to put after ub

    /* parm[1-4] are used in different ways by different scheduling algorithms */

    // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
    //     a) parm3 is properly aligned and
    //     b) all parm1-4 are in the same cache line.
    // Because parm1-4 are used together, performance seems to be better
    // if they are in the same cache line (not measured, though).
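
    // Illustrative only (kept out of the build): a layout check one could put in
    // a unit test to confirm the intent of the KMP_ALIGN( 32 ) block below.
    // Assuming 64-byte cache lines, sizeof(T) <= 8, and the enclosing structure
    // being KMP_ALIGN_CACHE-aligned, a 32-byte aligned block holding four fields
    // of at most 8 bytes each cannot straddle a cache-line boundary. The helper
    // name below is hypothetical and exists only for this sketch.
#if 0
    #include <cstddef>  // offsetof
    template< typename T >
    static void __check_parm_cache_line_sketch() {
        typedef dispatch_private_infoXX_template< T > PI;
        KMP_BUILD_ASSERT( offsetof( PI, parm1 ) % 32 == 0 );
        KMP_BUILD_ASSERT( offsetof( PI, parm4 ) + sizeof( T )
                          - offsetof( PI, parm1 ) <= 32 );
    }
#endif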
    struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
        T  parm1;
        T  parm2;
        T  parm3;
        T  parm4;
    };

    UT ordered_lower;   // unsigned
    UT ordered_upper;   // unsigned
#if KMP_OS_WINDOWS
    T  last_upper;
#endif /* KMP_OS_WINDOWS */
};

#else /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
template< typename T >
struct dispatch_private_infoXX_template {
    typedef typename traits_t< T >::unsigned_t  UT;
    typedef typename traits_t< T >::signed_t    ST;
    T  lb;
    T  ub;
    ST st;      // signed
    UT tc;      // unsigned

    T  parm1;
    T  parm2;
    T  parm3;
    T  parm4;

    UT count;   // unsigned

    UT ordered_lower;   // unsigned
    UT ordered_upper;   // unsigned
#if KMP_OS_WINDOWS
    T  last_upper;
#endif /* KMP_OS_WINDOWS */
};

#endif /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info structure and dispatch_private_info_t type
template< typename T >
struct KMP_ALIGN_CACHE dispatch_private_info_template {
    // duplicate alignment here, otherwise size of structure is not correct in our compiler
    union KMP_ALIGN_CACHE private_info_tmpl {
        dispatch_private_infoXX_template< T > p;
        dispatch_private_info64_t             p64;
    } u;
    enum sched_type schedule;   /* scheduling algorithm */
    kmp_uint32      ordered;    /* ordered clause specified */
    kmp_uint32      ordered_bumped;
    kmp_int32       ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
    dispatch_private_info * next; /* stack of buffers for nest of serial regions */
    kmp_uint32      nomerge;    /* don't merge iters if serialized */
    kmp_uint32      type_size;
    enum cons_type  pushed_ws;
};


// replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
template< typename UT >
struct dispatch_shared_infoXX_template {
    /* chunk index under dynamic, number of idle threads under static-steal;
       iteration index otherwise */
    volatile UT iteration;
    volatile UT num_done;
    volatile UT ordered_iteration;
    UT ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar
};

// replaces dispatch_shared_info structure and dispatch_shared_info_t type
template< typename UT >
struct dispatch_shared_info_template {
    // we need union here to keep the structure size
    union shared_info_tmpl {
        dispatch_shared_infoXX_template< UT > s;
        dispatch_shared_info64_t              s64;
    } u;
    volatile kmp_uint32 buffer_index;
};

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

static void
__kmp_static_delay( int arg )
{
    /* Work around weird code-gen bug that causes assert to trip */
    #if KMP_ARCH_X86_64 && KMP_OS_LINUX
    #else
        KMP_ASSERT( arg >= 0 );
    #endif
}

static void
__kmp_static_yield( int arg )
{
    __kmp_yield( arg );
}

#undef USE_TEST_LOCKS

// test_then_add template (general template should NOT be used)
template< typename T >
static __forceinline T
test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };

template<>
__forceinline kmp_int32
test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
{
    kmp_int32 r;
    r = KMP_TEST_THEN_ADD32( p, d );
    return r;
}

template<>
__forceinline
kmp_int64
test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
{
    kmp_int64 r;
    r = KMP_TEST_THEN_ADD64( p, d );
    return r;
}

// test_then_inc_acq template (general template should NOT be used)
template< typename T >
static __forceinline T
test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };

template<>
__forceinline kmp_int32
test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
{
    kmp_int32 r;
    r = KMP_TEST_THEN_INC_ACQ32( p );
    return r;
}

template<>
__forceinline kmp_int64
test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
{
    kmp_int64 r;
    r = KMP_TEST_THEN_INC_ACQ64( p );
    return r;
}

// test_then_inc template (general template should NOT be used)
template< typename T >
static __forceinline T
test_then_inc( volatile T *p ) { KMP_ASSERT(0); };

template<>
__forceinline kmp_int32
test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
{
    kmp_int32 r;
    r = KMP_TEST_THEN_INC32( p );
    return r;
}

template<>
__forceinline kmp_int64
test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
{
    kmp_int64 r;
    r = KMP_TEST_THEN_INC64( p );
    return r;
}

// compare_and_swap template (general template should NOT be used)
template< typename T >
static __forceinline kmp_int32
compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };

template<>
__forceinline kmp_int32
compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
{
    return KMP_COMPARE_AND_STORE_REL32( p, c, s );
}

template<>
__forceinline kmp_int32
compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
{
    return KMP_COMPARE_AND_STORE_REL64( p, c, s );
}

/*
    Spin wait loop that first does pause, then yield.
    Waits until function returns non-zero when called with *spinner and check.
    Does NOT put threads to sleep.
#if USE_ITT_BUILD
    Arguments:
        obj -- is a higher-level synchronization object to report to ittnotify. It is used to report
            locks consistently. For example, if a lock is acquired immediately, its address is
            reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
            immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the
            same address, not the address of the low-level spinner.
#endif // USE_ITT_BUILD
*/
template< typename UT >
// ToDo: make inline function (move to header file for icl)
static UT  // unsigned 4- or 8-byte type
__kmp_wait_yield( volatile UT * spinner,
                  UT            checker,
                  kmp_uint32 (* pred)( UT, UT )
                  USE_ITT_BUILD_ARG(void * obj)  // Higher-level synchronization object, or NULL.
                  )
{
    // note: we may not belong to a team at this point
    register volatile UT       * spin  = spinner;
    register          UT         check = checker;
    register          kmp_uint32 spins;
    register          kmp_uint32 (*f) ( UT, UT ) = pred;
    register          UT         r;

    KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
    KMP_INIT_YIELD( spins );
    // main wait spin loop
    while(!f(r = *spin, check))
    {
        KMP_FSYNC_SPIN_PREPARE( obj );
        /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
           It causes problems with infinite recursion because of exit lock */
        /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
            __kmp_abort_thread(); */

        __kmp_static_delay(TRUE);

        // if we are oversubscribed,
        // or have waited a bit (and KMP_LIBRARY=throughput), then yield
        // pause is in the following code
        KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
        KMP_YIELD_SPIN( spins );
    }
    KMP_FSYNC_SPIN_ACQUIRED( obj );
    return r;
}

template< typename UT >
static kmp_uint32 __kmp_eq( UT value, UT checker) {
    return value == checker;
}

template< typename UT >
static kmp_uint32 __kmp_neq( UT value, UT checker) {
    return value != checker;
}

template< typename UT >
static kmp_uint32 __kmp_lt( UT value, UT checker) {
    return value < checker;
}

template< typename UT >
static kmp_uint32 __kmp_ge( UT value, UT checker) {
    return value >= checker;
}

template< typename UT >
static kmp_uint32 __kmp_le( UT value, UT checker) {
    return value <= checker;
}


/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

static void
__kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
{
    kmp_info_t *th;

    KMP_DEBUG_ASSERT( gtid_ref );

    if ( __kmp_env_consistency_check ) {
        th = __kmp_threads[*gtid_ref];
        if ( th -> th.th_root -> r.r_active
          && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
            __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
        }
    }
}

template< typename UT >
static void
__kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
{
    typedef typename traits_t< UT >::signed_t ST;
    dispatch_private_info_template< UT > * pr;

    int gtid = *gtid_ref;
    // int  cid = *cid_ref;
    kmp_info_t *th = __kmp_threads[ gtid ];
    KMP_DEBUG_ASSERT( th -> th.th_dispatch );

    KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
    if ( __kmp_env_consistency_check ) {
        pr = reinterpret_cast< dispatch_private_info_template< UT >* >
            ( th -> th.th_dispatch -> th_dispatch_pr_current );
        if ( pr -> pushed_ws != ct_none ) {
            __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
        }
    }

    if ( ! th -> th.th_team -> t.t_serialized ) {
        dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
            ( th -> th.th_dispatch -> th_dispatch_sh_current );
        UT lower;

        if ( ! __kmp_env_consistency_check ) {
            pr = reinterpret_cast< dispatch_private_info_template< UT >* >
                ( th -> th.th_dispatch -> th_dispatch_pr_current );
        }
        lower = pr->u.p.ordered_lower;

        #if ! defined( KMP_GOMP_COMPAT )
        if ( __kmp_env_consistency_check ) {
            if ( pr->ordered_bumped ) {
                struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
                __kmp_error_construct2(
                    kmp_i18n_msg_CnsMultipleNesting,
                    ct_ordered_in_pdo, loc_ref,
                    & p->stack_data[ p->w_top ]
                );
            }
        }
        #endif /* !defined(KMP_GOMP_COMPAT) */

        KMP_MB();
        #ifdef KMP_DEBUG
        {
            const char * buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format(
                "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
                traits_t< UT >::spec, traits_t< UT >::spec );
            KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
            __kmp_str_free( &buff );
        }
        #endif

        __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
                                USE_ITT_BUILD_ARG( NULL )
                                );
        KMP_MB();  /* is this necessary? */
        #ifdef KMP_DEBUG
        {
            const char * buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format(
                "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
                traits_t< UT >::spec, traits_t< UT >::spec );
            KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
            __kmp_str_free( &buff );
        }
        #endif
    }
    KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
}

static void
__kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
{
    kmp_info_t *th;

    if ( __kmp_env_consistency_check ) {
        th = __kmp_threads[*gtid_ref];
        if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
            __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
        }
    }
}

template< typename UT >
static void
__kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
{
    typedef typename traits_t< UT >::signed_t ST;
    dispatch_private_info_template< UT > * pr;

    int gtid = *gtid_ref;
    // int  cid = *cid_ref;
    kmp_info_t *th = __kmp_threads[ gtid ];
    KMP_DEBUG_ASSERT( th -> th.th_dispatch );

    KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
    if ( __kmp_env_consistency_check ) {
        pr = reinterpret_cast< dispatch_private_info_template< UT >* >
            ( th -> th.th_dispatch -> th_dispatch_pr_current );
        if ( pr -> pushed_ws != ct_none ) {
            __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
        }
    }

    if ( ! th -> th.th_team -> t.t_serialized ) {
        dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
            ( th -> th.th_dispatch -> th_dispatch_sh_current );

        if ( ! __kmp_env_consistency_check ) {
            pr = reinterpret_cast< dispatch_private_info_template< UT >* >
                ( th -> th.th_dispatch -> th_dispatch_pr_current );
        }

        KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
        #if ! defined( KMP_GOMP_COMPAT )
        if ( __kmp_env_consistency_check ) {
            if ( pr->ordered_bumped != 0 ) {
                struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
                /* How to test it? - OM */
                __kmp_error_construct2(
                    kmp_i18n_msg_CnsMultipleNesting,
                    ct_ordered_in_pdo, loc_ref,
                    & p->stack_data[ p->w_top ]
                );
            }
        }
        #endif /* !defined(KMP_GOMP_COMPAT) */

        KMP_MB();  /* Flush all pending memory write invalidates. */

        pr->ordered_bumped += 1;

        KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
                        gtid, pr->ordered_bumped ) );

        KMP_MB();  /* Flush all pending memory write invalidates. */

        /* TODO use general release procedure? */
        test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );

        KMP_MB();  /* Flush all pending memory write invalidates. */
    }
    KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
}

/* Computes and returns x to the power of y, where y must be a non-negative integer */
template< typename UT >
static __forceinline long double
__kmp_pow(long double x, UT y) {
    long double s = 1.0L;

    KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
    //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
    while(y) {
        if ( y & 1 )
            s *= x;
        x *= x;
        y >>= 1;
    }
    return s;
}

/* Computes and returns the number of unassigned iterations after idx chunks have been assigned
   (the total number of unassigned iterations in chunks with index greater than or equal to idx).
   __forceinline seems to be broken, so that if we __forceinline this function the behavior is wrong
   (one of the unit tests, sch_guided_analytical_basic.cpp, fails).
*/
template< typename T >
static __inline typename traits_t< T >::unsigned_t
__kmp_dispatch_guided_remaining(
    T                                  tc,
    typename traits_t< T >::floating_t base,
    typename traits_t< T >::unsigned_t idx
) {
    /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
       least for ICL 8.1, long double arithmetic may not really have
       long double precision, even with /Qlong_double. Currently, we
       work around that in the caller code, by manipulating the FPCW for
       Windows* OS on IA-32 architecture. The lack of precision is not
       expected to be a correctness issue, though.
    */
    typedef typename traits_t< T >::unsigned_t UT;

    long double x = tc * __kmp_pow< UT >(base, idx);
    UT r = (UT) x;
    if ( x == r )
        return r;
    return r + 1;
}

// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
//   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// By default n = 2. For example, with n = 3 the chunk distribution will be flatter.
// With n = 1 the first chunk is the same as for a static schedule, i.e. trip / nproc.
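
// The sketch below (illustrative only, kept out of the build) shows how the two
// parameters above shape chunk sizes for guided-iterative scheduling: while at
// least p2 = n*nproc*(chunk+1) iterations remain, each grab takes
// remaining * p3 = remaining / (n*nproc) iterations; once fewer than p2 remain,
// the schedule degenerates into plain dynamic with the user chunk size. The
// stand-alone function and its name are assumptions made for this sketch; in
// the library the same arithmetic is done in __kmp_dispatch_next() on the
// shared iteration counter.
#if 0
#include <cstdio>

static void
__kmp_example_guided_iterative_chunks( long trip, int nproc, long chunk, int n )
{
    long   remaining = trip;
    long   p2 = (long)n * nproc * ( chunk + 1 );  // switch point to dynamic
    double p3 = 1.0 / ( (double)n * nproc );      // remaining-iterations multiplier
    while ( remaining > 0 ) {
        long this_chunk;
        if ( remaining < p2 ) {
            this_chunk = ( remaining < chunk ? remaining : chunk );  // dynamic tail
        } else {
            this_chunk = (long)( remaining * p3 );                   // guided part
        }
        printf( "grab %ld iterations, %ld remaining before the grab\n", this_chunk, remaining );
        remaining -= this_chunk;
    }
}
#endif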
556 static int guided_int_param = 2; 557 static double guided_flt_param = 0.5;// = 1.0 / guided_int_param; 558 559 // UT - unsigned flavor of T, ST - signed flavor of T, 560 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8 561 template< typename T > 562 static void 563 __kmp_dispatch_init( 564 ident_t * loc, 565 int gtid, 566 enum sched_type schedule, 567 T lb, 568 T ub, 569 typename traits_t< T >::signed_t st, 570 typename traits_t< T >::signed_t chunk, 571 int push_ws 572 ) { 573 typedef typename traits_t< T >::unsigned_t UT; 574 typedef typename traits_t< T >::signed_t ST; 575 typedef typename traits_t< T >::floating_t DBL; 576 static const int ___kmp_size_type = sizeof( UT ); 577 578 int active; 579 T tc; 580 kmp_info_t * th; 581 kmp_team_t * team; 582 kmp_uint32 my_buffer_index; 583 dispatch_private_info_template< T > * pr; 584 dispatch_shared_info_template< UT > volatile * sh; 585 586 KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) ); 587 KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) ); 588 589 if ( ! TCR_4( __kmp_init_parallel ) ) 590 __kmp_parallel_initialize(); 591 592 #ifdef KMP_DEBUG 593 { 594 const char * buff; 595 // create format specifiers before the debug output 596 buff = __kmp_str_format( 597 "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n", 598 traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 599 KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) ); 600 __kmp_str_free( &buff ); 601 } 602 #endif 603 /* setup data */ 604 th = __kmp_threads[ gtid ]; 605 team = th -> th.th_team; 606 active = ! team -> t.t_serialized; 607 th->th.th_ident = loc; 608 609 if ( ! active ) { 610 pr = reinterpret_cast< dispatch_private_info_template< T >* > 611 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */ 612 } else { 613 KMP_DEBUG_ASSERT( th->th.th_dispatch == 614 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 615 616 my_buffer_index = th->th.th_dispatch->th_disp_index ++; 617 618 /* What happens when number of threads changes, need to resize buffer? 
*/ 619 pr = reinterpret_cast< dispatch_private_info_template< T > * > 620 ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] ); 621 sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * > 622 ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] ); 623 } 624 625 /* Pick up the nomerge/ordered bits from the scheduling type */ 626 if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) { 627 pr->nomerge = TRUE; 628 schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); 629 } else { 630 pr->nomerge = FALSE; 631 } 632 pr->type_size = ___kmp_size_type; // remember the size of variables 633 if ( kmp_ord_lower & schedule ) { 634 pr->ordered = TRUE; 635 schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); 636 } else { 637 pr->ordered = FALSE; 638 } 639 if ( schedule == kmp_sch_static ) { 640 schedule = __kmp_static; 641 } else { 642 if ( schedule == kmp_sch_runtime ) { 643 #if OMP_30_ENABLED 644 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified) 645 schedule = team -> t.t_sched.r_sched_type; 646 // Detail the schedule if needed (global controls are differentiated appropriately) 647 if ( schedule == kmp_sch_guided_chunked ) { 648 schedule = __kmp_guided; 649 } else if ( schedule == kmp_sch_static ) { 650 schedule = __kmp_static; 651 } 652 // Use the chunk size specified by OMP_SCHEDULE (or default if not specified) 653 chunk = team -> t.t_sched.chunk; 654 #else 655 kmp_r_sched_t r_sched = __kmp_get_schedule_global(); 656 // Use the scheduling specified by OMP_SCHEDULE and/or KMP_SCHEDULE or default 657 schedule = r_sched.r_sched_type; 658 chunk = r_sched.chunk; 659 #endif 660 661 #ifdef KMP_DEBUG 662 { 663 const char * buff; 664 // create format specifiers before the debug output 665 buff = __kmp_str_format( 666 "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n", 667 traits_t< ST >::spec ); 668 KD_TRACE(10, ( buff, gtid, schedule, chunk ) ); 669 __kmp_str_free( &buff ); 670 } 671 #endif 672 } else { 673 if ( schedule == kmp_sch_guided_chunked ) { 674 schedule = __kmp_guided; 675 } 676 if ( chunk <= 0 ) { 677 chunk = KMP_DEFAULT_CHUNK; 678 } 679 } 680 681 #if OMP_30_ENABLED 682 if ( schedule == kmp_sch_auto ) { 683 // mapping and differentiation: in the __kmp_do_serial_initialize() 684 schedule = __kmp_auto; 685 #ifdef KMP_DEBUG 686 { 687 const char * buff; 688 // create format specifiers before the debug output 689 buff = __kmp_str_format( 690 "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n", 691 traits_t< ST >::spec ); 692 KD_TRACE(10, ( buff, gtid, schedule, chunk ) ); 693 __kmp_str_free( &buff ); 694 } 695 #endif 696 } 697 #endif // OMP_30_ENABLED 698 699 /* guided analytical not safe for too many threads */ 700 if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) { 701 schedule = kmp_sch_guided_iterative_chunked; 702 KMP_WARNING( DispatchManyThreads ); 703 } 704 pr->u.p.parm1 = chunk; 705 } 706 KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper), 707 "unknown scheduling type" ); 708 709 pr->u.p.count = 0; 710 711 if ( __kmp_env_consistency_check ) { 712 if ( st == 0 ) { 713 __kmp_error_construct( 714 kmp_i18n_msg_CnsLoopIncrZeroProhibited, 715 ( pr->ordered ? 
ct_pdo_ordered : ct_pdo ), loc 716 ); 717 } 718 } 719 720 tc = ( ub - lb + st ); 721 if ( st != 1 ) { 722 if ( st < 0 ) { 723 if ( lb < ub ) { 724 tc = 0; // zero-trip 725 } else { // lb >= ub 726 tc = (ST)tc / st; // convert to signed division 727 } 728 } else { // st > 0 729 if ( ub < lb ) { 730 tc = 0; // zero-trip 731 } else { // lb >= ub 732 tc /= st; 733 } 734 } 735 } else if ( ub < lb ) { // st == 1 736 tc = 0; // zero-trip 737 } 738 739 pr->u.p.lb = lb; 740 pr->u.p.ub = ub; 741 pr->u.p.st = st; 742 pr->u.p.tc = tc; 743 744 #if KMP_OS_WINDOWS 745 pr->u.p.last_upper = ub + st; 746 #endif /* KMP_OS_WINDOWS */ 747 748 /* NOTE: only the active parallel region(s) has active ordered sections */ 749 750 if ( active ) { 751 if ( pr->ordered == 0 ) { 752 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error; 753 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error; 754 } else { 755 pr->ordered_bumped = 0; 756 757 pr->u.p.ordered_lower = 1; 758 pr->u.p.ordered_upper = 0; 759 760 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >; 761 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >; 762 } 763 } 764 765 if ( __kmp_env_consistency_check ) { 766 enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo; 767 if ( push_ws ) { 768 __kmp_push_workshare( gtid, ws, loc ); 769 pr->pushed_ws = ws; 770 } else { 771 __kmp_check_workshare( gtid, ws, loc ); 772 pr->pushed_ws = ct_none; 773 } 774 } 775 776 switch ( schedule ) { 777 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 778 case kmp_sch_static_steal: 779 { 780 T nproc = team->t.t_nproc; 781 T ntc, init; 782 783 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) ); 784 785 ntc = (tc % chunk ? 1 : 0) + tc / chunk; 786 if ( nproc > 1 && ntc >= nproc ) { 787 T id = __kmp_tid_from_gtid(gtid); 788 T small_chunk, extras; 789 790 small_chunk = ntc / nproc; 791 extras = ntc % nproc; 792 793 init = id * small_chunk + ( id < extras ? id : extras ); 794 pr->u.p.count = init; 795 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 ); 796 797 pr->u.p.parm2 = lb; 798 //pr->pfields.parm3 = 0; // it's not used in static_steal 799 pr->u.p.parm4 = id; 800 pr->u.p.st = st; 801 break; 802 } else { 803 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n", 804 gtid ) ); 805 schedule = kmp_sch_static_balanced; 806 /* too few iterations: fall-through to kmp_sch_static_balanced */ 807 } // if 808 /* FALL-THROUGH to static balanced */ 809 } // case 810 #endif 811 case kmp_sch_static_balanced: 812 { 813 T nproc = team->t.t_nproc; 814 T init, limit; 815 816 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n", 817 gtid ) ); 818 819 if ( nproc > 1 ) { 820 T id = __kmp_tid_from_gtid(gtid); 821 822 if ( tc < nproc ) { 823 if ( id < tc ) { 824 init = id; 825 limit = id; 826 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ 827 } else { 828 pr->u.p.count = 1; /* means no more chunks to execute */ 829 pr->u.p.parm1 = FALSE; 830 break; 831 } 832 } else { 833 T small_chunk = tc / nproc; 834 T extras = tc % nproc; 835 init = id * small_chunk + (id < extras ? id : extras); 836 limit = init + small_chunk - (id < extras ? 
0 : 1); 837 pr->u.p.parm1 = (id == nproc - 1); 838 } 839 } else { 840 if ( tc > 0 ) { 841 init = 0; 842 limit = tc - 1; 843 pr->u.p.parm1 = TRUE; 844 } else { 845 // zero trip count 846 pr->u.p.count = 1; /* means no more chunks to execute */ 847 pr->u.p.parm1 = FALSE; 848 break; 849 } 850 } 851 if ( st == 1 ) { 852 pr->u.p.lb = lb + init; 853 pr->u.p.ub = lb + limit; 854 } else { 855 T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound 856 pr->u.p.lb = lb + init * st; 857 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly 858 if ( st > 0 ) { 859 pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp ); 860 } else { 861 pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp ); 862 } 863 } 864 if ( pr->ordered ) { 865 pr->u.p.ordered_lower = init; 866 pr->u.p.ordered_upper = limit; 867 } 868 break; 869 } // case 870 case kmp_sch_guided_iterative_chunked : 871 { 872 T nproc = team->t.t_nproc; 873 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid)); 874 875 if ( nproc > 1 ) { 876 if ( (2L * chunk + 1 ) * nproc >= tc ) { 877 /* chunk size too large, switch to dynamic */ 878 schedule = kmp_sch_dynamic_chunked; 879 } else { 880 // when remaining iters become less than parm2 - switch to dynamic 881 pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 ); 882 *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4 883 } 884 } else { 885 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid)); 886 schedule = kmp_sch_static_greedy; 887 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 888 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid)); 889 pr->u.p.parm1 = tc; 890 } // if 891 } // case 892 break; 893 case kmp_sch_guided_analytical_chunked: 894 { 895 T nproc = team->t.t_nproc; 896 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid)); 897 898 if ( nproc > 1 ) { 899 if ( (2L * chunk + 1 ) * nproc >= tc ) { 900 /* chunk size too large, switch to dynamic */ 901 schedule = kmp_sch_dynamic_chunked; 902 } else { 903 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 904 DBL x; 905 906 #if KMP_OS_WINDOWS && KMP_ARCH_X86 907 /* Linux* OS already has 64-bit computation by default for 908 long double, and on Windows* OS on Intel(R) 64, 909 /Qlong_double doesn't work. On Windows* OS 910 on IA-32 architecture, we need to set precision to 911 64-bit instead of the default 53-bit. Even though long 912 double doesn't work on Windows* OS on Intel(R) 64, the 913 resulting lack of precision is not expected to impact 914 the correctness of the algorithm, but this has not been 915 mathematically proven. 
916 */ 917 // save original FPCW and set precision to 64-bit, as 918 // Windows* OS on IA-32 architecture defaults to 53-bit 919 unsigned int oldFpcw = _control87(0,0x30000); 920 #endif 921 /* value used for comparison in solver for cross-over point */ 922 long double target = ((long double)chunk * 2 + 1) * nproc / tc; 923 924 /* crossover point--chunk indexes equal to or greater than 925 this point switch to dynamic-style scheduling */ 926 UT cross; 927 928 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 929 x = (long double)1.0 - (long double)0.5 / nproc; 930 931 #ifdef KMP_DEBUG 932 { // test natural alignment 933 struct _test_a { 934 char a; 935 union { 936 char b; 937 DBL d; 938 }; 939 } t; 940 ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; 941 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment ); 942 KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 ); 943 } 944 #endif // KMP_DEBUG 945 946 /* save the term in thread private dispatch structure */ 947 *(DBL*)&pr->u.p.parm3 = x; 948 949 /* solve for the crossover point to the nearest integer i for which C_i <= chunk */ 950 { 951 UT left, right, mid; 952 long double p; 953 954 /* estimate initial upper and lower bound */ 955 956 /* doesn't matter what value right is as long as it is positive, but 957 it affects performance of the solver 958 */ 959 right = 229; 960 p = __kmp_pow< UT >(x,right); 961 if ( p > target ) { 962 do{ 963 p *= p; 964 right <<= 1; 965 } while(p>target && right < (1<<27)); 966 left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */ 967 } else { 968 left = 0; 969 } 970 971 /* bisection root-finding method */ 972 while ( left + 1 < right ) { 973 mid = (left + right) / 2; 974 if ( __kmp_pow< UT >(x,mid) > target ) { 975 left = mid; 976 } else { 977 right = mid; 978 } 979 } // while 980 cross = right; 981 } 982 /* assert sanity of computed crossover point */ 983 KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target); 984 985 /* save the crossover point in thread private dispatch structure */ 986 pr->u.p.parm2 = cross; 987 988 // C75803 989 #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) ) 990 #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3) 991 #else 992 #define GUIDED_ANALYTICAL_WORKAROUND (x) 993 #endif 994 /* dynamic-style scheduling offset */ 995 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk; 996 #if KMP_OS_WINDOWS && KMP_ARCH_X86 997 // restore FPCW 998 _control87(oldFpcw,0x30000); 999 #endif 1000 } // if 1001 } else { 1002 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n", 1003 gtid ) ); 1004 schedule = kmp_sch_static_greedy; 1005 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 1006 pr->u.p.parm1 = tc; 1007 } // if 1008 } // case 1009 break; 1010 case kmp_sch_static_greedy: 1011 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid)); 1012 pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ? 
1013 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc : 1014 tc; 1015 break; 1016 case kmp_sch_static_chunked : 1017 case kmp_sch_dynamic_chunked : 1018 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid)); 1019 break; 1020 case kmp_sch_trapezoidal : 1021 { 1022 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ 1023 1024 T parm1, parm2, parm3, parm4; 1025 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) ); 1026 1027 parm1 = chunk; 1028 1029 /* F : size of the first cycle */ 1030 parm2 = ( tc / (2 * team->t.t_nproc) ); 1031 1032 if ( parm2 < 1 ) { 1033 parm2 = 1; 1034 } 1035 1036 /* L : size of the last cycle. Make sure the last cycle 1037 * is not larger than the first cycle. 1038 */ 1039 if ( parm1 < 1 ) { 1040 parm1 = 1; 1041 } else if ( parm1 > parm2 ) { 1042 parm1 = parm2; 1043 } 1044 1045 /* N : number of cycles */ 1046 parm3 = ( parm2 + parm1 ); 1047 parm3 = ( 2 * tc + parm3 - 1) / parm3; 1048 1049 if ( parm3 < 2 ) { 1050 parm3 = 2; 1051 } 1052 1053 /* sigma : decreasing incr of the trapezoid */ 1054 parm4 = ( parm3 - 1 ); 1055 parm4 = ( parm2 - parm1 ) / parm4; 1056 1057 // pointless check, because parm4 >= 0 always 1058 //if ( parm4 < 0 ) { 1059 // parm4 = 0; 1060 //} 1061 1062 pr->u.p.parm1 = parm1; 1063 pr->u.p.parm2 = parm2; 1064 pr->u.p.parm3 = parm3; 1065 pr->u.p.parm4 = parm4; 1066 } // case 1067 break; 1068 1069 default: 1070 { 1071 __kmp_msg( 1072 kmp_ms_fatal, // Severity 1073 KMP_MSG( UnknownSchedTypeDetected ), // Primary message 1074 KMP_HNT( GetNewerLibrary ), // Hint 1075 __kmp_msg_null // Variadic argument list terminator 1076 ); 1077 } 1078 break; 1079 } // switch 1080 pr->schedule = schedule; 1081 if ( active ) { 1082 /* The name of this buffer should be my_buffer_index when it's free to use it */ 1083 1084 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n", 1085 gtid, my_buffer_index, sh->buffer_index) ); 1086 __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 > 1087 USE_ITT_BUILD_ARG( NULL ) 1088 ); 1089 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are 1090 // *always* 32-bit integers. 1091 KMP_MB(); /* is this necessary? 
*/ 1092 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n", 1093 gtid, my_buffer_index, sh->buffer_index) ); 1094 1095 th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr; 1096 th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh; 1097 #if USE_ITT_BUILD 1098 if ( pr->ordered ) { 1099 __kmp_itt_ordered_init( gtid ); 1100 }; // if 1101 #endif /* USE_ITT_BUILD */ 1102 }; // if 1103 #ifdef KMP_DEBUG 1104 { 1105 const char * buff; 1106 // create format specifiers before the debug output 1107 buff = __kmp_str_format( 1108 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \ 1109 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \ 1110 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n", 1111 traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec, 1112 traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec, 1113 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec, 1114 traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec ); 1115 KD_TRACE(10, ( buff, 1116 gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub, 1117 pr->u.p.st, pr->u.p.tc, pr->u.p.count, 1118 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1, 1119 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) ); 1120 __kmp_str_free( &buff ); 1121 } 1122 #endif 1123 #if ( KMP_STATIC_STEAL_ENABLED ) 1124 if ( ___kmp_size_type < 8 ) { 1125 // It cannot be guaranteed that after execution of a loop with some other schedule kind 1126 // all the parm3 variables will contain the same value. 1127 // Even if all parm3 will be the same, it still exists a bad case like using 0 and 1 1128 // rather than program life-time increment. 1129 // So the dedicated variable is required. The 'static_steal_counter' is used. 1130 if( schedule == kmp_sch_static_steal ) { 1131 // Other threads will inspect this variable when searching for a victim. 1132 // This is a flag showing that other threads may steal from this thread since then. 1133 volatile T * p = &pr->u.p.static_steal_counter; 1134 *p = *p + 1; 1135 } 1136 } 1137 #endif // ( KMP_STATIC_STEAL_ENABLED && USE_STEALING ) 1138 } 1139 1140 /* 1141 * For ordered loops, either __kmp_dispatch_finish() should be called after 1142 * every iteration, or __kmp_dispatch_finish_chunk() should be called after 1143 * every chunk of iterations. If the ordered section(s) were not executed 1144 * for this iteration (or every iteration in this chunk), we need to set the 1145 * ordered iteration counters so that the next thread can proceed. 1146 */ 1147 template< typename UT > 1148 static void 1149 __kmp_dispatch_finish( int gtid, ident_t *loc ) 1150 { 1151 typedef typename traits_t< UT >::signed_t ST; 1152 kmp_info_t *th = __kmp_threads[ gtid ]; 1153 1154 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) ); 1155 if ( ! 
th -> th.th_team -> t.t_serialized ) { 1156 1157 dispatch_private_info_template< UT > * pr = 1158 reinterpret_cast< dispatch_private_info_template< UT >* > 1159 ( th->th.th_dispatch->th_dispatch_pr_current ); 1160 dispatch_shared_info_template< UT > volatile * sh = 1161 reinterpret_cast< dispatch_shared_info_template< UT >volatile* > 1162 ( th->th.th_dispatch->th_dispatch_sh_current ); 1163 KMP_DEBUG_ASSERT( pr ); 1164 KMP_DEBUG_ASSERT( sh ); 1165 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1166 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1167 1168 if ( pr->ordered_bumped ) { 1169 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1170 gtid ) ); 1171 pr->ordered_bumped = 0; 1172 } else { 1173 UT lower = pr->u.p.ordered_lower; 1174 1175 #ifdef KMP_DEBUG 1176 { 1177 const char * buff; 1178 // create format specifiers before the debug output 1179 buff = __kmp_str_format( 1180 "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n", 1181 traits_t< UT >::spec, traits_t< UT >::spec ); 1182 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 1183 __kmp_str_free( &buff ); 1184 } 1185 #endif 1186 1187 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 1188 USE_ITT_BUILD_ARG(NULL) 1189 ); 1190 KMP_MB(); /* is this necessary? */ 1191 #ifdef KMP_DEBUG 1192 { 1193 const char * buff; 1194 // create format specifiers before the debug output 1195 buff = __kmp_str_format( 1196 "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n", 1197 traits_t< UT >::spec, traits_t< UT >::spec ); 1198 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 1199 __kmp_str_free( &buff ); 1200 } 1201 #endif 1202 1203 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration ); 1204 } // if 1205 } // if 1206 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) ); 1207 } 1208 1209 #ifdef KMP_GOMP_COMPAT 1210 1211 template< typename UT > 1212 static void 1213 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc ) 1214 { 1215 typedef typename traits_t< UT >::signed_t ST; 1216 kmp_info_t *th = __kmp_threads[ gtid ]; 1217 1218 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) ); 1219 if ( ! 
th -> th.th_team -> t.t_serialized ) { 1220 // int cid; 1221 dispatch_private_info_template< UT > * pr = 1222 reinterpret_cast< dispatch_private_info_template< UT >* > 1223 ( th->th.th_dispatch->th_dispatch_pr_current ); 1224 dispatch_shared_info_template< UT > volatile * sh = 1225 reinterpret_cast< dispatch_shared_info_template< UT >volatile* > 1226 ( th->th.th_dispatch->th_dispatch_sh_current ); 1227 KMP_DEBUG_ASSERT( pr ); 1228 KMP_DEBUG_ASSERT( sh ); 1229 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1230 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1231 1232 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) { 1233 UT lower = pr->u.p.ordered_lower; 1234 UT upper = pr->u.p.ordered_upper; 1235 UT inc = upper - lower + 1; 1236 1237 if ( pr->ordered_bumped == inc ) { 1238 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1239 gtid ) ); 1240 pr->ordered_bumped = 0; 1241 } else { 1242 inc -= pr->ordered_bumped; 1243 1244 #ifdef KMP_DEBUG 1245 { 1246 const char * buff; 1247 // create format specifiers before the debug output 1248 buff = __kmp_str_format( 1249 "__kmp_dispatch_finish_chunk: T#%%d before wait: " \ 1250 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n", 1251 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec ); 1252 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) ); 1253 __kmp_str_free( &buff ); 1254 } 1255 #endif 1256 1257 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 1258 USE_ITT_BUILD_ARG(NULL) 1259 ); 1260 1261 KMP_MB(); /* is this necessary? */ 1262 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n", 1263 gtid ) ); 1264 pr->ordered_bumped = 0; 1265 //!!!!! TODO check if the inc should be unsigned, or signed??? 1266 #ifdef KMP_DEBUG 1267 { 1268 const char * buff; 1269 // create format specifiers before the debug output 1270 buff = __kmp_str_format( 1271 "__kmp_dispatch_finish_chunk: T#%%d after wait: " \ 1272 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n", 1273 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec ); 1274 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) ); 1275 __kmp_str_free( &buff ); 1276 } 1277 #endif 1278 1279 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc); 1280 } 1281 // } 1282 } 1283 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) ); 1284 } 1285 1286 #endif /* KMP_GOMP_COMPAT */ 1287 1288 template< typename T > 1289 static int 1290 __kmp_dispatch_next( 1291 ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st 1292 ) { 1293 1294 typedef typename traits_t< T >::unsigned_t UT; 1295 typedef typename traits_t< T >::signed_t ST; 1296 typedef typename traits_t< T >::floating_t DBL; 1297 static const int ___kmp_size_type = sizeof( UT ); 1298 1299 int status; 1300 dispatch_private_info_template< T > * pr; 1301 kmp_info_t * th = __kmp_threads[ gtid ]; 1302 kmp_team_t * team = th -> th.th_team; 1303 1304 #ifdef KMP_DEBUG 1305 { 1306 const char * buff; 1307 // create format specifiers before the debug output 1308 buff = __kmp_str_format( 1309 "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n", 1310 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 1311 KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? 
*p_st : 0, p_last ) ); 1312 __kmp_str_free( &buff ); 1313 } 1314 #endif 1315 1316 if ( team -> t.t_serialized ) { 1317 /* NOTE: serialize this dispatch becase we are not at the active level */ 1318 pr = reinterpret_cast< dispatch_private_info_template< T >* > 1319 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */ 1320 KMP_DEBUG_ASSERT( pr ); 1321 1322 if ( (status = (pr->u.p.tc != 0)) == 0 ) { 1323 *p_lb = 0; 1324 *p_ub = 0; 1325 if ( p_st != 0 ) { 1326 *p_st = 0; 1327 } 1328 if ( __kmp_env_consistency_check ) { 1329 if ( pr->pushed_ws != ct_none ) { 1330 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 1331 } 1332 } 1333 } else if ( pr->nomerge ) { 1334 kmp_int32 last; 1335 T start; 1336 UT limit, trip, init; 1337 ST incr; 1338 T chunk = pr->u.p.parm1; 1339 1340 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) ); 1341 1342 init = chunk * pr->u.p.count++; 1343 trip = pr->u.p.tc - 1; 1344 1345 if ( (status = (init <= trip)) == 0 ) { 1346 *p_lb = 0; 1347 *p_ub = 0; 1348 if ( p_st != 0 ) *p_st = 0; 1349 if ( __kmp_env_consistency_check ) { 1350 if ( pr->pushed_ws != ct_none ) { 1351 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 1352 } 1353 } 1354 } else { 1355 start = pr->u.p.lb; 1356 limit = chunk + init - 1; 1357 incr = pr->u.p.st; 1358 1359 if ( (last = (limit >= trip)) != 0 ) { 1360 limit = trip; 1361 #if KMP_OS_WINDOWS 1362 pr->u.p.last_upper = pr->u.p.ub; 1363 #endif /* KMP_OS_WINDOWS */ 1364 } 1365 if ( p_last ) { 1366 *p_last = last; 1367 } 1368 if ( p_st != 0 ) { 1369 *p_st = incr; 1370 } 1371 if ( incr == 1 ) { 1372 *p_lb = start + init; 1373 *p_ub = start + limit; 1374 } else { 1375 *p_lb = start + init * incr; 1376 *p_ub = start + limit * incr; 1377 } 1378 1379 if ( pr->ordered ) { 1380 pr->u.p.ordered_lower = init; 1381 pr->u.p.ordered_upper = limit; 1382 #ifdef KMP_DEBUG 1383 { 1384 const char * buff; 1385 // create format specifiers before the debug output 1386 buff = __kmp_str_format( 1387 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1388 traits_t< UT >::spec, traits_t< UT >::spec ); 1389 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1390 __kmp_str_free( &buff ); 1391 } 1392 #endif 1393 } // if 1394 } // if 1395 } else { 1396 pr->u.p.tc = 0; 1397 1398 *p_lb = pr->u.p.lb; 1399 *p_ub = pr->u.p.ub; 1400 #if KMP_OS_WINDOWS 1401 pr->u.p.last_upper = *p_ub; 1402 #endif /* KMP_OS_WINDOWS */ 1403 1404 if ( p_st != 0 ) { 1405 *p_st = pr->u.p.st; 1406 } 1407 if ( p_last ) { 1408 *p_last = TRUE; 1409 } 1410 } // if 1411 #ifdef KMP_DEBUG 1412 { 1413 const char * buff; 1414 // create format specifiers before the debug output 1415 buff = __kmp_str_format( 1416 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \ 1417 "p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n", 1418 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 1419 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, status) ); 1420 __kmp_str_free( &buff ); 1421 } 1422 #endif 1423 return status; 1424 } else { 1425 kmp_int32 last = 0; 1426 dispatch_shared_info_template< UT > *sh; 1427 T start; 1428 ST incr; 1429 UT limit, trip, init; 1430 1431 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1432 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1433 1434 pr = reinterpret_cast< dispatch_private_info_template< T >* > 1435 ( th->th.th_dispatch->th_dispatch_pr_current ); 1436 KMP_DEBUG_ASSERT( pr ); 1437 sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 1438 
( th->th.th_dispatch->th_dispatch_sh_current ); 1439 KMP_DEBUG_ASSERT( sh ); 1440 1441 if ( pr->u.p.tc == 0 ) { 1442 // zero trip count 1443 status = 0; 1444 } else { 1445 switch (pr->schedule) { 1446 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 1447 case kmp_sch_static_steal: 1448 { 1449 T chunk = pr->u.p.parm1; 1450 1451 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) ); 1452 1453 trip = pr->u.p.tc - 1; 1454 1455 if ( ___kmp_size_type > 4 ) { 1456 // Other threads do not look into the data of this thread, 1457 // so it's not necessary to make volatile casting. 1458 init = ( pr->u.p.count )++; 1459 status = ( init < (UT)pr->u.p.ub ); 1460 } else { 1461 typedef union { 1462 struct { 1463 UT count; 1464 T ub; 1465 } p; 1466 kmp_int64 b; 1467 } union_i4; 1468 // All operations on 'count' or 'ub' must be combined atomically together. 1469 // stealing implemented only for 4-byte indexes 1470 { 1471 union_i4 vold, vnew; 1472 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count); 1473 vnew = vold; 1474 vnew.p.count++; 1475 while( ! KMP_COMPARE_AND_STORE_ACQ64( 1476 ( volatile kmp_int64* )&pr->u.p.count, 1477 *VOLATILE_CAST(kmp_int64 *)&vold.b, 1478 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) { 1479 KMP_CPU_PAUSE(); 1480 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count); 1481 vnew = vold; 1482 vnew.p.count++; 1483 } 1484 vnew = vold; 1485 init = vnew.p.count; 1486 status = ( init < (UT)vnew.p.ub ) ; 1487 } 1488 1489 if( !status ) { 1490 kmp_info_t **other_threads = team->t.t_threads; 1491 int while_limit = 10; 1492 int while_index = 0; 1493 1494 // TODO: algorithm of searching for a victim 1495 // should be cleaned up and measured 1496 while ( ( !status ) && ( while_limit != ++while_index ) ) { 1497 union_i4 vold, vnew; 1498 kmp_int32 remaining; // kmp_int32 because KMP_I4 only 1499 T victimIdx = pr->u.p.parm4; 1500 T oldVictimIdx = victimIdx; 1501 dispatch_private_info_template< T > * victim; 1502 1503 do { 1504 if( !victimIdx ) { 1505 victimIdx = team->t.t_nproc - 1; 1506 } else { 1507 --victimIdx; 1508 } 1509 victim = reinterpret_cast< dispatch_private_info_template< T >* > 1510 ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current ); 1511 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx ); 1512 // TODO: think about a proper place of this test 1513 if ( ( !victim ) || 1514 ( (*( volatile T * )&victim->u.p.static_steal_counter) != 1515 (*( volatile T * )&pr->u.p.static_steal_counter) ) ) { 1516 // TODO: delay would be nice 1517 continue; 1518 // the victim is not ready yet to participate in stealing 1519 // because the victim is still in kmp_init_dispatch 1520 } 1521 if ( oldVictimIdx == victimIdx ) { 1522 break; 1523 } 1524 pr->u.p.parm4 = victimIdx; 1525 1526 while( 1 ) { 1527 vold.b = *( volatile kmp_int64 * )( &victim->u.p.count ); 1528 vnew = vold; 1529 1530 KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip ); 1531 if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) { 1532 break; 1533 } 1534 vnew.p.ub -= (remaining >> 2); 1535 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1536 #pragma warning( push ) 1537 // disable warning on pointless comparison of unsigned with 0 1538 #pragma warning( disable: 186 ) 1539 KMP_DEBUG_ASSERT(vnew.p.ub >= 0); 1540 #pragma warning( pop ) 1541 // TODO: Should this be acquire or release? 
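// The 64-bit compare-and-store just below atomically shrinks the victim's packed
// {count, ub} pair: both 32-bit fields live in one kmp_int64 (union_i4), so the
// thief can lower the victim's upper bound only if neither field changed since it
// read them, and no chunk can be claimed twice. A condensed, illustrative (not
// compiled) sketch of that optimistic update, reusing the local names of this case:
#if 0
    union_i4 vold, vnew;
    vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );  // one atomic read of {count, ub}
    vnew   = vold;
    vnew.p.ub -= ( vnew.p.ub - vnew.p.count ) >> 2;            // try to steal ~1/4 of the victim's range
    if ( KMP_COMPARE_AND_STORE_ACQ64( ( volatile kmp_int64 * )&victim->u.p.count,
                                      *VOLATILE_CAST(kmp_int64 *)&vold.b,
                                      *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
        // success: chunk indexes [vnew.p.ub, vold.p.ub) now belong to the thief
    }
    // on failure the victim's pair changed underneath us; re-read and retry
#endif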
1542 if ( KMP_COMPARE_AND_STORE_ACQ64( 1543 ( volatile kmp_int64 * )&victim->u.p.count, 1544 *VOLATILE_CAST(kmp_int64 *)&vold.b, 1545 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) { 1546 status = 1; 1547 while_index = 0; 1548 // now update own count and ub 1549 #if KMP_ARCH_X86 1550 // stealing executed on non-KMP_ARCH_X86 only 1551 // Atomic 64-bit write on ia32 is 1552 // unavailable, so we do this in steps. 1553 // This code is not tested. 1554 init = vold.p.count; 1555 pr->u.p.ub = 0; 1556 pr->u.p.count = init + 1; 1557 pr->u.p.ub = vnew.p.count; 1558 #else 1559 init = vnew.p.ub; 1560 vold.p.count = init + 1; 1561 // TODO: is it safe and enough? 1562 *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b; 1563 #endif // KMP_ARCH_X86 1564 break; 1565 } // if 1566 KMP_CPU_PAUSE(); 1567 } // while (1) 1568 } // while 1569 } // if 1570 } // if 1571 if ( !status ) { 1572 *p_lb = 0; 1573 *p_ub = 0; 1574 if ( p_st != 0 ) *p_st = 0; 1575 } else { 1576 start = pr->u.p.parm2; 1577 init *= chunk; 1578 limit = chunk + init - 1; 1579 incr = pr->u.p.st; 1580 1581 KMP_DEBUG_ASSERT(init <= trip); 1582 if ( (last = (limit >= trip)) != 0 ) 1583 limit = trip; 1584 if ( p_last ) { 1585 *p_last = last; 1586 } 1587 if ( p_st != 0 ) *p_st = incr; 1588 1589 if ( incr == 1 ) { 1590 *p_lb = start + init; 1591 *p_ub = start + limit; 1592 } else { 1593 *p_lb = start + init * incr; 1594 *p_ub = start + limit * incr; 1595 } 1596 1597 if ( pr->ordered ) { 1598 pr->u.p.ordered_lower = init; 1599 pr->u.p.ordered_upper = limit; 1600 #ifdef KMP_DEBUG 1601 { 1602 const char * buff; 1603 // create format specifiers before the debug output 1604 buff = __kmp_str_format( 1605 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1606 traits_t< UT >::spec, traits_t< UT >::spec ); 1607 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1608 __kmp_str_free( &buff ); 1609 } 1610 #endif 1611 } // if 1612 } // if 1613 break; 1614 } // case 1615 #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 1616 case kmp_sch_static_balanced: 1617 { 1618 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) ); 1619 if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */ 1620 pr->u.p.count = 1; 1621 *p_lb = pr->u.p.lb; 1622 *p_ub = pr->u.p.ub; 1623 last = pr->u.p.parm1; 1624 if ( p_last ) { 1625 *p_last = last; 1626 } 1627 if ( p_st ) 1628 *p_st = pr->u.p.st; 1629 } else { /* no iterations to do */ 1630 pr->u.p.lb = pr->u.p.ub + pr->u.p.st; 1631 } 1632 if ( pr->ordered ) { 1633 #ifdef KMP_DEBUG 1634 { 1635 const char * buff; 1636 // create format specifiers before the debug output 1637 buff = __kmp_str_format( 1638 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1639 traits_t< UT >::spec, traits_t< UT >::spec ); 1640 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1641 __kmp_str_free( &buff ); 1642 } 1643 #endif 1644 } // if 1645 } // case 1646 break; 1647 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */ 1648 case kmp_sch_static_chunked: 1649 { 1650 T parm1; 1651 1652 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n", 1653 gtid ) ); 1654 parm1 = pr->u.p.parm1; 1655 1656 trip = pr->u.p.tc - 1; 1657 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid)); 1658 1659 if ( (status = (init <= trip)) != 0 ) { 1660 start = pr->u.p.lb; 1661 incr = pr->u.p.st; 1662 limit = parm1 + init - 1; 1663 1664 if ( (last = (limit >= trip)) 
                     != 0 )
                    limit = trip;

                if ( p_last ) {
                    *p_last = last;
                }
                if ( p_st != 0 ) *p_st = incr;

                pr->u.p.count += team->t.t_nproc;

                if ( incr == 1 ) {
                    *p_lb = start + init;
                    *p_ub = start + limit;
                }
                else {
                    *p_lb = start + init * incr;
                    *p_ub = start + limit * incr;
                }

                if ( pr->ordered ) {
                    pr->u.p.ordered_lower = init;
                    pr->u.p.ordered_upper = limit;
                    #ifdef KMP_DEBUG
                    {
                        const char * buff;
                        // create format specifiers before the debug output
                        buff = __kmp_str_format(
                            "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                            traits_t< UT >::spec, traits_t< UT >::spec );
                        KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                        __kmp_str_free( &buff );
                    }
                    #endif
                } // if
            } // if
        } // case
        break;

        case kmp_sch_dynamic_chunked:
        {
            T chunk = pr->u.p.parm1;

            KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
                           gtid ) );

            init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
            trip = pr->u.p.tc - 1;

            if ( (status = (init <= trip)) == 0 ) {
                *p_lb = 0;
                *p_ub = 0;
                if ( p_st != 0 ) *p_st = 0;
            } else {
                start = pr->u.p.lb;
                limit = chunk + init - 1;
                incr  = pr->u.p.st;

                if ( (last = (limit >= trip)) != 0 )
                    limit = trip;
                if ( p_last ) {
                    *p_last = last;
                }
                if ( p_st != 0 ) *p_st = incr;

                if ( incr == 1 ) {
                    *p_lb = start + init;
                    *p_ub = start + limit;
                } else {
                    *p_lb = start + init * incr;
                    *p_ub = start + limit * incr;
                }

                if ( pr->ordered ) {
                    pr->u.p.ordered_lower = init;
                    pr->u.p.ordered_upper = limit;
                    #ifdef KMP_DEBUG
                    {
                        const char * buff;
                        // create format specifiers before the debug output
                        buff = __kmp_str_format(
                            "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                            traits_t< UT >::spec, traits_t< UT >::spec );
                        KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                        __kmp_str_free( &buff );
                    }
                    #endif
                } // if
            } // if
        } // case
        break;

        case kmp_sch_guided_iterative_chunked:
        {
            T  chunkspec = pr->u.p.parm1;
            KD_TRACE(100,
                ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
            trip = pr->u.p.tc;
            // Start atomic part of calculations
            while(1) {
                ST  remaining;             // signed, because can be < 0
                init = sh->u.s.iteration;  // shared value
                remaining = trip - init;
                if ( remaining <= 0 ) {    // AC: need to compare with 0 first
                    // nothing to do, don't try atomic op
                    status = 0;
                    break;
                }
                if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
                    // use dynamic-style schedule
                    // atomically increment iterations, get old value
                    init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
                    remaining = trip - init;
                    if (remaining <= 0) {
                        status = 0;    // all iterations were taken by other threads
                    } else {
                        // got some iterations to work on
                        status = 1;
                        if ( (T)remaining > chunkspec ) {
                            limit = init + chunkspec - 1;
                        } else {
                            last = 1;   // the last chunk
                            limit = init + remaining - 1;
                        } // if
                    } // if
                    break;
                } // if
                limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
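
                // The compare_and_swap below publishes the tentative upper bound computed
                // from the snapshot 'init': the iterations from 'init' up to (but not
                // including) 'limit' are claimed only if no other thread has advanced
                // sh->u.s.iteration since 'init' was read; otherwise the while(1) loop
                // retries with a fresh snapshot. An illustrative (not compiled) skeleton
                // of this optimistic-claim pattern on a plain shared counter, with the
                // hypothetical names 'shared_counter' and 'want':
#if 0
                for ( ;; ) {
                    ST snapshot  = shared_counter;           // read the current position
                    ST new_value = snapshot + want;          // tentative claim of 'want' iterations
                    if ( compare_and_swap<ST>( &shared_counter, snapshot, new_value ) )
                        break;                               // claim succeeded: [snapshot, new_value)
                    // another thread moved the counter; recompute and retry
                }
#endif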
                if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
                    // CAS was successful, chunk obtained
                    status = 1;
                    --limit;
                    break;
                } // if
            } // while
            if ( status != 0 ) {
                start = pr->u.p.lb;
                incr  = pr->u.p.st;
                if ( p_st != NULL )
                    *p_st = incr;
                if ( p_last != NULL )
                    *p_last = last;
                *p_lb = start + init * incr;
                *p_ub = start + limit * incr;
                if ( pr->ordered ) {
                    pr->u.p.ordered_lower = init;
                    pr->u.p.ordered_upper = limit;
                    #ifdef KMP_DEBUG
                    {
                        const char * buff;
                        // create format specifiers before the debug output
                        buff = __kmp_str_format(
                            "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                            traits_t< UT >::spec, traits_t< UT >::spec );
                        KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                        __kmp_str_free( &buff );
                    }
                    #endif
                } // if
            } else {
                *p_lb = 0;
                *p_ub = 0;
                if ( p_st != NULL )
                    *p_st = 0;
            } // if
        } // case
        break;

        case kmp_sch_guided_analytical_chunked:
        {
            T  chunkspec = pr->u.p.parm1;
            UT chunkIdx;
            #if KMP_OS_WINDOWS && KMP_ARCH_X86
            /* for storing original FPCW value for Windows* OS on
               IA-32 architecture 8-byte version */
            unsigned int oldFpcw;
            int fpcwSet = 0;
            #endif
            KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
                           gtid ) );

            trip = pr->u.p.tc;

            KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
            KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);

            while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
                chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
                if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
                    --trip;
                    /* use dynamic-style scheduling */
                    init = chunkIdx * chunkspec + pr->u.p.count;
                    /* need to verify init > 0 in case of overflow in the above calculation */
                    if ( (status = (init > 0 && init <= trip)) != 0 ) {
                        limit = init + chunkspec - 1;

                        if ( (last = (limit >= trip)) != 0 )
                            limit = trip;
                    }
                    break;
                } else {
                    /* use exponential-style scheduling */
                    /* The following check is to workaround the lack of long double precision on Windows* OS.
                       This check works around the possible effect that init != 0 for chunkIdx == 0.
                     */
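                    /* Note on the exponential branch below (explanatory only):
                       __kmp_dispatch_guided_remaining( trip, parm3, k ) reports how many of
                       the trip iterations are still unassigned once k analytically sized
                       chunks have been handed out, so chunk number chunkIdx covers
                           [ trip - remaining(chunkIdx), trip - remaining(chunkIdx+1) )
                       and the --limit further down converts that into the inclusive
                       init/limit pair used by the rest of this routine. */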
                    #if KMP_OS_WINDOWS && KMP_ARCH_X86
                    /* If we haven't already done so, save original
                       FPCW and set precision to 64-bit, as Windows* OS
                       on IA-32 architecture defaults to 53-bit */
                    if ( !fpcwSet ) {
                        oldFpcw = _control87(0,0x30000);
                        fpcwSet = 0x30000;
                    }
                    #endif
                    if ( chunkIdx ) {
                        init = __kmp_dispatch_guided_remaining< T >(
                                   trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
                        KMP_DEBUG_ASSERT(init);
                        init = trip - init;
                    } else
                        init = 0;
                    limit = trip - __kmp_dispatch_guided_remaining< T >(
                                       trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
                    KMP_ASSERT(init <= limit);
                    if ( init < limit ) {
                        KMP_DEBUG_ASSERT(limit <= trip);
                        --limit;
                        status = 1;
                        break;
                    } // if
                } // if
            } // while (1)
            #if KMP_OS_WINDOWS && KMP_ARCH_X86
            /* restore FPCW if necessary; check fpcwSet first so that oldFpcw
               is only read after it has actually been saved */
            if ( fpcwSet && ( oldFpcw & fpcwSet ) )
                _control87(oldFpcw,0x30000);
            #endif
            if ( status != 0 ) {
                start = pr->u.p.lb;
                incr  = pr->u.p.st;
                if ( p_st != NULL )
                    *p_st = incr;
                if ( p_last != NULL )
                    *p_last = last;
                *p_lb = start + init * incr;
                *p_ub = start + limit * incr;
                if ( pr->ordered ) {
                    pr->u.p.ordered_lower = init;
                    pr->u.p.ordered_upper = limit;
                    #ifdef KMP_DEBUG
                    {
                        const char * buff;
                        // create format specifiers before the debug output
                        buff = __kmp_str_format(
                            "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                            traits_t< UT >::spec, traits_t< UT >::spec );
                        KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                        __kmp_str_free( &buff );
                    }
                    #endif
                }
            } else {
                *p_lb = 0;
                *p_ub = 0;
                if ( p_st != NULL )
                    *p_st = 0;
            }
        } // case
        break;

        case kmp_sch_trapezoidal:
        {
            UT index;
            T  parm2 = pr->u.p.parm2;
            T  parm3 = pr->u.p.parm3;
            T  parm4 = pr->u.p.parm4;
            KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
                           gtid ) );

            index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );

            init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
            trip = pr->u.p.tc - 1;

            if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
                *p_lb = 0;
                *p_ub = 0;
                if ( p_st != 0 ) *p_st = 0;
            } else {
                start = pr->u.p.lb;
                limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
                incr  = pr->u.p.st;

                if ( (last = (limit >= trip)) != 0 )
                    limit = trip;

                if ( p_last != 0 ) {
                    *p_last = last;
                }
                if ( p_st != 0 ) *p_st = incr;

                if ( incr == 1 ) {
                    *p_lb = start + init;
                    *p_ub = start + limit;
                } else {
                    *p_lb = start + init * incr;
                    *p_ub = start + limit * incr;
                }

                if ( pr->ordered ) {
                    pr->u.p.ordered_lower = init;
                    pr->u.p.ordered_upper = limit;
                    #ifdef KMP_DEBUG
                    {
                        const char * buff;
                        // create format specifiers before the debug output
                        buff = __kmp_str_format(
                            "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                            traits_t< UT >::spec, traits_t< UT >::spec );
                        KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                        __kmp_str_free( &buff );
                    }
                    #endif
                } // if
            } // if
        } // case
        break;
        } // switch
        } // if (tc == 0)

        if ( status == 0 ) {
            UT num_done;

            num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
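            // num_done counts the threads that are finished with this dispatch
            // buffer; the thread that observes the count reach nproc-1 below is
            // the last one out and is responsible for resetting the shared
            // fields and advancing buffer_index so the buffer can be recycled
            // for a later loop.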
            #ifdef KMP_DEBUG
            {
                const char * buff;
                // create format specifiers before the debug output
                buff = __kmp_str_format(
                    "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
                    traits_t< UT >::spec );
                KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
                __kmp_str_free( &buff );
            }
            #endif

            if ( num_done == team->t.t_nproc-1 ) {
                /* NOTE: release this buffer to be reused */

                KMP_MB();  /* Flush all pending memory write invalidates.  */

                sh->u.s.num_done = 0;
                sh->u.s.iteration = 0;

                /* TODO replace with general release procedure? */
                if ( pr->ordered ) {
                    sh->u.s.ordered_iteration = 0;
                }

                KMP_MB();  /* Flush all pending memory write invalidates.  */

                sh -> buffer_index += KMP_MAX_DISP_BUF;
                KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
                               gtid, sh->buffer_index) );

                KMP_MB();  /* Flush all pending memory write invalidates.  */

            } // if
            if ( __kmp_env_consistency_check ) {
                if ( pr->pushed_ws != ct_none ) {
                    pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
                }
            }

            th -> th.th_dispatch -> th_deo_fcn = NULL;
            th -> th.th_dispatch -> th_dxo_fcn = NULL;
            th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
            th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
        } // if (status == 0)
#if KMP_OS_WINDOWS
        else if ( last ) {
            pr->u.p.last_upper = pr->u.p.ub;
        }
#endif /* KMP_OS_WINDOWS */
    } // if

    #ifdef KMP_DEBUG
    {
        const char * buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_next: T#%%d normal case: " \
            "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
            traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
        KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
        __kmp_str_free( &buff );
    }
    #endif
    return status;
}

//-----------------------------------------------------------------------------------------
// Dispatch routines
// Transfer call to template< type T >
// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
//                      T lb, T ub, ST st, ST chunk )
extern "C" {

/*!
@ingroup WORK_SHARING
@{
@param loc Source location
@param gtid Global thread id
@param schedule Schedule type
@param lb  Lower bound
@param ub  Upper bound
@param st  Step (or increment if you prefer)
@param chunk The chunk size to block with

This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
These functions are all identical apart from the types of the arguments.
*/

void
__kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                        kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}
/*!
See @ref __kmpc_dispatch_init_4
*/
void
__kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                         kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}
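/* Usage sketch (illustrative only, not part of the library): a compiler
   lowering

       #pragma omp for schedule(dynamic, 4)
       for ( int i = 0; i < n; ++i ) body( i );

   would typically emit something shaped like

       __kmpc_dispatch_init_4( &loc, gtid, kmp_sch_dynamic_chunked,
                               0, n - 1, 1, 4 );

   before the chunk-retrieval loop sketched near __kmpc_dispatch_next_4 below.
   The exact calling convention is compiler-specific; only the entry point and
   the kmp_sch_dynamic_chunked schedule kind are taken from this file. */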
/*!
See @ref __kmpc_dispatch_init_4
*/
void
__kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                        kmp_int64 lb, kmp_int64 ub,
                        kmp_int64 st, kmp_int64 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void
__kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                         kmp_uint64 lb, kmp_uint64 ub,
                         kmp_int64 st, kmp_int64 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

/*!
@param loc Source code location
@param gtid Global thread id
@param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
@param p_lb   Pointer to the lower bound for the next chunk of work
@param p_ub   Pointer to the upper bound for the next chunk of work
@param p_st   Pointer to the stride for the next chunk of work
@return one if there is work to be done, zero otherwise

Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then the lb, ub and stride need not be modified.
*/
int
__kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                        kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
{
    return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int
__kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                         kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
{
    return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int
__kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                        kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
{
    return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int
__kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                         kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
{
    return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

/*!
@param loc Source code location
@param gtid Global thread id

Mark the end of a dynamic loop.
*/
void
__kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void
__kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void
__kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void
__kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
}
/*!
@} */
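/* Usage sketch (illustrative only), continuing the lowering example shown
   after __kmpc_dispatch_init_4u above: once the loop is initialized, each
   thread repeatedly asks the runtime for chunks until none are left.

       kmp_int32 lb, ub, st, last;
       while ( __kmpc_dispatch_next_4( &loc, gtid, &last, &lb, &ub, &st ) ) {
           for ( kmp_int32 i = lb; i <= ub; i += st )
               body( i );
       }

   The bounds returned through p_lb/p_ub are inclusive.  Whether and where the
   compiler emits the matching __kmpc_dispatch_fini_4 (documented above as
   marking the end of a dynamic loop) is compiler-specific. */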
//-----------------------------------------------------------------------------------------
// Non-template routines from kmp_dispatch.cpp used in other sources

kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
    return value == checker;
}

kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
    return value != checker;
}

kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
    return value < checker;
}

kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
    return value >= checker;
}

kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
    return value <= checker;
}

kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
    return value == checker;
}

kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
    return value != checker;
}

kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
    return value < checker;
}

kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
    return value >= checker;
}

kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
    return value <= checker;
}

kmp_uint32
__kmp_wait_yield_4(volatile kmp_uint32 * spinner,
                   kmp_uint32            checker,
                   kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
                   , void * obj    // Higher-level synchronization object, or NULL.
                   )
{
    // note: we may not belong to a team at this point
    register volatile kmp_uint32 * spin  = spinner;
    register kmp_uint32            check = checker;
    register kmp_uint32            spins;
    register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
    register kmp_uint32            r;

    KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
    KMP_INIT_YIELD( spins );
    // main wait spin loop
    while(!f(r = TCR_4(*spin), check)) {
        KMP_FSYNC_SPIN_PREPARE( obj );
        /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
           It causes problems with infinite recursion because of exit lock */
        /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
            __kmp_abort_thread(); */

        __kmp_static_delay(TRUE);

        /* if we have waited a bit, or are oversubscribed, yield */
        /* pause is in the following code */
        KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
        KMP_YIELD_SPIN( spins );
    }
    KMP_FSYNC_SPIN_ACQUIRED( obj );
    return r;
}
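/* Usage sketch for the 32-bit waiter above (names are hypothetical): a caller
   blocks until a shared location reaches an expected value by pairing the
   waiter with one of the predicates defined earlier in this file, e.g.

       kmp_uint32 seen = __kmp_wait_yield_4( &flag, expected, __kmp_eq_4, NULL );

   The predicate is re-evaluated on every poll, and the KMP_YIELD /
   KMP_YIELD_SPIN calls inside the loop back off when the machine is
   oversubscribed.  __kmp_wait_yield_8 below is the 64-bit counterpart. */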
kmp_uint64
__kmp_wait_yield_8( volatile kmp_uint64 * spinner,
                    kmp_uint64            checker,
                    kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
                    , void * obj    // Higher-level synchronization object, or NULL.
                    )
{
    // note: we may not belong to a team at this point
    register volatile kmp_uint64 * spin  = spinner;
    register kmp_uint64            check = checker;
    register kmp_uint32            spins;
    register kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
    register kmp_uint64            r;

    KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
    KMP_INIT_YIELD( spins );
    // main wait spin loop
    while(!f(r = *spin, check))
    {
        KMP_FSYNC_SPIN_PREPARE( obj );
        /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
           It causes problems with infinite recursion because of exit lock */
        /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
            __kmp_abort_thread(); */

        __kmp_static_delay(TRUE);

        // if we are oversubscribed, or have waited a bit
        // (and KMP_LIBRARY=throughput), then yield
        // pause is in the following code
        KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
        KMP_YIELD_SPIN( spins );
    }
    KMP_FSYNC_SPIN_ACQUIRED( obj );
    return r;
}

} // extern "C"

#ifdef KMP_GOMP_COMPAT

void
__kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                           kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
                           kmp_int32 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
                                      push_ws );
}

void
__kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                            kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
                            kmp_int32 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
                                       push_ws );
}

void
__kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                           kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
                           kmp_int64 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
                                      push_ws );
}

void
__kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                            kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
                            kmp_int64 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
                                       push_ws );
}

void
__kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
}

void
__kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
}

void
__kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
}

void
__kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
}

#endif /* KMP_GOMP_COMPAT */

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */