1 /* 2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch. 3 * $Revision: 42674 $ 4 * $Date: 2013-09-18 11:12:49 -0500 (Wed, 18 Sep 2013) $ 5 */ 6 7 8 //===----------------------------------------------------------------------===// 9 // 10 // The LLVM Compiler Infrastructure 11 // 12 // This file is dual licensed under the MIT and the University of Illinois Open 13 // Source Licenses. See LICENSE.txt for details. 14 // 15 //===----------------------------------------------------------------------===// 16 17 18 /* 19 * Dynamic scheduling initialization and dispatch. 20 * 21 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however 22 * it may change values between parallel regions. __kmp_max_nth 23 * is the largest value __kmp_nth may take, 1 is the smallest. 24 * 25 */ 26 27 /* ------------------------------------------------------------------------ */ 28 /* ------------------------------------------------------------------------ */ 29 30 #include "kmp.h" 31 #include "kmp_i18n.h" 32 #include "kmp_itt.h" 33 #include "kmp_str.h" 34 #include "kmp_error.h" 35 #if KMP_OS_WINDOWS && KMP_ARCH_X86 36 #include <float.h> 37 #endif 38 39 /* ------------------------------------------------------------------------ */ 40 /* ------------------------------------------------------------------------ */ 41 42 #ifdef KMP_STATIC_STEAL_ENABLED 43 44 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types 45 template< typename T > 46 struct dispatch_private_infoXX_template { 47 typedef typename traits_t< T >::unsigned_t UT; 48 typedef typename traits_t< T >::signed_t ST; 49 UT count; // unsigned 50 T ub; 51 /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */ 52 T lb; 53 ST st; // signed 54 UT tc; // unsigned 55 T static_steal_counter; // for static_steal only; maybe better to put after ub 56 57 /* parm[1-4] are used in different ways by different scheduling algorithms */ 58 59 // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on ) 60 // a) parm3 is properly aligned and 61 // b) all parm1-4 are in the same cache line. 62 // Because of parm1-4 are used together, performance seems to be better 63 // if they are in the same line (not measured though). 64 65 struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4 66 T parm1; 67 T parm2; 68 T parm3; 69 T parm4; 70 }; 71 72 UT ordered_lower; // unsigned 73 UT ordered_upper; // unsigned 74 #if KMP_OS_WINDOWS 75 T last_upper; 76 #endif /* KMP_OS_WINDOWS */ 77 }; 78 79 #else /* KMP_STATIC_STEAL_ENABLED */ 80 81 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types 82 template< typename T > 83 struct dispatch_private_infoXX_template { 84 typedef typename traits_t< T >::unsigned_t UT; 85 typedef typename traits_t< T >::signed_t ST; 86 T lb; 87 T ub; 88 ST st; // signed 89 UT tc; // unsigned 90 91 T parm1; 92 T parm2; 93 T parm3; 94 T parm4; 95 96 UT count; // unsigned 97 98 UT ordered_lower; // unsigned 99 UT ordered_upper; // unsigned 100 #if KMP_OS_WINDOWS 101 T last_upper; 102 #endif /* KMP_OS_WINDOWS */ 103 }; 104 105 #endif /* KMP_STATIC_STEAL_ENABLED */ 106 107 // replaces dispatch_private_info structure and dispatch_private_info_t type 108 template< typename T > 109 struct KMP_ALIGN_CACHE dispatch_private_info_template { 110 // duplicate alignment here, otherwise size of structure is not correct in our compiler 111 union KMP_ALIGN_CACHE private_info_tmpl { 112 dispatch_private_infoXX_template< T > p; 113 dispatch_private_info64_t p64; 114 } u; 115 enum sched_type schedule; /* scheduling algorithm */ 116 kmp_uint32 ordered; /* ordered clause specified */ 117 kmp_uint32 ordered_bumped; 118 kmp_int32 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making order 119 dispatch_private_info * next; /* stack of buffers for nest of serial regions */ 120 kmp_uint32 nomerge; /* don't merge iters if serialized */ 121 kmp_uint32 type_size; 122 enum cons_type pushed_ws; 123 }; 124 125 126 // replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types 127 template< typename UT > 128 struct dispatch_shared_infoXX_template { 129 /* chunk index under dynamic, number of idle threads under static-steal; 130 iteration index otherwise */ 131 volatile UT iteration; 132 volatile UT num_done; 133 volatile UT ordered_iteration; 134 UT ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar 135 }; 136 137 // replaces dispatch_shared_info structure and dispatch_shared_info_t type 138 template< typename UT > 139 struct dispatch_shared_info_template { 140 // we need union here to keep the structure size 141 union shared_info_tmpl { 142 dispatch_shared_infoXX_template< UT > s; 143 dispatch_shared_info64_t s64; 144 } u; 145 volatile kmp_uint32 buffer_index; 146 }; 147 148 /* ------------------------------------------------------------------------ */ 149 /* ------------------------------------------------------------------------ */ 150 151 static void 152 __kmp_static_delay( int arg ) 153 { 154 /* Work around weird code-gen bug that causes assert to trip */ 155 #if KMP_ARCH_X86_64 && KMP_OS_LINUX 156 #else 157 KMP_ASSERT( arg >= 0 ); 158 #endif 159 } 160 161 static void 162 __kmp_static_yield( int arg ) 163 { 164 __kmp_yield( arg ); 165 } 166 167 #undef USE_TEST_LOCKS 168 169 // test_then_add template (general template should NOT be used) 170 template< typename T > 171 static __forceinline T 172 test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); }; 173 174 template<> 175 __forceinline kmp_int32 176 test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d ) 177 { 178 kmp_int32 r; 179 r = KMP_TEST_THEN_ADD32( p, d ); 180 return r; 181 } 182 183 template<> 184 __forceinline kmp_int64 185 test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d ) 186 { 187 kmp_int64 r; 188 r = KMP_TEST_THEN_ADD64( p, d ); 189 return r; 190 } 191 192 // test_then_inc_acq template (general template should NOT be used) 193 template< typename T > 194 static __forceinline T 195 test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); }; 196 197 template<> 198 __forceinline kmp_int32 199 test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p ) 200 { 201 kmp_int32 r; 202 r = KMP_TEST_THEN_INC_ACQ32( p ); 203 return r; 204 } 205 206 template<> 207 __forceinline kmp_int64 208 test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p ) 209 { 210 kmp_int64 r; 211 r = KMP_TEST_THEN_INC_ACQ64( p ); 212 return r; 213 } 214 215 // test_then_inc template (general template should NOT be used) 216 template< typename T > 217 static __forceinline T 218 test_then_inc( volatile T *p ) { KMP_ASSERT(0); }; 219 220 template<> 221 __forceinline kmp_int32 222 test_then_inc< kmp_int32 >( volatile kmp_int32 *p ) 223 { 224 kmp_int32 r; 225 r = KMP_TEST_THEN_INC32( p ); 226 return r; 227 } 228 229 template<> 230 __forceinline kmp_int64 231 test_then_inc< kmp_int64 >( volatile kmp_int64 *p ) 232 { 233 kmp_int64 r; 234 r = KMP_TEST_THEN_INC64( p ); 235 return r; 236 } 237 238 // compare_and_swap template (general template should NOT be used) 239 template< typename T > 240 static __forceinline kmp_int32 241 compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); }; 242 243 template<> 244 __forceinline kmp_int32 245 compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s ) 246 { 247 return KMP_COMPARE_AND_STORE_REL32( p, c, s ); 248 } 249 250 template<> 251 __forceinline kmp_int32 252 compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s ) 253 { 254 return KMP_COMPARE_AND_STORE_REL64( p, c, s ); 255 } 256 257 /* 258 Spin wait loop that first does pause, then yield. 259 Waits until function returns non-zero when called with *spinner and check. 260 Does NOT put threads to sleep. 261 #if USE_ITT_BUILD 262 Arguments: 263 obj -- is higher-level synchronization object to report to ittnotify. It is used to report 264 locks consistently. For example, if lock is acquired immediately, its address is 265 reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, it lock cannot be acquired 266 immediately and lock routine calls to KMP_WAIT_YIELD(), the later should report the same 267 address, not an address of low-level spinner. 268 #endif // USE_ITT_BUILD 269 */ 270 template< typename UT > 271 // ToDo: make inline function (move to header file for icl) 272 static UT // unsigned 4- or 8-byte type 273 __kmp_wait_yield( volatile UT * spinner, 274 UT checker, 275 kmp_uint32 (* pred)( UT, UT ) 276 USE_ITT_BUILD_ARG(void * obj) // Higher-level synchronization object, or NULL. 277 ) 278 { 279 // note: we may not belong to a team at this point 280 register volatile UT * spin = spinner; 281 register UT check = checker; 282 register kmp_uint32 spins; 283 register kmp_uint32 (*f) ( UT, UT ) = pred; 284 register UT r; 285 286 KMP_FSYNC_SPIN_INIT( obj, (void*) spin ); 287 KMP_INIT_YIELD( spins ); 288 // main wait spin loop 289 while(!f(r = *spin, check)) 290 { 291 KMP_FSYNC_SPIN_PREPARE( obj ); 292 /* GEH - remove this since it was accidentally introduced when kmp_wait was split. 293 It causes problems with infinite recursion because of exit lock */ 294 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 295 __kmp_abort_thread(); */ 296 297 __kmp_static_delay(TRUE); 298 299 // if we are oversubscribed, 300 // or have waited a bit (and KMP_LIBRARY=throughput, then yield 301 // pause is in the following code 302 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc ); 303 KMP_YIELD_SPIN( spins ); 304 } 305 KMP_FSYNC_SPIN_ACQUIRED( obj ); 306 return r; 307 } 308 309 template< typename UT > 310 static kmp_uint32 __kmp_eq( UT value, UT checker) { 311 return value == checker; 312 } 313 314 template< typename UT > 315 static kmp_uint32 __kmp_neq( UT value, UT checker) { 316 return value != checker; 317 } 318 319 template< typename UT > 320 static kmp_uint32 __kmp_lt( UT value, UT checker) { 321 return value < checker; 322 } 323 324 template< typename UT > 325 static kmp_uint32 __kmp_ge( UT value, UT checker) { 326 return value >= checker; 327 } 328 329 template< typename UT > 330 static kmp_uint32 __kmp_le( UT value, UT checker) { 331 return value <= checker; 332 } 333 334 335 /* ------------------------------------------------------------------------ */ 336 /* ------------------------------------------------------------------------ */ 337 338 static void 339 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 340 { 341 kmp_info_t *th; 342 343 KMP_DEBUG_ASSERT( gtid_ref ); 344 345 if ( __kmp_env_consistency_check ) { 346 th = __kmp_threads[*gtid_ref]; 347 if ( th -> th.th_root -> r.r_active 348 && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) { 349 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL ); 350 } 351 } 352 } 353 354 template< typename UT > 355 static void 356 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 357 { 358 typedef typename traits_t< UT >::signed_t ST; 359 dispatch_private_info_template< UT > * pr; 360 361 int gtid = *gtid_ref; 362 // int cid = *cid_ref; 363 kmp_info_t *th = __kmp_threads[ gtid ]; 364 KMP_DEBUG_ASSERT( th -> th.th_dispatch ); 365 366 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) ); 367 if ( __kmp_env_consistency_check ) { 368 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 369 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 370 if ( pr -> pushed_ws != ct_none ) { 371 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL ); 372 } 373 } 374 375 if ( ! th -> th.th_team -> t.t_serialized ) { 376 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 377 ( th -> th.th_dispatch -> th_dispatch_sh_current ); 378 UT lower; 379 380 if ( ! __kmp_env_consistency_check ) { 381 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 382 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 383 } 384 lower = pr->u.p.ordered_lower; 385 386 #if ! defined( KMP_GOMP_COMPAT ) 387 if ( __kmp_env_consistency_check ) { 388 if ( pr->ordered_bumped ) { 389 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons; 390 __kmp_error_construct2( 391 kmp_i18n_msg_CnsMultipleNesting, 392 ct_ordered_in_pdo, loc_ref, 393 & p->stack_data[ p->w_top ] 394 ); 395 } 396 } 397 #endif /* !defined(KMP_GOMP_COMPAT) */ 398 399 KMP_MB(); 400 #ifdef KMP_DEBUG 401 { 402 const char * buff; 403 // create format specifiers before the debug output 404 buff = __kmp_str_format( 405 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n", 406 traits_t< UT >::spec, traits_t< UT >::spec ); 407 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 408 __kmp_str_free( &buff ); 409 } 410 #endif 411 412 __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 413 USE_ITT_BUILD_ARG( NULL ) 414 ); 415 KMP_MB(); /* is this necessary? */ 416 #ifdef KMP_DEBUG 417 { 418 const char * buff; 419 // create format specifiers before the debug output 420 buff = __kmp_str_format( 421 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n", 422 traits_t< UT >::spec, traits_t< UT >::spec ); 423 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 424 __kmp_str_free( &buff ); 425 } 426 #endif 427 } 428 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) ); 429 } 430 431 static void 432 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 433 { 434 kmp_info_t *th; 435 436 if ( __kmp_env_consistency_check ) { 437 th = __kmp_threads[*gtid_ref]; 438 if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) { 439 __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref ); 440 } 441 } 442 } 443 444 template< typename UT > 445 static void 446 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) 447 { 448 typedef typename traits_t< UT >::signed_t ST; 449 dispatch_private_info_template< UT > * pr; 450 451 int gtid = *gtid_ref; 452 // int cid = *cid_ref; 453 kmp_info_t *th = __kmp_threads[ gtid ]; 454 KMP_DEBUG_ASSERT( th -> th.th_dispatch ); 455 456 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) ); 457 if ( __kmp_env_consistency_check ) { 458 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 459 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 460 if ( pr -> pushed_ws != ct_none ) { 461 __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref ); 462 } 463 } 464 465 if ( ! th -> th.th_team -> t.t_serialized ) { 466 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 467 ( th -> th.th_dispatch -> th_dispatch_sh_current ); 468 469 if ( ! __kmp_env_consistency_check ) { 470 pr = reinterpret_cast< dispatch_private_info_template< UT >* > 471 ( th -> th.th_dispatch -> th_dispatch_pr_current ); 472 } 473 474 KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration ); 475 #if ! defined( KMP_GOMP_COMPAT ) 476 if ( __kmp_env_consistency_check ) { 477 if ( pr->ordered_bumped != 0 ) { 478 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons; 479 /* How to test it? - OM */ 480 __kmp_error_construct2( 481 kmp_i18n_msg_CnsMultipleNesting, 482 ct_ordered_in_pdo, loc_ref, 483 & p->stack_data[ p->w_top ] 484 ); 485 } 486 } 487 #endif /* !defined(KMP_GOMP_COMPAT) */ 488 489 KMP_MB(); /* Flush all pending memory write invalidates. */ 490 491 pr->ordered_bumped += 1; 492 493 KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n", 494 gtid, pr->ordered_bumped ) ); 495 496 KMP_MB(); /* Flush all pending memory write invalidates. */ 497 498 /* TODO use general release procedure? */ 499 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration ); 500 501 KMP_MB(); /* Flush all pending memory write invalidates. */ 502 } 503 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) ); 504 } 505 506 /* Computes and returns x to the power of y, where y must a non-negative integer */ 507 template< typename UT > 508 static __forceinline long double 509 __kmp_pow(long double x, UT y) { 510 long double s=1.0L; 511 512 KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0); 513 //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned 514 while(y) { 515 if ( y & 1 ) 516 s *= x; 517 x *= x; 518 y >>= 1; 519 } 520 return s; 521 } 522 523 /* Computes and returns the number of unassigned iterations after idx chunks have been assigned 524 (the total number of unassigned iterations in chunks with index greater than or equal to idx). 525 __forceinline seems to be broken so that if we __forceinline this function, the behavior is wrong 526 (one of the unit tests, sch_guided_analytical_basic.cpp, fails) 527 */ 528 template< typename T > 529 static __inline typename traits_t< T >::unsigned_t 530 __kmp_dispatch_guided_remaining( 531 T tc, 532 typename traits_t< T >::floating_t base, 533 typename traits_t< T >::unsigned_t idx 534 ) { 535 /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at 536 least for ICL 8.1, long double arithmetic may not really have 537 long double precision, even with /Qlong_double. Currently, we 538 workaround that in the caller code, by manipulating the FPCW for 539 Windows* OS on IA-32 architecture. The lack of precision is not 540 expected to be a correctness issue, though. 541 */ 542 typedef typename traits_t< T >::unsigned_t UT; 543 544 long double x = tc * __kmp_pow< UT >(base, idx); 545 UT r = (UT) x; 546 if ( x == r ) 547 return r; 548 return r + 1; 549 } 550 551 // Parameters of the guided-iterative algorithm: 552 // p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic 553 // p3 = 1 / ( n * nproc ) // remaining iterations multiplier 554 // by default n = 2. For example with n = 3 the chunks distribution will be more flat. 555 // With n = 1 first chunk is the same as for static schedule, e.g. trip / nproc. 556 static int guided_int_param = 2; 557 static double guided_flt_param = 0.5;// = 1.0 / guided_int_param; 558 559 // UT - unsigned flavor of T, ST - signed flavor of T, 560 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8 561 template< typename T > 562 static void 563 __kmp_dispatch_init( 564 ident_t * loc, 565 int gtid, 566 enum sched_type schedule, 567 T lb, 568 T ub, 569 typename traits_t< T >::signed_t st, 570 typename traits_t< T >::signed_t chunk, 571 int push_ws 572 ) { 573 typedef typename traits_t< T >::unsigned_t UT; 574 typedef typename traits_t< T >::signed_t ST; 575 typedef typename traits_t< T >::floating_t DBL; 576 static const int ___kmp_size_type = sizeof( UT ); 577 578 int active; 579 T tc; 580 kmp_info_t * th; 581 kmp_team_t * team; 582 kmp_uint32 my_buffer_index; 583 dispatch_private_info_template< T > * pr; 584 dispatch_shared_info_template< UT > volatile * sh; 585 586 KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) ); 587 KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) ); 588 589 if ( ! TCR_4( __kmp_init_parallel ) ) 590 __kmp_parallel_initialize(); 591 592 #ifdef KMP_DEBUG 593 { 594 const char * buff; 595 // create format specifiers before the debug output 596 buff = __kmp_str_format( 597 "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n", 598 traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 599 KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) ); 600 __kmp_str_free( &buff ); 601 } 602 #endif 603 /* setup data */ 604 th = __kmp_threads[ gtid ]; 605 team = th -> th.th_team; 606 active = ! team -> t.t_serialized; 607 th->th.th_ident = loc; 608 609 if ( ! active ) { 610 pr = reinterpret_cast< dispatch_private_info_template< T >* > 611 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */ 612 } else { 613 KMP_DEBUG_ASSERT( th->th.th_dispatch == 614 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 615 616 my_buffer_index = th->th.th_dispatch->th_disp_index ++; 617 618 /* What happens when number of threads changes, need to resize buffer? */ 619 pr = reinterpret_cast< dispatch_private_info_template< T > * > 620 ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] ); 621 sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * > 622 ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] ); 623 } 624 625 /* Pick up the nomerge/ordered bits from the scheduling type */ 626 if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) { 627 pr->nomerge = TRUE; 628 schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); 629 } else { 630 pr->nomerge = FALSE; 631 } 632 pr->type_size = ___kmp_size_type; // remember the size of variables 633 if ( kmp_ord_lower & schedule ) { 634 pr->ordered = TRUE; 635 schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); 636 } else { 637 pr->ordered = FALSE; 638 } 639 if ( schedule == kmp_sch_static ) { 640 schedule = __kmp_static; 641 } else { 642 if ( schedule == kmp_sch_runtime ) { 643 #if OMP_30_ENABLED 644 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified) 645 schedule = team -> t.t_sched.r_sched_type; 646 // Detail the schedule if needed (global controls are differentiated appropriately) 647 if ( schedule == kmp_sch_guided_chunked ) { 648 schedule = __kmp_guided; 649 } else if ( schedule == kmp_sch_static ) { 650 schedule = __kmp_static; 651 } 652 // Use the chunk size specified by OMP_SCHEDULE (or default if not specified) 653 chunk = team -> t.t_sched.chunk; 654 #else 655 kmp_r_sched_t r_sched = __kmp_get_schedule_global(); 656 // Use the scheduling specified by OMP_SCHEDULE and/or KMP_SCHEDULE or default 657 schedule = r_sched.r_sched_type; 658 chunk = r_sched.chunk; 659 #endif 660 661 #ifdef KMP_DEBUG 662 { 663 const char * buff; 664 // create format specifiers before the debug output 665 buff = __kmp_str_format( 666 "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n", 667 traits_t< ST >::spec ); 668 KD_TRACE(10, ( buff, gtid, schedule, chunk ) ); 669 __kmp_str_free( &buff ); 670 } 671 #endif 672 } else { 673 if ( schedule == kmp_sch_guided_chunked ) { 674 schedule = __kmp_guided; 675 } 676 if ( chunk <= 0 ) { 677 chunk = KMP_DEFAULT_CHUNK; 678 } 679 } 680 681 #if OMP_30_ENABLED 682 if ( schedule == kmp_sch_auto ) { 683 // mapping and differentiation: in the __kmp_do_serial_initialize() 684 schedule = __kmp_auto; 685 #ifdef KMP_DEBUG 686 { 687 const char * buff; 688 // create format specifiers before the debug output 689 buff = __kmp_str_format( 690 "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n", 691 traits_t< ST >::spec ); 692 KD_TRACE(10, ( buff, gtid, schedule, chunk ) ); 693 __kmp_str_free( &buff ); 694 } 695 #endif 696 } 697 #endif // OMP_30_ENABLED 698 699 /* guided analytical not safe for too many threads */ 700 if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) { 701 schedule = kmp_sch_guided_iterative_chunked; 702 KMP_WARNING( DispatchManyThreads ); 703 } 704 pr->u.p.parm1 = chunk; 705 } 706 KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper), 707 "unknown scheduling type" ); 708 709 pr->u.p.count = 0; 710 711 if ( __kmp_env_consistency_check ) { 712 if ( st == 0 ) { 713 __kmp_error_construct( 714 kmp_i18n_msg_CnsLoopIncrZeroProhibited, 715 ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc 716 ); 717 } 718 } 719 720 tc = ( ub - lb + st ); 721 if ( st != 1 ) { 722 if ( st < 0 ) { 723 if ( lb < ub ) { 724 tc = 0; // zero-trip 725 } else { // lb >= ub 726 tc = (ST)tc / st; // convert to signed division 727 } 728 } else { // st > 0 729 if ( ub < lb ) { 730 tc = 0; // zero-trip 731 } else { // lb >= ub 732 tc /= st; 733 } 734 } 735 } else if ( ub < lb ) { // st == 1 736 tc = 0; // zero-trip 737 } 738 739 pr->u.p.lb = lb; 740 pr->u.p.ub = ub; 741 pr->u.p.st = st; 742 pr->u.p.tc = tc; 743 744 #if KMP_OS_WINDOWS 745 pr->u.p.last_upper = ub + st; 746 #endif /* KMP_OS_WINDOWS */ 747 748 /* NOTE: only the active parallel region(s) has active ordered sections */ 749 750 if ( active ) { 751 if ( pr->ordered == 0 ) { 752 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error; 753 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error; 754 } else { 755 pr->ordered_bumped = 0; 756 757 pr->u.p.ordered_lower = 1; 758 pr->u.p.ordered_upper = 0; 759 760 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >; 761 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >; 762 } 763 } 764 765 if ( __kmp_env_consistency_check ) { 766 enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo; 767 if ( push_ws ) { 768 __kmp_push_workshare( gtid, ws, loc ); 769 pr->pushed_ws = ws; 770 } else { 771 __kmp_check_workshare( gtid, ws, loc ); 772 pr->pushed_ws = ct_none; 773 } 774 } 775 776 switch ( schedule ) { 777 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 778 case kmp_sch_static_steal: 779 { 780 T nproc = team->t.t_nproc; 781 T ntc, init; 782 783 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) ); 784 785 ntc = (tc % chunk ? 1 : 0) + tc / chunk; 786 if ( nproc > 1 && ntc >= nproc ) { 787 T id = __kmp_tid_from_gtid(gtid); 788 T small_chunk, extras; 789 790 small_chunk = ntc / nproc; 791 extras = ntc % nproc; 792 793 init = id * small_chunk + ( id < extras ? id : extras ); 794 pr->u.p.count = init; 795 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 ); 796 797 pr->u.p.parm2 = lb; 798 //pr->pfields.parm3 = 0; // it's not used in static_steal 799 pr->u.p.parm4 = id; 800 pr->u.p.st = st; 801 break; 802 } else { 803 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n", 804 gtid ) ); 805 schedule = kmp_sch_static_balanced; 806 /* too few iterations: fall-through to kmp_sch_static_balanced */ 807 } // if 808 /* FALL-THROUGH to static balanced */ 809 } // case 810 #endif 811 case kmp_sch_static_balanced: 812 { 813 T nproc = team->t.t_nproc; 814 T init, limit; 815 816 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n", 817 gtid ) ); 818 819 if ( nproc > 1 ) { 820 T id = __kmp_tid_from_gtid(gtid); 821 822 if ( tc < nproc ) { 823 if ( id < tc ) { 824 init = id; 825 limit = id; 826 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ 827 } else { 828 pr->u.p.count = 1; /* means no more chunks to execute */ 829 pr->u.p.parm1 = FALSE; 830 break; 831 } 832 } else { 833 T small_chunk = tc / nproc; 834 T extras = tc % nproc; 835 init = id * small_chunk + (id < extras ? id : extras); 836 limit = init + small_chunk - (id < extras ? 0 : 1); 837 pr->u.p.parm1 = (id == nproc - 1); 838 } 839 } else { 840 if ( tc > 0 ) { 841 init = 0; 842 limit = tc - 1; 843 pr->u.p.parm1 = TRUE; 844 } else { 845 // zero trip count 846 pr->u.p.count = 1; /* means no more chunks to execute */ 847 pr->u.p.parm1 = FALSE; 848 break; 849 } 850 } 851 if ( st == 1 ) { 852 pr->u.p.lb = lb + init; 853 pr->u.p.ub = lb + limit; 854 } else { 855 T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound 856 pr->u.p.lb = lb + init * st; 857 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly 858 if ( st > 0 ) { 859 pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp ); 860 } else { 861 pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp ); 862 } 863 } 864 if ( pr->ordered ) { 865 pr->u.p.ordered_lower = init; 866 pr->u.p.ordered_upper = limit; 867 } 868 break; 869 } // case 870 case kmp_sch_guided_iterative_chunked : 871 { 872 T nproc = team->t.t_nproc; 873 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid)); 874 875 if ( nproc > 1 ) { 876 if ( (2L * chunk + 1 ) * nproc >= tc ) { 877 /* chunk size too large, switch to dynamic */ 878 schedule = kmp_sch_dynamic_chunked; 879 } else { 880 // when remaining iters become less than parm2 - switch to dynamic 881 pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 ); 882 *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4 883 } 884 } else { 885 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid)); 886 schedule = kmp_sch_static_greedy; 887 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 888 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid)); 889 pr->u.p.parm1 = tc; 890 } // if 891 } // case 892 break; 893 case kmp_sch_guided_analytical_chunked: 894 { 895 T nproc = team->t.t_nproc; 896 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid)); 897 898 if ( nproc > 1 ) { 899 if ( (2L * chunk + 1 ) * nproc >= tc ) { 900 /* chunk size too large, switch to dynamic */ 901 schedule = kmp_sch_dynamic_chunked; 902 } else { 903 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 904 DBL x; 905 906 #if KMP_OS_WINDOWS && KMP_ARCH_X86 907 /* Linux* OS already has 64-bit computation by default for 908 long double, and on Windows* OS on Intel(R) 64, 909 /Qlong_double doesn't work. On Windows* OS 910 on IA-32 architecture, we need to set precision to 911 64-bit instead of the default 53-bit. Even though long 912 double doesn't work on Windows* OS on Intel(R) 64, the 913 resulting lack of precision is not expected to impact 914 the correctness of the algorithm, but this has not been 915 mathematically proven. 916 */ 917 // save original FPCW and set precision to 64-bit, as 918 // Windows* OS on IA-32 architecture defaults to 53-bit 919 unsigned int oldFpcw = _control87(0,0); 920 _control87(_PC_64,_MCW_PC); // 0,0x30000 921 #endif 922 /* value used for comparison in solver for cross-over point */ 923 long double target = ((long double)chunk * 2 + 1) * nproc / tc; 924 925 /* crossover point--chunk indexes equal to or greater than 926 this point switch to dynamic-style scheduling */ 927 UT cross; 928 929 /* commonly used term: (2 nproc - 1)/(2 nproc) */ 930 x = (long double)1.0 - (long double)0.5 / nproc; 931 932 #ifdef KMP_DEBUG 933 { // test natural alignment 934 struct _test_a { 935 char a; 936 union { 937 char b; 938 DBL d; 939 }; 940 } t; 941 ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; 942 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment ); 943 KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 ); 944 } 945 #endif // KMP_DEBUG 946 947 /* save the term in thread private dispatch structure */ 948 *(DBL*)&pr->u.p.parm3 = x; 949 950 /* solve for the crossover point to the nearest integer i for which C_i <= chunk */ 951 { 952 UT left, right, mid; 953 long double p; 954 955 /* estimate initial upper and lower bound */ 956 957 /* doesn't matter what value right is as long as it is positive, but 958 it affects performance of the solver 959 */ 960 right = 229; 961 p = __kmp_pow< UT >(x,right); 962 if ( p > target ) { 963 do{ 964 p *= p; 965 right <<= 1; 966 } while(p>target && right < (1<<27)); 967 left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */ 968 } else { 969 left = 0; 970 } 971 972 /* bisection root-finding method */ 973 while ( left + 1 < right ) { 974 mid = (left + right) / 2; 975 if ( __kmp_pow< UT >(x,mid) > target ) { 976 left = mid; 977 } else { 978 right = mid; 979 } 980 } // while 981 cross = right; 982 } 983 /* assert sanity of computed crossover point */ 984 KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target); 985 986 /* save the crossover point in thread private dispatch structure */ 987 pr->u.p.parm2 = cross; 988 989 // C75803 990 #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) ) 991 #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3) 992 #else 993 #define GUIDED_ANALYTICAL_WORKAROUND (x) 994 #endif 995 /* dynamic-style scheduling offset */ 996 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk; 997 #if KMP_OS_WINDOWS && KMP_ARCH_X86 998 // restore FPCW 999 _control87(oldFpcw,_MCW_PC); 1000 #endif 1001 } // if 1002 } else { 1003 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n", 1004 gtid ) ); 1005 schedule = kmp_sch_static_greedy; 1006 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ 1007 pr->u.p.parm1 = tc; 1008 } // if 1009 } // case 1010 break; 1011 case kmp_sch_static_greedy: 1012 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid)); 1013 pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ? 1014 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc : 1015 tc; 1016 break; 1017 case kmp_sch_static_chunked : 1018 case kmp_sch_dynamic_chunked : 1019 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid)); 1020 break; 1021 case kmp_sch_trapezoidal : 1022 { 1023 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ 1024 1025 T parm1, parm2, parm3, parm4; 1026 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) ); 1027 1028 parm1 = chunk; 1029 1030 /* F : size of the first cycle */ 1031 parm2 = ( tc / (2 * team->t.t_nproc) ); 1032 1033 if ( parm2 < 1 ) { 1034 parm2 = 1; 1035 } 1036 1037 /* L : size of the last cycle. Make sure the last cycle 1038 * is not larger than the first cycle. 1039 */ 1040 if ( parm1 < 1 ) { 1041 parm1 = 1; 1042 } else if ( parm1 > parm2 ) { 1043 parm1 = parm2; 1044 } 1045 1046 /* N : number of cycles */ 1047 parm3 = ( parm2 + parm1 ); 1048 parm3 = ( 2 * tc + parm3 - 1) / parm3; 1049 1050 if ( parm3 < 2 ) { 1051 parm3 = 2; 1052 } 1053 1054 /* sigma : decreasing incr of the trapezoid */ 1055 parm4 = ( parm3 - 1 ); 1056 parm4 = ( parm2 - parm1 ) / parm4; 1057 1058 // pointless check, because parm4 >= 0 always 1059 //if ( parm4 < 0 ) { 1060 // parm4 = 0; 1061 //} 1062 1063 pr->u.p.parm1 = parm1; 1064 pr->u.p.parm2 = parm2; 1065 pr->u.p.parm3 = parm3; 1066 pr->u.p.parm4 = parm4; 1067 } // case 1068 break; 1069 1070 default: 1071 { 1072 __kmp_msg( 1073 kmp_ms_fatal, // Severity 1074 KMP_MSG( UnknownSchedTypeDetected ), // Primary message 1075 KMP_HNT( GetNewerLibrary ), // Hint 1076 __kmp_msg_null // Variadic argument list terminator 1077 ); 1078 } 1079 break; 1080 } // switch 1081 pr->schedule = schedule; 1082 if ( active ) { 1083 /* The name of this buffer should be my_buffer_index when it's free to use it */ 1084 1085 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n", 1086 gtid, my_buffer_index, sh->buffer_index) ); 1087 __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 > 1088 USE_ITT_BUILD_ARG( NULL ) 1089 ); 1090 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are 1091 // *always* 32-bit integers. 1092 KMP_MB(); /* is this necessary? */ 1093 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n", 1094 gtid, my_buffer_index, sh->buffer_index) ); 1095 1096 th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr; 1097 th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh; 1098 #if USE_ITT_BUILD 1099 if ( pr->ordered ) { 1100 __kmp_itt_ordered_init( gtid ); 1101 }; // if 1102 #endif /* USE_ITT_BUILD */ 1103 }; // if 1104 #ifdef KMP_DEBUG 1105 { 1106 const char * buff; 1107 // create format specifiers before the debug output 1108 buff = __kmp_str_format( 1109 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \ 1110 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \ 1111 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n", 1112 traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec, 1113 traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec, 1114 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec, 1115 traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec ); 1116 KD_TRACE(10, ( buff, 1117 gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub, 1118 pr->u.p.st, pr->u.p.tc, pr->u.p.count, 1119 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1, 1120 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) ); 1121 __kmp_str_free( &buff ); 1122 } 1123 #endif 1124 #if ( KMP_STATIC_STEAL_ENABLED ) 1125 if ( ___kmp_size_type < 8 ) { 1126 // It cannot be guaranteed that after execution of a loop with some other schedule kind 1127 // all the parm3 variables will contain the same value. 1128 // Even if all parm3 will be the same, it still exists a bad case like using 0 and 1 1129 // rather than program life-time increment. 1130 // So the dedicated variable is required. The 'static_steal_counter' is used. 1131 if( schedule == kmp_sch_static_steal ) { 1132 // Other threads will inspect this variable when searching for a victim. 1133 // This is a flag showing that other threads may steal from this thread since then. 1134 volatile T * p = &pr->u.p.static_steal_counter; 1135 *p = *p + 1; 1136 } 1137 } 1138 #endif // ( KMP_STATIC_STEAL_ENABLED && USE_STEALING ) 1139 } 1140 1141 /* 1142 * For ordered loops, either __kmp_dispatch_finish() should be called after 1143 * every iteration, or __kmp_dispatch_finish_chunk() should be called after 1144 * every chunk of iterations. If the ordered section(s) were not executed 1145 * for this iteration (or every iteration in this chunk), we need to set the 1146 * ordered iteration counters so that the next thread can proceed. 1147 */ 1148 template< typename UT > 1149 static void 1150 __kmp_dispatch_finish( int gtid, ident_t *loc ) 1151 { 1152 typedef typename traits_t< UT >::signed_t ST; 1153 kmp_info_t *th = __kmp_threads[ gtid ]; 1154 1155 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) ); 1156 if ( ! th -> th.th_team -> t.t_serialized ) { 1157 1158 dispatch_private_info_template< UT > * pr = 1159 reinterpret_cast< dispatch_private_info_template< UT >* > 1160 ( th->th.th_dispatch->th_dispatch_pr_current ); 1161 dispatch_shared_info_template< UT > volatile * sh = 1162 reinterpret_cast< dispatch_shared_info_template< UT >volatile* > 1163 ( th->th.th_dispatch->th_dispatch_sh_current ); 1164 KMP_DEBUG_ASSERT( pr ); 1165 KMP_DEBUG_ASSERT( sh ); 1166 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1167 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1168 1169 if ( pr->ordered_bumped ) { 1170 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1171 gtid ) ); 1172 pr->ordered_bumped = 0; 1173 } else { 1174 UT lower = pr->u.p.ordered_lower; 1175 1176 #ifdef KMP_DEBUG 1177 { 1178 const char * buff; 1179 // create format specifiers before the debug output 1180 buff = __kmp_str_format( 1181 "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n", 1182 traits_t< UT >::spec, traits_t< UT >::spec ); 1183 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 1184 __kmp_str_free( &buff ); 1185 } 1186 #endif 1187 1188 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 1189 USE_ITT_BUILD_ARG(NULL) 1190 ); 1191 KMP_MB(); /* is this necessary? */ 1192 #ifdef KMP_DEBUG 1193 { 1194 const char * buff; 1195 // create format specifiers before the debug output 1196 buff = __kmp_str_format( 1197 "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n", 1198 traits_t< UT >::spec, traits_t< UT >::spec ); 1199 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) ); 1200 __kmp_str_free( &buff ); 1201 } 1202 #endif 1203 1204 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration ); 1205 } // if 1206 } // if 1207 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) ); 1208 } 1209 1210 #ifdef KMP_GOMP_COMPAT 1211 1212 template< typename UT > 1213 static void 1214 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc ) 1215 { 1216 typedef typename traits_t< UT >::signed_t ST; 1217 kmp_info_t *th = __kmp_threads[ gtid ]; 1218 1219 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) ); 1220 if ( ! th -> th.th_team -> t.t_serialized ) { 1221 // int cid; 1222 dispatch_private_info_template< UT > * pr = 1223 reinterpret_cast< dispatch_private_info_template< UT >* > 1224 ( th->th.th_dispatch->th_dispatch_pr_current ); 1225 dispatch_shared_info_template< UT > volatile * sh = 1226 reinterpret_cast< dispatch_shared_info_template< UT >volatile* > 1227 ( th->th.th_dispatch->th_dispatch_sh_current ); 1228 KMP_DEBUG_ASSERT( pr ); 1229 KMP_DEBUG_ASSERT( sh ); 1230 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1231 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1232 1233 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) { 1234 UT lower = pr->u.p.ordered_lower; 1235 UT upper = pr->u.p.ordered_upper; 1236 UT inc = upper - lower + 1; 1237 1238 if ( pr->ordered_bumped == inc ) { 1239 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", 1240 gtid ) ); 1241 pr->ordered_bumped = 0; 1242 } else { 1243 inc -= pr->ordered_bumped; 1244 1245 #ifdef KMP_DEBUG 1246 { 1247 const char * buff; 1248 // create format specifiers before the debug output 1249 buff = __kmp_str_format( 1250 "__kmp_dispatch_finish_chunk: T#%%d before wait: " \ 1251 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n", 1252 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec ); 1253 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) ); 1254 __kmp_str_free( &buff ); 1255 } 1256 #endif 1257 1258 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT > 1259 USE_ITT_BUILD_ARG(NULL) 1260 ); 1261 1262 KMP_MB(); /* is this necessary? */ 1263 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n", 1264 gtid ) ); 1265 pr->ordered_bumped = 0; 1266 //!!!!! TODO check if the inc should be unsigned, or signed??? 1267 #ifdef KMP_DEBUG 1268 { 1269 const char * buff; 1270 // create format specifiers before the debug output 1271 buff = __kmp_str_format( 1272 "__kmp_dispatch_finish_chunk: T#%%d after wait: " \ 1273 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n", 1274 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec ); 1275 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) ); 1276 __kmp_str_free( &buff ); 1277 } 1278 #endif 1279 1280 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc); 1281 } 1282 // } 1283 } 1284 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) ); 1285 } 1286 1287 #endif /* KMP_GOMP_COMPAT */ 1288 1289 template< typename T > 1290 static int 1291 __kmp_dispatch_next( 1292 ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st 1293 ) { 1294 1295 typedef typename traits_t< T >::unsigned_t UT; 1296 typedef typename traits_t< T >::signed_t ST; 1297 typedef typename traits_t< T >::floating_t DBL; 1298 static const int ___kmp_size_type = sizeof( UT ); 1299 1300 int status; 1301 dispatch_private_info_template< T > * pr; 1302 kmp_info_t * th = __kmp_threads[ gtid ]; 1303 kmp_team_t * team = th -> th.th_team; 1304 1305 #ifdef KMP_DEBUG 1306 { 1307 const char * buff; 1308 // create format specifiers before the debug output 1309 buff = __kmp_str_format( 1310 "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n", 1311 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 1312 KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) ); 1313 __kmp_str_free( &buff ); 1314 } 1315 #endif 1316 1317 if ( team -> t.t_serialized ) { 1318 /* NOTE: serialize this dispatch becase we are not at the active level */ 1319 pr = reinterpret_cast< dispatch_private_info_template< T >* > 1320 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */ 1321 KMP_DEBUG_ASSERT( pr ); 1322 1323 if ( (status = (pr->u.p.tc != 0)) == 0 ) { 1324 *p_lb = 0; 1325 *p_ub = 0; 1326 if ( p_st != 0 ) { 1327 *p_st = 0; 1328 } 1329 if ( __kmp_env_consistency_check ) { 1330 if ( pr->pushed_ws != ct_none ) { 1331 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 1332 } 1333 } 1334 } else if ( pr->nomerge ) { 1335 kmp_int32 last; 1336 T start; 1337 UT limit, trip, init; 1338 ST incr; 1339 T chunk = pr->u.p.parm1; 1340 1341 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) ); 1342 1343 init = chunk * pr->u.p.count++; 1344 trip = pr->u.p.tc - 1; 1345 1346 if ( (status = (init <= trip)) == 0 ) { 1347 *p_lb = 0; 1348 *p_ub = 0; 1349 if ( p_st != 0 ) *p_st = 0; 1350 if ( __kmp_env_consistency_check ) { 1351 if ( pr->pushed_ws != ct_none ) { 1352 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 1353 } 1354 } 1355 } else { 1356 start = pr->u.p.lb; 1357 limit = chunk + init - 1; 1358 incr = pr->u.p.st; 1359 1360 if ( (last = (limit >= trip)) != 0 ) { 1361 limit = trip; 1362 #if KMP_OS_WINDOWS 1363 pr->u.p.last_upper = pr->u.p.ub; 1364 #endif /* KMP_OS_WINDOWS */ 1365 } 1366 if ( p_last ) { 1367 *p_last = last; 1368 } 1369 if ( p_st != 0 ) { 1370 *p_st = incr; 1371 } 1372 if ( incr == 1 ) { 1373 *p_lb = start + init; 1374 *p_ub = start + limit; 1375 } else { 1376 *p_lb = start + init * incr; 1377 *p_ub = start + limit * incr; 1378 } 1379 1380 if ( pr->ordered ) { 1381 pr->u.p.ordered_lower = init; 1382 pr->u.p.ordered_upper = limit; 1383 #ifdef KMP_DEBUG 1384 { 1385 const char * buff; 1386 // create format specifiers before the debug output 1387 buff = __kmp_str_format( 1388 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1389 traits_t< UT >::spec, traits_t< UT >::spec ); 1390 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1391 __kmp_str_free( &buff ); 1392 } 1393 #endif 1394 } // if 1395 } // if 1396 } else { 1397 pr->u.p.tc = 0; 1398 1399 *p_lb = pr->u.p.lb; 1400 *p_ub = pr->u.p.ub; 1401 #if KMP_OS_WINDOWS 1402 pr->u.p.last_upper = *p_ub; 1403 #endif /* KMP_OS_WINDOWS */ 1404 1405 if ( p_st != 0 ) { 1406 *p_st = pr->u.p.st; 1407 } 1408 if ( p_last ) { 1409 *p_last = TRUE; 1410 } 1411 } // if 1412 #ifdef KMP_DEBUG 1413 { 1414 const char * buff; 1415 // create format specifiers before the debug output 1416 buff = __kmp_str_format( 1417 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \ 1418 "p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n", 1419 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 1420 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, status) ); 1421 __kmp_str_free( &buff ); 1422 } 1423 #endif 1424 return status; 1425 } else { 1426 kmp_int32 last = 0; 1427 dispatch_shared_info_template< UT > *sh; 1428 T start; 1429 ST incr; 1430 UT limit, trip, init; 1431 1432 KMP_DEBUG_ASSERT( th->th.th_dispatch == 1433 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] ); 1434 1435 pr = reinterpret_cast< dispatch_private_info_template< T >* > 1436 ( th->th.th_dispatch->th_dispatch_pr_current ); 1437 KMP_DEBUG_ASSERT( pr ); 1438 sh = reinterpret_cast< dispatch_shared_info_template< UT >* > 1439 ( th->th.th_dispatch->th_dispatch_sh_current ); 1440 KMP_DEBUG_ASSERT( sh ); 1441 1442 if ( pr->u.p.tc == 0 ) { 1443 // zero trip count 1444 status = 0; 1445 } else { 1446 switch (pr->schedule) { 1447 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 1448 case kmp_sch_static_steal: 1449 { 1450 T chunk = pr->u.p.parm1; 1451 1452 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) ); 1453 1454 trip = pr->u.p.tc - 1; 1455 1456 if ( ___kmp_size_type > 4 ) { 1457 // Other threads do not look into the data of this thread, 1458 // so it's not necessary to make volatile casting. 1459 init = ( pr->u.p.count )++; 1460 status = ( init < (UT)pr->u.p.ub ); 1461 } else { 1462 typedef union { 1463 struct { 1464 UT count; 1465 T ub; 1466 } p; 1467 kmp_int64 b; 1468 } union_i4; 1469 // All operations on 'count' or 'ub' must be combined atomically together. 1470 // stealing implemented only for 4-byte indexes 1471 { 1472 union_i4 vold, vnew; 1473 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count); 1474 vnew = vold; 1475 vnew.p.count++; 1476 while( ! KMP_COMPARE_AND_STORE_ACQ64( 1477 ( volatile kmp_int64* )&pr->u.p.count, 1478 *VOLATILE_CAST(kmp_int64 *)&vold.b, 1479 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) { 1480 KMP_CPU_PAUSE(); 1481 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count); 1482 vnew = vold; 1483 vnew.p.count++; 1484 } 1485 vnew = vold; 1486 init = vnew.p.count; 1487 status = ( init < (UT)vnew.p.ub ) ; 1488 } 1489 1490 if( !status ) { 1491 kmp_info_t **other_threads = team->t.t_threads; 1492 int while_limit = 10; 1493 int while_index = 0; 1494 1495 // TODO: algorithm of searching for a victim 1496 // should be cleaned up and measured 1497 while ( ( !status ) && ( while_limit != ++while_index ) ) { 1498 union_i4 vold, vnew; 1499 kmp_int32 remaining; // kmp_int32 because KMP_I4 only 1500 T victimIdx = pr->u.p.parm4; 1501 T oldVictimIdx = victimIdx; 1502 dispatch_private_info_template< T > * victim; 1503 1504 do { 1505 if( !victimIdx ) { 1506 victimIdx = team->t.t_nproc - 1; 1507 } else { 1508 --victimIdx; 1509 } 1510 victim = reinterpret_cast< dispatch_private_info_template< T >* > 1511 ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current ); 1512 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx ); 1513 // TODO: think about a proper place of this test 1514 if ( ( !victim ) || 1515 ( (*( volatile T * )&victim->u.p.static_steal_counter) != 1516 (*( volatile T * )&pr->u.p.static_steal_counter) ) ) { 1517 // TODO: delay would be nice 1518 continue; 1519 // the victim is not ready yet to participate in stealing 1520 // because the victim is still in kmp_init_dispatch 1521 } 1522 if ( oldVictimIdx == victimIdx ) { 1523 break; 1524 } 1525 pr->u.p.parm4 = victimIdx; 1526 1527 while( 1 ) { 1528 vold.b = *( volatile kmp_int64 * )( &victim->u.p.count ); 1529 vnew = vold; 1530 1531 KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip ); 1532 if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) { 1533 break; 1534 } 1535 vnew.p.ub -= (remaining >> 2); 1536 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); 1537 #pragma warning( push ) 1538 // disable warning on pointless comparison of unsigned with 0 1539 #pragma warning( disable: 186 ) 1540 KMP_DEBUG_ASSERT(vnew.p.ub >= 0); 1541 #pragma warning( pop ) 1542 // TODO: Should this be acquire or release? 1543 if ( KMP_COMPARE_AND_STORE_ACQ64( 1544 ( volatile kmp_int64 * )&victim->u.p.count, 1545 *VOLATILE_CAST(kmp_int64 *)&vold.b, 1546 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) { 1547 status = 1; 1548 while_index = 0; 1549 // now update own count and ub 1550 #if KMP_ARCH_X86 1551 // stealing executed on non-KMP_ARCH_X86 only 1552 // Atomic 64-bit write on ia32 is 1553 // unavailable, so we do this in steps. 1554 // This code is not tested. 1555 init = vold.p.count; 1556 pr->u.p.ub = 0; 1557 pr->u.p.count = init + 1; 1558 pr->u.p.ub = vnew.p.count; 1559 #else 1560 init = vnew.p.ub; 1561 vold.p.count = init + 1; 1562 // TODO: is it safe and enough? 1563 *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b; 1564 #endif // KMP_ARCH_X86 1565 break; 1566 } // if 1567 KMP_CPU_PAUSE(); 1568 } // while (1) 1569 } // while 1570 } // if 1571 } // if 1572 if ( !status ) { 1573 *p_lb = 0; 1574 *p_ub = 0; 1575 if ( p_st != 0 ) *p_st = 0; 1576 } else { 1577 start = pr->u.p.parm2; 1578 init *= chunk; 1579 limit = chunk + init - 1; 1580 incr = pr->u.p.st; 1581 1582 KMP_DEBUG_ASSERT(init <= trip); 1583 if ( (last = (limit >= trip)) != 0 ) 1584 limit = trip; 1585 if ( p_last ) { 1586 *p_last = last; 1587 } 1588 if ( p_st != 0 ) *p_st = incr; 1589 1590 if ( incr == 1 ) { 1591 *p_lb = start + init; 1592 *p_ub = start + limit; 1593 } else { 1594 *p_lb = start + init * incr; 1595 *p_ub = start + limit * incr; 1596 } 1597 1598 if ( pr->ordered ) { 1599 pr->u.p.ordered_lower = init; 1600 pr->u.p.ordered_upper = limit; 1601 #ifdef KMP_DEBUG 1602 { 1603 const char * buff; 1604 // create format specifiers before the debug output 1605 buff = __kmp_str_format( 1606 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1607 traits_t< UT >::spec, traits_t< UT >::spec ); 1608 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1609 __kmp_str_free( &buff ); 1610 } 1611 #endif 1612 } // if 1613 } // if 1614 break; 1615 } // case 1616 #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 ) 1617 case kmp_sch_static_balanced: 1618 { 1619 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) ); 1620 if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */ 1621 pr->u.p.count = 1; 1622 *p_lb = pr->u.p.lb; 1623 *p_ub = pr->u.p.ub; 1624 last = pr->u.p.parm1; 1625 if ( p_last ) { 1626 *p_last = last; 1627 } 1628 if ( p_st ) 1629 *p_st = pr->u.p.st; 1630 } else { /* no iterations to do */ 1631 pr->u.p.lb = pr->u.p.ub + pr->u.p.st; 1632 } 1633 if ( pr->ordered ) { 1634 #ifdef KMP_DEBUG 1635 { 1636 const char * buff; 1637 // create format specifiers before the debug output 1638 buff = __kmp_str_format( 1639 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1640 traits_t< UT >::spec, traits_t< UT >::spec ); 1641 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1642 __kmp_str_free( &buff ); 1643 } 1644 #endif 1645 } // if 1646 } // case 1647 break; 1648 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */ 1649 case kmp_sch_static_chunked: 1650 { 1651 T parm1; 1652 1653 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n", 1654 gtid ) ); 1655 parm1 = pr->u.p.parm1; 1656 1657 trip = pr->u.p.tc - 1; 1658 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid)); 1659 1660 if ( (status = (init <= trip)) != 0 ) { 1661 start = pr->u.p.lb; 1662 incr = pr->u.p.st; 1663 limit = parm1 + init - 1; 1664 1665 if ( (last = (limit >= trip)) != 0 ) 1666 limit = trip; 1667 1668 if ( p_last ) { 1669 *p_last = last; 1670 } 1671 if ( p_st != 0 ) *p_st = incr; 1672 1673 pr->u.p.count += team->t.t_nproc; 1674 1675 if ( incr == 1 ) { 1676 *p_lb = start + init; 1677 *p_ub = start + limit; 1678 } 1679 else { 1680 *p_lb = start + init * incr; 1681 *p_ub = start + limit * incr; 1682 } 1683 1684 if ( pr->ordered ) { 1685 pr->u.p.ordered_lower = init; 1686 pr->u.p.ordered_upper = limit; 1687 #ifdef KMP_DEBUG 1688 { 1689 const char * buff; 1690 // create format specifiers before the debug output 1691 buff = __kmp_str_format( 1692 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1693 traits_t< UT >::spec, traits_t< UT >::spec ); 1694 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1695 __kmp_str_free( &buff ); 1696 } 1697 #endif 1698 } // if 1699 } // if 1700 } // case 1701 break; 1702 1703 case kmp_sch_dynamic_chunked: 1704 { 1705 T chunk = pr->u.p.parm1; 1706 1707 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", 1708 gtid ) ); 1709 1710 init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration ); 1711 trip = pr->u.p.tc - 1; 1712 1713 if ( (status = (init <= trip)) == 0 ) { 1714 *p_lb = 0; 1715 *p_ub = 0; 1716 if ( p_st != 0 ) *p_st = 0; 1717 } else { 1718 start = pr->u.p.lb; 1719 limit = chunk + init - 1; 1720 incr = pr->u.p.st; 1721 1722 if ( (last = (limit >= trip)) != 0 ) 1723 limit = trip; 1724 if ( p_last ) { 1725 *p_last = last; 1726 } 1727 if ( p_st != 0 ) *p_st = incr; 1728 1729 if ( incr == 1 ) { 1730 *p_lb = start + init; 1731 *p_ub = start + limit; 1732 } else { 1733 *p_lb = start + init * incr; 1734 *p_ub = start + limit * incr; 1735 } 1736 1737 if ( pr->ordered ) { 1738 pr->u.p.ordered_lower = init; 1739 pr->u.p.ordered_upper = limit; 1740 #ifdef KMP_DEBUG 1741 { 1742 const char * buff; 1743 // create format specifiers before the debug output 1744 buff = __kmp_str_format( 1745 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1746 traits_t< UT >::spec, traits_t< UT >::spec ); 1747 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1748 __kmp_str_free( &buff ); 1749 } 1750 #endif 1751 } // if 1752 } // if 1753 } // case 1754 break; 1755 1756 case kmp_sch_guided_iterative_chunked: 1757 { 1758 T chunkspec = pr->u.p.parm1; 1759 KD_TRACE(100, 1760 ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid)); 1761 trip = pr->u.p.tc; 1762 // Start atomic part of calculations 1763 while(1) { 1764 ST remaining; // signed, because can be < 0 1765 init = sh->u.s.iteration; // shared value 1766 remaining = trip - init; 1767 if ( remaining <= 0 ) { // AC: need to compare with 0 first 1768 // nothing to do, don't try atomic op 1769 status = 0; 1770 break; 1771 } 1772 if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default 1773 // use dynamic-style shcedule 1774 // atomically inrement iterations, get old value 1775 init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec ); 1776 remaining = trip - init; 1777 if (remaining <= 0) { 1778 status = 0; // all iterations got by other threads 1779 } else { 1780 // got some iterations to work on 1781 status = 1; 1782 if ( (T)remaining > chunkspec ) { 1783 limit = init + chunkspec - 1; 1784 } else { 1785 last = 1; // the last chunk 1786 limit = init + remaining - 1; 1787 } // if 1788 } // if 1789 break; 1790 } // if 1791 limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc 1792 if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) { 1793 // CAS was successful, chunk obtained 1794 status = 1; 1795 --limit; 1796 break; 1797 } // if 1798 } // while 1799 if ( status != 0 ) { 1800 start = pr->u.p.lb; 1801 incr = pr->u.p.st; 1802 if ( p_st != NULL ) 1803 *p_st = incr; 1804 if ( p_last != NULL ) 1805 *p_last = last; 1806 *p_lb = start + init * incr; 1807 *p_ub = start + limit * incr; 1808 if ( pr->ordered ) { 1809 pr->u.p.ordered_lower = init; 1810 pr->u.p.ordered_upper = limit; 1811 #ifdef KMP_DEBUG 1812 { 1813 const char * buff; 1814 // create format specifiers before the debug output 1815 buff = __kmp_str_format( 1816 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1817 traits_t< UT >::spec, traits_t< UT >::spec ); 1818 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1819 __kmp_str_free( &buff ); 1820 } 1821 #endif 1822 } // if 1823 } else { 1824 *p_lb = 0; 1825 *p_ub = 0; 1826 if ( p_st != NULL ) 1827 *p_st = 0; 1828 } // if 1829 } // case 1830 break; 1831 1832 case kmp_sch_guided_analytical_chunked: 1833 { 1834 T chunkspec = pr->u.p.parm1; 1835 UT chunkIdx; 1836 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1837 /* for storing original FPCW value for Windows* OS on 1838 IA-32 architecture 8-byte version */ 1839 unsigned int oldFpcw; 1840 unsigned int fpcwSet = 0; 1841 #endif 1842 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n", 1843 gtid ) ); 1844 1845 trip = pr->u.p.tc; 1846 1847 KMP_DEBUG_ASSERT(team->t.t_nproc > 1); 1848 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip); 1849 1850 while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */ 1851 chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration ); 1852 if ( chunkIdx >= (UT)pr->u.p.parm2 ) { 1853 --trip; 1854 /* use dynamic-style scheduling */ 1855 init = chunkIdx * chunkspec + pr->u.p.count; 1856 /* need to verify init > 0 in case of overflow in the above calculation */ 1857 if ( (status = (init > 0 && init <= trip)) != 0 ) { 1858 limit = init + chunkspec -1; 1859 1860 if ( (last = (limit >= trip)) != 0 ) 1861 limit = trip; 1862 } 1863 break; 1864 } else { 1865 /* use exponential-style scheduling */ 1866 /* The following check is to workaround the lack of long double precision on Windows* OS. 1867 This check works around the possible effect that init != 0 for chunkIdx == 0. 1868 */ 1869 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1870 /* If we haven't already done so, save original 1871 FPCW and set precision to 64-bit, as Windows* OS 1872 on IA-32 architecture defaults to 53-bit */ 1873 if ( !fpcwSet ) { 1874 oldFpcw = _control87(0,0); 1875 _control87(_PC_64,_MCW_PC); 1876 fpcwSet = 0x30000; 1877 } 1878 #endif 1879 if ( chunkIdx ) { 1880 init = __kmp_dispatch_guided_remaining< T >( 1881 trip, *( DBL * )&pr->u.p.parm3, chunkIdx ); 1882 KMP_DEBUG_ASSERT(init); 1883 init = trip - init; 1884 } else 1885 init = 0; 1886 limit = trip - __kmp_dispatch_guided_remaining< T >( 1887 trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 ); 1888 KMP_ASSERT(init <= limit); 1889 if ( init < limit ) { 1890 KMP_DEBUG_ASSERT(limit <= trip); 1891 --limit; 1892 status = 1; 1893 break; 1894 } // if 1895 } // if 1896 } // while (1) 1897 #if KMP_OS_WINDOWS && KMP_ARCH_X86 1898 /* restore FPCW if necessary 1899 AC: check fpcwSet flag first because oldFpcw can be uninitialized here 1900 */ 1901 if ( fpcwSet && ( oldFpcw & fpcwSet ) ) 1902 _control87(oldFpcw,_MCW_PC); 1903 #endif 1904 if ( status != 0 ) { 1905 start = pr->u.p.lb; 1906 incr = pr->u.p.st; 1907 if ( p_st != NULL ) 1908 *p_st = incr; 1909 if ( p_last != NULL ) 1910 *p_last = last; 1911 *p_lb = start + init * incr; 1912 *p_ub = start + limit * incr; 1913 if ( pr->ordered ) { 1914 pr->u.p.ordered_lower = init; 1915 pr->u.p.ordered_upper = limit; 1916 #ifdef KMP_DEBUG 1917 { 1918 const char * buff; 1919 // create format specifiers before the debug output 1920 buff = __kmp_str_format( 1921 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1922 traits_t< UT >::spec, traits_t< UT >::spec ); 1923 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1924 __kmp_str_free( &buff ); 1925 } 1926 #endif 1927 } 1928 } else { 1929 *p_lb = 0; 1930 *p_ub = 0; 1931 if ( p_st != NULL ) 1932 *p_st = 0; 1933 } 1934 } // case 1935 break; 1936 1937 case kmp_sch_trapezoidal: 1938 { 1939 UT index; 1940 T parm2 = pr->u.p.parm2; 1941 T parm3 = pr->u.p.parm3; 1942 T parm4 = pr->u.p.parm4; 1943 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n", 1944 gtid ) ); 1945 1946 index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration ); 1947 1948 init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2; 1949 trip = pr->u.p.tc - 1; 1950 1951 if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) { 1952 *p_lb = 0; 1953 *p_ub = 0; 1954 if ( p_st != 0 ) *p_st = 0; 1955 } else { 1956 start = pr->u.p.lb; 1957 limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1; 1958 incr = pr->u.p.st; 1959 1960 if ( (last = (limit >= trip)) != 0 ) 1961 limit = trip; 1962 1963 if ( p_last != 0 ) { 1964 *p_last = last; 1965 } 1966 if ( p_st != 0 ) *p_st = incr; 1967 1968 if ( incr == 1 ) { 1969 *p_lb = start + init; 1970 *p_ub = start + limit; 1971 } else { 1972 *p_lb = start + init * incr; 1973 *p_ub = start + limit * incr; 1974 } 1975 1976 if ( pr->ordered ) { 1977 pr->u.p.ordered_lower = init; 1978 pr->u.p.ordered_upper = limit; 1979 #ifdef KMP_DEBUG 1980 { 1981 const char * buff; 1982 // create format specifiers before the debug output 1983 buff = __kmp_str_format( 1984 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n", 1985 traits_t< UT >::spec, traits_t< UT >::spec ); 1986 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) ); 1987 __kmp_str_free( &buff ); 1988 } 1989 #endif 1990 } // if 1991 } // if 1992 } // case 1993 break; 1994 } // switch 1995 } // if tc == 0; 1996 1997 if ( status == 0 ) { 1998 UT num_done; 1999 2000 num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done ); 2001 #ifdef KMP_DEBUG 2002 { 2003 const char * buff; 2004 // create format specifiers before the debug output 2005 buff = __kmp_str_format( 2006 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", 2007 traits_t< UT >::spec ); 2008 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) ); 2009 __kmp_str_free( &buff ); 2010 } 2011 #endif 2012 2013 if ( num_done == team->t.t_nproc-1 ) { 2014 /* NOTE: release this buffer to be reused */ 2015 2016 KMP_MB(); /* Flush all pending memory write invalidates. */ 2017 2018 sh->u.s.num_done = 0; 2019 sh->u.s.iteration = 0; 2020 2021 /* TODO replace with general release procedure? */ 2022 if ( pr->ordered ) { 2023 sh->u.s.ordered_iteration = 0; 2024 } 2025 2026 KMP_MB(); /* Flush all pending memory write invalidates. */ 2027 2028 sh -> buffer_index += KMP_MAX_DISP_BUF; 2029 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", 2030 gtid, sh->buffer_index) ); 2031 2032 KMP_MB(); /* Flush all pending memory write invalidates. */ 2033 2034 } // if 2035 if ( __kmp_env_consistency_check ) { 2036 if ( pr->pushed_ws != ct_none ) { 2037 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); 2038 } 2039 } 2040 2041 th -> th.th_dispatch -> th_deo_fcn = NULL; 2042 th -> th.th_dispatch -> th_dxo_fcn = NULL; 2043 th -> th.th_dispatch -> th_dispatch_sh_current = NULL; 2044 th -> th.th_dispatch -> th_dispatch_pr_current = NULL; 2045 } // if (status == 0) 2046 #if KMP_OS_WINDOWS 2047 else if ( last ) { 2048 pr->u.p.last_upper = pr->u.p.ub; 2049 } 2050 #endif /* KMP_OS_WINDOWS */ 2051 } // if 2052 2053 #ifdef KMP_DEBUG 2054 { 2055 const char * buff; 2056 // create format specifiers before the debug output 2057 buff = __kmp_str_format( 2058 "__kmp_dispatch_next: T#%%d normal case: " \ 2059 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n", 2060 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); 2061 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) ); 2062 __kmp_str_free( &buff ); 2063 } 2064 #endif 2065 return status; 2066 } 2067 2068 //----------------------------------------------------------------------------------------- 2069 // Dispatch routines 2070 // Transfer call to template< type T > 2071 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule, 2072 // T lb, T ub, ST st, ST chunk ) 2073 extern "C" { 2074 2075 /*! 2076 @ingroup WORK_SHARING 2077 @{ 2078 @param loc Source location 2079 @param gtid Global thread id 2080 @param schedule Schedule type 2081 @param lb Lower bound 2082 @param ub Upper bound 2083 @param st Step (or increment if you prefer) 2084 @param chunk The chunk size to block with 2085 2086 This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments. 2087 These functions are all identical apart from the types of the arguments. 2088 */ 2089 2090 void 2091 __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2092 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk ) 2093 { 2094 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2095 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2096 } 2097 /*! 2098 See @ref __kmpc_dispatch_init_4 2099 */ 2100 void 2101 __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2102 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk ) 2103 { 2104 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2105 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2106 } 2107 2108 /*! 2109 See @ref __kmpc_dispatch_init_4 2110 */ 2111 void 2112 __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2113 kmp_int64 lb, kmp_int64 ub, 2114 kmp_int64 st, kmp_int64 chunk ) 2115 { 2116 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2117 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2118 } 2119 2120 /*! 2121 See @ref __kmpc_dispatch_init_4 2122 */ 2123 void 2124 __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2125 kmp_uint64 lb, kmp_uint64 ub, 2126 kmp_int64 st, kmp_int64 chunk ) 2127 { 2128 KMP_DEBUG_ASSERT( __kmp_init_serial ); 2129 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true ); 2130 } 2131 2132 /*! 2133 @param loc Source code location 2134 @param gtid Global thread id 2135 @param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise 2136 @param p_lb Pointer to the lower bound for the next chunk of work 2137 @param p_ub Pointer to the upper bound for the next chunk of work 2138 @param p_st Pointer to the stride for the next chunk of work 2139 @return one if there is work to be done, zero otherwise 2140 2141 Get the next dynamically allocated chunk of work for this thread. 2142 If there is no more work, then the lb,ub and stride need not be modified. 2143 */ 2144 int 2145 __kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2146 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st ) 2147 { 2148 return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st ); 2149 } 2150 2151 /*! 2152 See @ref __kmpc_dispatch_next_4 2153 */ 2154 int 2155 __kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2156 kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st ) 2157 { 2158 return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st ); 2159 } 2160 2161 /*! 2162 See @ref __kmpc_dispatch_next_4 2163 */ 2164 int 2165 __kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2166 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st ) 2167 { 2168 return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st ); 2169 } 2170 2171 /*! 2172 See @ref __kmpc_dispatch_next_4 2173 */ 2174 int 2175 __kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, 2176 kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st ) 2177 { 2178 return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st ); 2179 } 2180 2181 /*! 2182 @param loc Source code location 2183 @param gtid Global thread id 2184 2185 Mark the end of a dynamic loop. 2186 */ 2187 void 2188 __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid ) 2189 { 2190 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc ); 2191 } 2192 2193 /*! 2194 See @ref __kmpc_dispatch_fini_4 2195 */ 2196 void 2197 __kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid ) 2198 { 2199 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc ); 2200 } 2201 2202 /*! 2203 See @ref __kmpc_dispatch_fini_4 2204 */ 2205 void 2206 __kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid ) 2207 { 2208 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc ); 2209 } 2210 2211 /*! 2212 See @ref __kmpc_dispatch_fini_4 2213 */ 2214 void 2215 __kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid ) 2216 { 2217 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc ); 2218 } 2219 /*! @} */ 2220 2221 //----------------------------------------------------------------------------------------- 2222 //Non-template routines from kmp_dispatch.c used in other sources 2223 2224 kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) { 2225 return value == checker; 2226 } 2227 2228 kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) { 2229 return value != checker; 2230 } 2231 2232 kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) { 2233 return value < checker; 2234 } 2235 2236 kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) { 2237 return value >= checker; 2238 } 2239 2240 kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) { 2241 return value <= checker; 2242 } 2243 kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) { 2244 return value == checker; 2245 } 2246 2247 kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) { 2248 return value != checker; 2249 } 2250 2251 kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) { 2252 return value < checker; 2253 } 2254 2255 kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) { 2256 return value >= checker; 2257 } 2258 2259 kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) { 2260 return value <= checker; 2261 } 2262 2263 kmp_uint32 2264 __kmp_wait_yield_4(volatile kmp_uint32 * spinner, 2265 kmp_uint32 checker, 2266 kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 ) 2267 , void * obj // Higher-level synchronization object, or NULL. 2268 ) 2269 { 2270 // note: we may not belong to a team at this point 2271 register volatile kmp_uint32 * spin = spinner; 2272 register kmp_uint32 check = checker; 2273 register kmp_uint32 spins; 2274 register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred; 2275 register kmp_uint32 r; 2276 2277 KMP_FSYNC_SPIN_INIT( obj, (void*) spin ); 2278 KMP_INIT_YIELD( spins ); 2279 // main wait spin loop 2280 while(!f(r = TCR_4(*spin), check)) { 2281 KMP_FSYNC_SPIN_PREPARE( obj ); 2282 /* GEH - remove this since it was accidentally introduced when kmp_wait was split. 2283 It causes problems with infinite recursion because of exit lock */ 2284 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 2285 __kmp_abort_thread(); */ 2286 2287 __kmp_static_delay(TRUE); 2288 2289 /* if we have waited a bit, or are oversubscribed, yield */ 2290 /* pause is in the following code */ 2291 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc ); 2292 KMP_YIELD_SPIN( spins ); 2293 } 2294 KMP_FSYNC_SPIN_ACQUIRED( obj ); 2295 return r; 2296 } 2297 2298 kmp_uint64 2299 __kmp_wait_yield_8( volatile kmp_uint64 * spinner, 2300 kmp_uint64 checker, 2301 kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 ) 2302 , void * obj // Higher-level synchronization object, or NULL. 2303 ) 2304 { 2305 // note: we may not belong to a team at this point 2306 register volatile kmp_uint64 * spin = spinner; 2307 register kmp_uint64 check = checker; 2308 register kmp_uint32 spins; 2309 register kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred; 2310 register kmp_uint64 r; 2311 2312 KMP_FSYNC_SPIN_INIT( obj, (void*) spin ); 2313 KMP_INIT_YIELD( spins ); 2314 // main wait spin loop 2315 while(!f(r = *spin, check)) 2316 { 2317 KMP_FSYNC_SPIN_PREPARE( obj ); 2318 /* GEH - remove this since it was accidentally introduced when kmp_wait was split. 2319 It causes problems with infinite recursion because of exit lock */ 2320 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) 2321 __kmp_abort_thread(); */ 2322 2323 __kmp_static_delay(TRUE); 2324 2325 // if we are oversubscribed, 2326 // or have waited a bit (and KMP_LIBARRY=throughput, then yield 2327 // pause is in the following code 2328 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc ); 2329 KMP_YIELD_SPIN( spins ); 2330 } 2331 KMP_FSYNC_SPIN_ACQUIRED( obj ); 2332 return r; 2333 } 2334 2335 } // extern "C" 2336 2337 #ifdef KMP_GOMP_COMPAT 2338 2339 void 2340 __kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2341 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, 2342 kmp_int32 chunk, int push_ws ) 2343 { 2344 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, 2345 push_ws ); 2346 } 2347 2348 void 2349 __kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2350 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, 2351 kmp_int32 chunk, int push_ws ) 2352 { 2353 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, 2354 push_ws ); 2355 } 2356 2357 void 2358 __kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2359 kmp_int64 lb, kmp_int64 ub, kmp_int64 st, 2360 kmp_int64 chunk, int push_ws ) 2361 { 2362 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, 2363 push_ws ); 2364 } 2365 2366 void 2367 __kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, 2368 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, 2369 kmp_int64 chunk, int push_ws ) 2370 { 2371 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, 2372 push_ws ); 2373 } 2374 2375 void 2376 __kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid ) 2377 { 2378 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc ); 2379 } 2380 2381 void 2382 __kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid ) 2383 { 2384 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc ); 2385 } 2386 2387 void 2388 __kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid ) 2389 { 2390 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc ); 2391 } 2392 2393 void 2394 __kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid ) 2395 { 2396 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc ); 2397 } 2398 2399 #endif /* KMP_GOMP_COMPAT */ 2400 2401 /* ------------------------------------------------------------------------ */ 2402 /* ------------------------------------------------------------------------ */ 2403 2404