1 /* 2 * kmp_csupport.cpp -- kfront linkage support for OpenMP. 3 */ 4 5 6 //===----------------------------------------------------------------------===// 7 // 8 // The LLVM Compiler Infrastructure 9 // 10 // This file is dual licensed under the MIT and the University of Illinois Open 11 // Source Licenses. See LICENSE.txt for details. 12 // 13 //===----------------------------------------------------------------------===// 14 15 16 #include "omp.h" /* extern "C" declarations of user-visible routines */ 17 #include "kmp.h" 18 #include "kmp_i18n.h" 19 #include "kmp_itt.h" 20 #include "kmp_lock.h" 21 #include "kmp_error.h" 22 #include "kmp_stats.h" 23 24 #if OMPT_SUPPORT 25 #include "ompt-internal.h" 26 #include "ompt-specific.h" 27 #endif 28 29 #define MAX_MESSAGE 512 30 31 /* ------------------------------------------------------------------------ */ 32 /* ------------------------------------------------------------------------ */ 33 34 /* flags will be used in future, e.g., to implement */ 35 /* openmp_strict library restrictions */ 36 37 /*! 38 * @ingroup STARTUP_SHUTDOWN 39 * @param loc in source location information 40 * @param flags in for future use (currently ignored) 41 * 42 * Initialize the runtime library. This call is optional; if it is not made then 43 * it will be implicitly called by attempts to use other library functions. 44 * 45 */ 46 void 47 __kmpc_begin(ident_t *loc, kmp_int32 flags) 48 { 49 // By default __kmp_ignore_mppbeg() returns TRUE. 50 if (__kmp_ignore_mppbeg() == FALSE) { 51 __kmp_internal_begin(); 52 53 KC_TRACE( 10, ("__kmpc_begin: called\n" ) ); 54 } 55 } 56 57 /*! 58 * @ingroup STARTUP_SHUTDOWN 59 * @param loc source location information 60 * 61 * Shutdown the runtime library. This is also optional, and even if called will not 62 * do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to zero. 63 */ 64 void 65 __kmpc_end(ident_t *loc) 66 { 67 // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end() call no-op. 68 // However, this can be overridden with KMP_IGNORE_MPPEND environment variable. 69 // If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend() returns FALSE and __kmpc_end() 70 // will unregister this root (it can cause library shut down). 71 if (__kmp_ignore_mppend() == FALSE) { 72 KC_TRACE( 10, ("__kmpc_end: called\n" ) ); 73 KA_TRACE( 30, ("__kmpc_end\n" )); 74 75 __kmp_internal_end_thread( -1 ); 76 } 77 } 78 79 /*! 80 @ingroup THREAD_STATES 81 @param loc Source location information. 82 @return The global thread index of the active thread. 83 84 This function can be called in any context. 85 86 If the runtime has ony been entered at the outermost level from a 87 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is that 88 which would be returned by omp_get_thread_num() in the outermost 89 active parallel construct. (Or zero if there is no active parallel 90 construct, since the master thread is necessarily thread zero). 91 92 If multiple non-OpenMP threads all enter an OpenMP construct then this 93 will be a unique thread identifier among all the threads created by 94 the OpenMP runtime (but the value cannote be defined in terms of 95 OpenMP thread ids returned by omp_get_thread_num()). 96 97 */ 98 kmp_int32 99 __kmpc_global_thread_num(ident_t *loc) 100 { 101 kmp_int32 gtid = __kmp_entry_gtid(); 102 103 KC_TRACE( 10, ("__kmpc_global_thread_num: T#%d\n", gtid ) ); 104 105 return gtid; 106 } 107 108 /*! 109 @ingroup THREAD_STATES 110 @param loc Source location information. 
111 @return The number of threads under control of the OpenMP<sup>*</sup> runtime 112 113 This function can be called in any context. 114 It returns the total number of threads under the control of the OpenMP runtime. That is 115 not a number that can be determined by any OpenMP standard calls, since the library may be 116 called from more than one non-OpenMP thread, and this reflects the total over all such calls. 117 Similarly the runtime maintains underlying threads even when they are not active (since the cost 118 of creating and destroying OS threads is high), this call counts all such threads even if they are not 119 waiting for work. 120 */ 121 kmp_int32 122 __kmpc_global_num_threads(ident_t *loc) 123 { 124 KC_TRACE(10,("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth)); 125 126 return TCR_4(__kmp_all_nth); 127 } 128 129 /*! 130 @ingroup THREAD_STATES 131 @param loc Source location information. 132 @return The thread number of the calling thread in the innermost active parallel construct. 133 134 */ 135 kmp_int32 136 __kmpc_bound_thread_num(ident_t *loc) 137 { 138 KC_TRACE( 10, ("__kmpc_bound_thread_num: called\n" ) ); 139 return __kmp_tid_from_gtid( __kmp_entry_gtid() ); 140 } 141 142 /*! 143 @ingroup THREAD_STATES 144 @param loc Source location information. 145 @return The number of threads in the innermost active parallel construct. 146 */ 147 kmp_int32 148 __kmpc_bound_num_threads(ident_t *loc) 149 { 150 KC_TRACE( 10, ("__kmpc_bound_num_threads: called\n" ) ); 151 152 return __kmp_entry_thread() -> th.th_team -> t.t_nproc; 153 } 154 155 /*! 156 * @ingroup DEPRECATED 157 * @param loc location description 158 * 159 * This function need not be called. It always returns TRUE. 160 */ 161 kmp_int32 162 __kmpc_ok_to_fork(ident_t *loc) 163 { 164 #ifndef KMP_DEBUG 165 166 return TRUE; 167 168 #else 169 170 const char *semi2; 171 const char *semi3; 172 int line_no; 173 174 if (__kmp_par_range == 0) { 175 return TRUE; 176 } 177 semi2 = loc->psource; 178 if (semi2 == NULL) { 179 return TRUE; 180 } 181 semi2 = strchr(semi2, ';'); 182 if (semi2 == NULL) { 183 return TRUE; 184 } 185 semi2 = strchr(semi2 + 1, ';'); 186 if (semi2 == NULL) { 187 return TRUE; 188 } 189 if (__kmp_par_range_filename[0]) { 190 const char *name = semi2 - 1; 191 while ((name > loc->psource) && (*name != '/') && (*name != ';')) { 192 name--; 193 } 194 if ((*name == '/') || (*name == ';')) { 195 name++; 196 } 197 if (strncmp(__kmp_par_range_filename, name, semi2 - name)) { 198 return __kmp_par_range < 0; 199 } 200 } 201 semi3 = strchr(semi2 + 1, ';'); 202 if (__kmp_par_range_routine[0]) { 203 if ((semi3 != NULL) && (semi3 > semi2) 204 && (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) { 205 return __kmp_par_range < 0; 206 } 207 } 208 if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) { 209 if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) { 210 return __kmp_par_range > 0; 211 } 212 return __kmp_par_range < 0; 213 } 214 return TRUE; 215 216 #endif /* KMP_DEBUG */ 217 218 } 219 220 /*! 221 @ingroup THREAD_STATES 222 @param loc Source location information. 223 @return 1 if this thread is executing inside an active parallel region, zero if not. 224 */ 225 kmp_int32 226 __kmpc_in_parallel( ident_t *loc ) 227 { 228 return __kmp_entry_thread() -> th.th_root -> r.r_active; 229 } 230 231 /*! 
232 @ingroup PARALLEL 233 @param loc source location information 234 @param global_tid global thread number 235 @param num_threads number of threads requested for this parallel construct 236 237 Set the number of threads to be used by the next fork spawned by this thread. 238 This call is only required if the parallel construct has a `num_threads` clause. 239 */ 240 void 241 __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads ) 242 { 243 KA_TRACE( 20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n", 244 global_tid, num_threads ) ); 245 246 __kmp_push_num_threads( loc, global_tid, num_threads ); 247 } 248 249 void 250 __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid ) 251 { 252 KA_TRACE( 20, ("__kmpc_pop_num_threads: enter\n" ) ); 253 254 /* the num_threads are automatically popped */ 255 } 256 257 258 #if OMP_40_ENABLED 259 260 void 261 __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, kmp_int32 proc_bind ) 262 { 263 KA_TRACE( 20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", 264 global_tid, proc_bind ) ); 265 266 __kmp_push_proc_bind( loc, global_tid, (kmp_proc_bind_t)proc_bind ); 267 } 268 269 #endif /* OMP_40_ENABLED */ 270 271 272 /*! 273 @ingroup PARALLEL 274 @param loc source location information 275 @param argc total number of arguments in the ellipsis 276 @param microtask pointer to callback routine consisting of outlined parallel construct 277 @param ... pointers to shared variables that aren't global 278 279 Do the actual fork and call the microtask in the relevant number of threads. 280 */ 281 void 282 __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) 283 { 284 int gtid = __kmp_entry_gtid(); 285 286 #if (KMP_STATS_ENABLED) 287 int inParallel = __kmpc_in_parallel(loc); 288 if (inParallel) 289 { 290 KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL); 291 } 292 else 293 { 294 KMP_COUNT_BLOCK(OMP_PARALLEL); 295 } 296 #endif 297 298 // maybe to save thr_state is enough here 299 { 300 va_list ap; 301 va_start( ap, microtask ); 302 303 #if OMPT_SUPPORT 304 ompt_frame_t* ompt_frame; 305 if (ompt_enabled) { 306 kmp_info_t *master_th = __kmp_threads[ gtid ]; 307 kmp_team_t *parent_team = master_th->th.th_team; 308 ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info; 309 if (lwt) 310 ompt_frame = &(lwt->ompt_task_info.frame); 311 else 312 { 313 int tid = __kmp_tid_from_gtid( gtid ); 314 ompt_frame = &(parent_team->t.t_implicit_task_taskdata[tid]. 315 ompt_task_info.frame); 316 } 317 ompt_frame->reenter_runtime_frame = __builtin_frame_address(1); 318 } 319 #endif 320 321 #if INCLUDE_SSC_MARKS 322 SSC_MARK_FORKING(); 323 #endif 324 __kmp_fork_call( loc, gtid, fork_context_intel, 325 argc, 326 #if OMPT_SUPPORT 327 VOLATILE_CAST(void *) microtask, // "unwrapped" task 328 #endif 329 VOLATILE_CAST(microtask_t) microtask, // "wrapped" task 330 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, 331 /* TODO: revert workaround for Intel(R) 64 tracker #96 */ 332 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX 333 &ap 334 #else 335 ap 336 #endif 337 ); 338 #if INCLUDE_SSC_MARKS 339 SSC_MARK_JOINING(); 340 #endif 341 __kmp_join_call( loc, gtid 342 #if OMPT_SUPPORT 343 , fork_context_intel 344 #endif 345 ); 346 347 va_end( ap ); 348 349 } 350 } 351 352 #if OMP_40_ENABLED 353 /*! 
354 @ingroup PARALLEL 355 @param loc source location information 356 @param global_tid global thread number 357 @param num_teams number of teams requested for the teams construct 358 @param num_threads number of threads per team requested for the teams construct 359 360 Set the number of teams to be used by the teams construct. 361 This call is only required if the teams construct has a `num_teams` clause 362 or a `thread_limit` clause (or both). 363 */ 364 void 365 __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_teams, kmp_int32 num_threads ) 366 { 367 KA_TRACE( 20, ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n", 368 global_tid, num_teams, num_threads ) ); 369 370 __kmp_push_num_teams( loc, global_tid, num_teams, num_threads ); 371 } 372 373 /*! 374 @ingroup PARALLEL 375 @param loc source location information 376 @param argc total number of arguments in the ellipsis 377 @param microtask pointer to callback routine consisting of outlined teams construct 378 @param ... pointers to shared variables that aren't global 379 380 Do the actual fork and call the microtask in the relevant number of threads. 381 */ 382 void 383 __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) 384 { 385 int gtid = __kmp_entry_gtid(); 386 kmp_info_t *this_thr = __kmp_threads[ gtid ]; 387 va_list ap; 388 va_start( ap, microtask ); 389 390 KMP_COUNT_BLOCK(OMP_TEAMS); 391 392 // remember teams entry point and nesting level 393 this_thr->th.th_teams_microtask = microtask; 394 this_thr->th.th_teams_level = this_thr->th.th_team->t.t_level; // AC: can be >0 on host 395 396 #if OMPT_SUPPORT 397 kmp_team_t *parent_team = this_thr->th.th_team; 398 int tid = __kmp_tid_from_gtid( gtid ); 399 if (ompt_enabled) { 400 parent_team->t.t_implicit_task_taskdata[tid]. 401 ompt_task_info.frame.reenter_runtime_frame = __builtin_frame_address(1); 402 } 403 #endif 404 405 // check if __kmpc_push_num_teams called, set default number of teams otherwise 406 if ( this_thr->th.th_teams_size.nteams == 0 ) { 407 __kmp_push_num_teams( loc, gtid, 0, 0 ); 408 } 409 KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1); 410 KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1); 411 KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1); 412 413 __kmp_fork_call( loc, gtid, fork_context_intel, 414 argc, 415 #if OMPT_SUPPORT 416 VOLATILE_CAST(void *) microtask, // "unwrapped" task 417 #endif 418 VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task 419 VOLATILE_CAST(launch_t) __kmp_invoke_teams_master, 420 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX 421 &ap 422 #else 423 ap 424 #endif 425 ); 426 __kmp_join_call( loc, gtid 427 #if OMPT_SUPPORT 428 , fork_context_intel 429 #endif 430 ); 431 432 this_thr->th.th_teams_microtask = NULL; 433 this_thr->th.th_teams_level = 0; 434 *(kmp_int64*)(&this_thr->th.th_teams_size) = 0L; 435 va_end( ap ); 436 } 437 #endif /* OMP_40_ENABLED */ 438 439 440 // 441 // I don't think this function should ever have been exported. 442 // The __kmpc_ prefix was misapplied. I'm fairly certain that no generated 443 // openmp code ever called it, but it's been exported from the RTL for so 444 // long that I'm afraid to remove the definition. 445 // 446 int 447 __kmpc_invoke_task_func( int gtid ) 448 { 449 return __kmp_invoke_task_func( gtid ); 450 } 451 452 /*! 453 @ingroup PARALLEL 454 @param loc source location information 455 @param global_tid global thread number 456 457 Enter a serialized parallel construct. 
This interface is used to handle a 458 conditional parallel region, like this, 459 @code 460 #pragma omp parallel if (condition) 461 @endcode 462 when the condition is false. 463 */ 464 void 465 __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) 466 { 467 // The implementation is now in kmp_runtime.cpp so that it can share static 468 // functions with kmp_fork_call since the tasks to be done are similar in 469 // each case. 470 __kmp_serialized_parallel(loc, global_tid); 471 } 472 473 /*! 474 @ingroup PARALLEL 475 @param loc source location information 476 @param global_tid global thread number 477 478 Leave a serialized parallel construct. 479 */ 480 void 481 __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) 482 { 483 kmp_internal_control_t *top; 484 kmp_info_t *this_thr; 485 kmp_team_t *serial_team; 486 487 KC_TRACE( 10, ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid ) ); 488 489 /* skip all this code for autopar serialized loops since it results in 490 unacceptable overhead */ 491 if( loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR ) ) 492 return; 493 494 // Not autopar code 495 if( ! TCR_4( __kmp_init_parallel ) ) 496 __kmp_parallel_initialize(); 497 498 this_thr = __kmp_threads[ global_tid ]; 499 serial_team = this_thr->th.th_serial_team; 500 501 #if OMP_45_ENABLED 502 kmp_task_team_t * task_team = this_thr->th.th_task_team; 503 504 // we need to wait for the proxy tasks before finishing the thread 505 if ( task_team != NULL && task_team->tt.tt_found_proxy_tasks ) 506 __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL) ); // is an ITT object needed here? 507 #endif 508 509 KMP_MB(); 510 KMP_DEBUG_ASSERT( serial_team ); 511 KMP_ASSERT( serial_team -> t.t_serialized ); 512 KMP_DEBUG_ASSERT( this_thr -> th.th_team == serial_team ); 513 KMP_DEBUG_ASSERT( serial_team != this_thr->th.th_root->r.r_root_team ); 514 KMP_DEBUG_ASSERT( serial_team -> t.t_threads ); 515 KMP_DEBUG_ASSERT( serial_team -> t.t_threads[0] == this_thr ); 516 517 /* If necessary, pop the internal control stack values and replace the team values */ 518 top = serial_team -> t.t_control_stack_top; 519 if ( top && top -> serial_nesting_level == serial_team -> t.t_serialized ) { 520 copy_icvs( &serial_team -> t.t_threads[0] -> th.th_current_task -> td_icvs, top ); 521 serial_team -> t.t_control_stack_top = top -> next; 522 __kmp_free(top); 523 } 524 525 //if( serial_team -> t.t_serialized > 1 ) 526 serial_team -> t.t_level--; 527 528 /* pop dispatch buffers stack */ 529 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer); 530 { 531 dispatch_private_info_t * disp_buffer = serial_team->t.t_dispatch->th_disp_buffer; 532 serial_team->t.t_dispatch->th_disp_buffer = 533 serial_team->t.t_dispatch->th_disp_buffer->next; 534 __kmp_free( disp_buffer ); 535 } 536 537 -- serial_team -> t.t_serialized; 538 if ( serial_team -> t.t_serialized == 0 ) { 539 540 /* return to the parallel section */ 541 542 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 543 if ( __kmp_inherit_fp_control && serial_team->t.t_fp_control_saved ) { 544 __kmp_clear_x87_fpu_status_word(); 545 __kmp_load_x87_fpu_control_word( &serial_team->t.t_x87_fpu_control_word ); 546 __kmp_load_mxcsr( &serial_team->t.t_mxcsr ); 547 } 548 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 549 550 this_thr -> th.th_team = serial_team -> t.t_parent; 551 this_thr -> th.th_info.ds.ds_tid = serial_team -> t.t_master_tid; 552 553 /* restore values cached in the thread */ 554 this_thr -> th.th_team_nproc = serial_team -> t.t_parent -> 
t.t_nproc; /* JPH */ 555 this_thr -> th.th_team_master = serial_team -> t.t_parent -> t.t_threads[0]; /* JPH */ 556 this_thr -> th.th_team_serialized = this_thr -> th.th_team -> t.t_serialized; 557 558 /* TODO the below shouldn't need to be adjusted for serialized teams */ 559 this_thr -> th.th_dispatch = & this_thr -> th.th_team -> 560 t.t_dispatch[ serial_team -> t.t_master_tid ]; 561 562 __kmp_pop_current_task_from_thread( this_thr ); 563 564 KMP_ASSERT( this_thr -> th.th_current_task -> td_flags.executing == 0 ); 565 this_thr -> th.th_current_task -> td_flags.executing = 1; 566 567 if ( __kmp_tasking_mode != tskm_immediate_exec ) { 568 // Copy the task team from the new child / old parent team to the thread. 569 this_thr->th.th_task_team = this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]; 570 KA_TRACE( 20, ( "__kmpc_end_serialized_parallel: T#%d restoring task_team %p / team %p\n", 571 global_tid, this_thr -> th.th_task_team, this_thr -> th.th_team ) ); 572 } 573 } else { 574 if ( __kmp_tasking_mode != tskm_immediate_exec ) { 575 KA_TRACE( 20, ( "__kmpc_end_serialized_parallel: T#%d decreasing nesting depth of serial team %p to %d\n", 576 global_tid, serial_team, serial_team -> t.t_serialized ) ); 577 } 578 } 579 580 if ( __kmp_env_consistency_check ) 581 __kmp_pop_parallel( global_tid, NULL ); 582 } 583 584 /*! 585 @ingroup SYNCHRONIZATION 586 @param loc source location information. 587 588 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though 589 depending on the memory ordering convention obeyed by the compiler 590 even that may not be necessary). 591 */ 592 void 593 __kmpc_flush(ident_t *loc) 594 { 595 KC_TRACE( 10, ("__kmpc_flush: called\n" ) ); 596 597 /* need explicit __mf() here since use volatile instead in library */ 598 KMP_MB(); /* Flush all pending memory write invalidates. */ 599 600 #if ( KMP_ARCH_X86 || KMP_ARCH_X86_64 ) 601 #if KMP_MIC 602 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used. 603 // We shouldn't need it, though, since the ABI rules require that 604 // * If the compiler generates NGO stores it also generates the fence 605 // * If users hand-code NGO stores they should insert the fence 606 // therefore no incomplete unordered stores should be visible. 607 #else 608 // C74404 609 // This is to address non-temporal store instructions (sfence needed). 610 // The clflush instruction is addressed either (mfence needed). 611 // Probably the non-temporal load monvtdqa instruction should also be addressed. 612 // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2. 613 if ( ! __kmp_cpuinfo.initialized ) { 614 __kmp_query_cpuid( & __kmp_cpuinfo ); 615 }; // if 616 if ( ! __kmp_cpuinfo.sse2 ) { 617 // CPU cannot execute SSE2 instructions. 618 } else { 619 #if KMP_COMPILER_ICC 620 _mm_mfence(); 621 #elif KMP_COMPILER_MSVC 622 MemoryBarrier(); 623 #else 624 __sync_synchronize(); 625 #endif // KMP_COMPILER_ICC 626 }; // if 627 #endif // KMP_MIC 628 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64) 629 // Nothing to see here move along 630 #elif KMP_ARCH_PPC64 631 // Nothing needed here (we have a real MB above). 632 #if KMP_OS_CNK 633 // The flushing thread needs to yield here; this prevents a 634 // busy-waiting thread from saturating the pipeline. 
flush is 635 // often used in loops like this: 636 // while (!flag) { 637 // #pragma omp flush(flag) 638 // } 639 // and adding the yield here is good for at least a 10x speedup 640 // when running >2 threads per core (on the NAS LU benchmark). 641 __kmp_yield(TRUE); 642 #endif 643 #else 644 #error Unknown or unsupported architecture 645 #endif 646 647 } 648 649 /* -------------------------------------------------------------------------- */ 650 651 /* -------------------------------------------------------------------------- */ 652 653 /*! 654 @ingroup SYNCHRONIZATION 655 @param loc source location information 656 @param global_tid thread id. 657 658 Execute a barrier. 659 */ 660 void 661 __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) 662 { 663 KMP_COUNT_BLOCK(OMP_BARRIER); 664 KC_TRACE( 10, ("__kmpc_barrier: called T#%d\n", global_tid ) ); 665 666 if (! TCR_4(__kmp_init_parallel)) 667 __kmp_parallel_initialize(); 668 669 if ( __kmp_env_consistency_check ) { 670 if ( loc == 0 ) { 671 KMP_WARNING( ConstructIdentInvalid ); // ??? What does it mean for the user? 672 }; // if 673 674 __kmp_check_barrier( global_tid, ct_barrier, loc ); 675 } 676 677 #if OMPT_SUPPORT && OMPT_TRACE 678 ompt_frame_t * ompt_frame; 679 if (ompt_enabled ) { 680 ompt_frame = __ompt_get_task_frame_internal(0); 681 if ( ompt_frame->reenter_runtime_frame == NULL ) 682 ompt_frame->reenter_runtime_frame = __builtin_frame_address(1); 683 } 684 #endif 685 __kmp_threads[ global_tid ]->th.th_ident = loc; 686 // TODO: explicit barrier_wait_id: 687 // this function is called when 'barrier' directive is present or 688 // implicit barrier at the end of a worksharing construct. 689 // 1) better to add a per-thread barrier counter to a thread data structure 690 // 2) set to 0 when a new team is created 691 // 4) no sync is required 692 693 __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL ); 694 #if OMPT_SUPPORT && OMPT_TRACE 695 if (ompt_enabled ) { 696 ompt_frame->reenter_runtime_frame = NULL; 697 } 698 #endif 699 } 700 701 /* The BARRIER for a MASTER section is always explicit */ 702 /*! 703 @ingroup WORK_SHARING 704 @param loc source location information. 705 @param global_tid global thread number . 706 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise. 707 */ 708 kmp_int32 709 __kmpc_master(ident_t *loc, kmp_int32 global_tid) 710 { 711 int status = 0; 712 713 KC_TRACE( 10, ("__kmpc_master: called T#%d\n", global_tid ) ); 714 715 if( ! 
TCR_4( __kmp_init_parallel ) ) 716 __kmp_parallel_initialize(); 717 718 if( KMP_MASTER_GTID( global_tid )) { 719 KMP_COUNT_BLOCK(OMP_MASTER); 720 KMP_PUSH_PARTITIONED_TIMER(OMP_master); 721 status = 1; 722 } 723 724 #if OMPT_SUPPORT && OMPT_TRACE 725 if (status) { 726 if (ompt_enabled && 727 ompt_callbacks.ompt_callback(ompt_event_master_begin)) { 728 kmp_info_t *this_thr = __kmp_threads[ global_tid ]; 729 kmp_team_t *team = this_thr -> th.th_team; 730 731 int tid = __kmp_tid_from_gtid( global_tid ); 732 ompt_callbacks.ompt_callback(ompt_event_master_begin)( 733 team->t.ompt_team_info.parallel_id, 734 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id); 735 } 736 } 737 #endif 738 739 if ( __kmp_env_consistency_check ) { 740 #if KMP_USE_DYNAMIC_LOCK 741 if (status) 742 __kmp_push_sync( global_tid, ct_master, loc, NULL, 0 ); 743 else 744 __kmp_check_sync( global_tid, ct_master, loc, NULL, 0 ); 745 #else 746 if (status) 747 __kmp_push_sync( global_tid, ct_master, loc, NULL ); 748 else 749 __kmp_check_sync( global_tid, ct_master, loc, NULL ); 750 #endif 751 } 752 753 return status; 754 } 755 756 /*! 757 @ingroup WORK_SHARING 758 @param loc source location information. 759 @param global_tid global thread number . 760 761 Mark the end of a <tt>master</tt> region. This should only be called by the thread 762 that executes the <tt>master</tt> region. 763 */ 764 void 765 __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) 766 { 767 KC_TRACE( 10, ("__kmpc_end_master: called T#%d\n", global_tid ) ); 768 769 KMP_DEBUG_ASSERT( KMP_MASTER_GTID( global_tid )); 770 KMP_POP_PARTITIONED_TIMER(); 771 772 #if OMPT_SUPPORT && OMPT_TRACE 773 kmp_info_t *this_thr = __kmp_threads[ global_tid ]; 774 kmp_team_t *team = this_thr -> th.th_team; 775 if (ompt_enabled && 776 ompt_callbacks.ompt_callback(ompt_event_master_end)) { 777 int tid = __kmp_tid_from_gtid( global_tid ); 778 ompt_callbacks.ompt_callback(ompt_event_master_end)( 779 team->t.ompt_team_info.parallel_id, 780 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id); 781 } 782 #endif 783 784 if ( __kmp_env_consistency_check ) { 785 if( global_tid < 0 ) 786 KMP_WARNING( ThreadIdentInvalid ); 787 788 if( KMP_MASTER_GTID( global_tid )) 789 __kmp_pop_sync( global_tid, ct_master, loc ); 790 } 791 } 792 793 /*! 794 @ingroup WORK_SHARING 795 @param loc source location information. 796 @param gtid global thread number. 797 798 Start execution of an <tt>ordered</tt> construct. 799 */ 800 void 801 __kmpc_ordered( ident_t * loc, kmp_int32 gtid ) 802 { 803 int cid = 0; 804 kmp_info_t *th; 805 KMP_DEBUG_ASSERT( __kmp_init_serial ); 806 807 KC_TRACE( 10, ("__kmpc_ordered: called T#%d\n", gtid )); 808 809 if (! 
TCR_4(__kmp_init_parallel)) 810 __kmp_parallel_initialize(); 811 812 #if USE_ITT_BUILD 813 __kmp_itt_ordered_prep( gtid ); 814 // TODO: ordered_wait_id 815 #endif /* USE_ITT_BUILD */ 816 817 th = __kmp_threads[ gtid ]; 818 819 #if OMPT_SUPPORT && OMPT_TRACE 820 if (ompt_enabled) { 821 /* OMPT state update */ 822 th->th.ompt_thread_info.wait_id = (uint64_t) loc; 823 th->th.ompt_thread_info.state = ompt_state_wait_ordered; 824 825 /* OMPT event callback */ 826 if (ompt_callbacks.ompt_callback(ompt_event_wait_ordered)) { 827 ompt_callbacks.ompt_callback(ompt_event_wait_ordered)( 828 th->th.ompt_thread_info.wait_id); 829 } 830 } 831 #endif 832 833 if ( th -> th.th_dispatch -> th_deo_fcn != 0 ) 834 (*th->th.th_dispatch->th_deo_fcn)( & gtid, & cid, loc ); 835 else 836 __kmp_parallel_deo( & gtid, & cid, loc ); 837 838 #if OMPT_SUPPORT && OMPT_TRACE 839 if (ompt_enabled) { 840 /* OMPT state update */ 841 th->th.ompt_thread_info.state = ompt_state_work_parallel; 842 th->th.ompt_thread_info.wait_id = 0; 843 844 /* OMPT event callback */ 845 if (ompt_callbacks.ompt_callback(ompt_event_acquired_ordered)) { 846 ompt_callbacks.ompt_callback(ompt_event_acquired_ordered)( 847 th->th.ompt_thread_info.wait_id); 848 } 849 } 850 #endif 851 852 #if USE_ITT_BUILD 853 __kmp_itt_ordered_start( gtid ); 854 #endif /* USE_ITT_BUILD */ 855 } 856 857 /*! 858 @ingroup WORK_SHARING 859 @param loc source location information. 860 @param gtid global thread number. 861 862 End execution of an <tt>ordered</tt> construct. 863 */ 864 void 865 __kmpc_end_ordered( ident_t * loc, kmp_int32 gtid ) 866 { 867 int cid = 0; 868 kmp_info_t *th; 869 870 KC_TRACE( 10, ("__kmpc_end_ordered: called T#%d\n", gtid ) ); 871 872 #if USE_ITT_BUILD 873 __kmp_itt_ordered_end( gtid ); 874 // TODO: ordered_wait_id 875 #endif /* USE_ITT_BUILD */ 876 877 th = __kmp_threads[ gtid ]; 878 879 if ( th -> th.th_dispatch -> th_dxo_fcn != 0 ) 880 (*th->th.th_dispatch->th_dxo_fcn)( & gtid, & cid, loc ); 881 else 882 __kmp_parallel_dxo( & gtid, & cid, loc ); 883 884 #if OMPT_SUPPORT && OMPT_BLAME 885 if (ompt_enabled && 886 ompt_callbacks.ompt_callback(ompt_event_release_ordered)) { 887 ompt_callbacks.ompt_callback(ompt_event_release_ordered)( 888 th->th.ompt_thread_info.wait_id); 889 } 890 #endif 891 } 892 893 #if KMP_USE_DYNAMIC_LOCK 894 895 static __forceinline void 896 __kmp_init_indirect_csptr(kmp_critical_name * crit, ident_t const * loc, kmp_int32 gtid, kmp_indirect_locktag_t tag) 897 { 898 // Pointer to the allocated indirect lock is written to crit, while indexing is ignored. 899 void *idx; 900 kmp_indirect_lock_t **lck; 901 lck = (kmp_indirect_lock_t **)crit; 902 kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag); 903 KMP_I_LOCK_FUNC(ilk, init)(ilk->lock); 904 KMP_SET_I_LOCK_LOCATION(ilk, loc); 905 KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section); 906 KA_TRACE(20, ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag)); 907 #if USE_ITT_BUILD 908 __kmp_itt_critical_creating(ilk->lock, loc); 909 #endif 910 int status = KMP_COMPARE_AND_STORE_PTR(lck, 0, ilk); 911 if (status == 0) { 912 #if USE_ITT_BUILD 913 __kmp_itt_critical_destroyed(ilk->lock); 914 #endif 915 // We don't really need to destroy the unclaimed lock here since it will be cleaned up at program exit. 
916 //KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx); 917 } 918 KMP_DEBUG_ASSERT(*lck != NULL); 919 } 920 921 // Fast-path acquire tas lock 922 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid) { \ 923 kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \ 924 if (l->lk.poll != KMP_LOCK_FREE(tas) || \ 925 ! KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas), KMP_LOCK_BUSY(gtid+1, tas))) { \ 926 kmp_uint32 spins; \ 927 KMP_FSYNC_PREPARE(l); \ 928 KMP_INIT_YIELD(spins); \ 929 if (TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \ 930 KMP_YIELD(TRUE); \ 931 } else { \ 932 KMP_YIELD_SPIN(spins); \ 933 } \ 934 kmp_backoff_t backoff = __kmp_spin_backoff_params; \ 935 while (l->lk.poll != KMP_LOCK_FREE(tas) || \ 936 ! KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas), KMP_LOCK_BUSY(gtid+1, tas))) { \ 937 __kmp_spin_backoff(&backoff); \ 938 if (TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \ 939 KMP_YIELD(TRUE); \ 940 } else { \ 941 KMP_YIELD_SPIN(spins); \ 942 } \ 943 } \ 944 } \ 945 KMP_FSYNC_ACQUIRED(l); \ 946 } 947 948 // Fast-path test tas lock 949 #define KMP_TEST_TAS_LOCK(lock, gtid, rc) { \ 950 kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \ 951 rc = l->lk.poll == KMP_LOCK_FREE(tas) && \ 952 KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas), KMP_LOCK_BUSY(gtid+1, tas)); \ 953 } 954 955 // Fast-path release tas lock 956 #define KMP_RELEASE_TAS_LOCK(lock, gtid) { \ 957 TCW_4(((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); \ 958 KMP_MB(); \ 959 } 960 961 #if KMP_USE_FUTEX 962 963 # include <unistd.h> 964 # include <sys/syscall.h> 965 # ifndef FUTEX_WAIT 966 # define FUTEX_WAIT 0 967 # endif 968 # ifndef FUTEX_WAKE 969 # define FUTEX_WAKE 1 970 # endif 971 972 // Fast-path acquire futex lock 973 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid) { \ 974 kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \ 975 kmp_int32 gtid_code = (gtid+1) << 1; \ 976 KMP_MB(); \ 977 KMP_FSYNC_PREPARE(ftx); \ 978 kmp_int32 poll_val; \ 979 while ((poll_val = KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), KMP_LOCK_FREE(futex), \ 980 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) { \ 981 kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1; \ 982 if (!cond) { \ 983 if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val, poll_val | KMP_LOCK_BUSY(1, futex))) { \ 984 continue; \ 985 } \ 986 poll_val |= KMP_LOCK_BUSY(1, futex); \ 987 } \ 988 kmp_int32 rc; \ 989 if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val, NULL, NULL, 0)) != 0) { \ 990 continue; \ 991 } \ 992 gtid_code |= 1; \ 993 } \ 994 KMP_FSYNC_ACQUIRED(ftx); \ 995 } 996 997 // Fast-path test futex lock 998 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc) { \ 999 kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \ 1000 if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex), KMP_LOCK_BUSY(gtid+1 << 1, futex))) { \ 1001 KMP_FSYNC_ACQUIRED(ftx); \ 1002 rc = TRUE; \ 1003 } else { \ 1004 rc = FALSE; \ 1005 } \ 1006 } 1007 1008 // Fast-path release futex lock 1009 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid) { \ 1010 kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \ 1011 KMP_MB(); \ 1012 KMP_FSYNC_RELEASING(ftx); \ 1013 kmp_int32 poll_val = KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex)); \ 1014 if (KMP_LOCK_STRIP(poll_val) & 1) { \ 1015 syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE, KMP_LOCK_BUSY(1, futex), NULL, NULL, 0); \ 1016 } \ 1017 KMP_MB(); \ 1018 KMP_YIELD(TCR_4(__kmp_nth) > (__kmp_avail_proc ? 
__kmp_avail_proc : __kmp_xproc)); \ 1019 } 1020 1021 #endif // KMP_USE_FUTEX 1022 1023 #else // KMP_USE_DYNAMIC_LOCK 1024 1025 static kmp_user_lock_p 1026 __kmp_get_critical_section_ptr( kmp_critical_name * crit, ident_t const * loc, kmp_int32 gtid ) 1027 { 1028 kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit; 1029 1030 // 1031 // Because of the double-check, the following load 1032 // doesn't need to be volatile. 1033 // 1034 kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR( *lck_pp ); 1035 1036 if ( lck == NULL ) { 1037 void * idx; 1038 1039 // Allocate & initialize the lock. 1040 // Remember allocated locks in table in order to free them in __kmp_cleanup() 1041 lck = __kmp_user_lock_allocate( &idx, gtid, kmp_lf_critical_section ); 1042 __kmp_init_user_lock_with_checks( lck ); 1043 __kmp_set_user_lock_location( lck, loc ); 1044 #if USE_ITT_BUILD 1045 __kmp_itt_critical_creating( lck ); 1046 // __kmp_itt_critical_creating() should be called *before* the first usage of underlying 1047 // lock. It is the only place where we can guarantee it. There are chances the lock will 1048 // destroyed with no usage, but it is not a problem, because this is not real event seen 1049 // by user but rather setting name for object (lock). See more details in kmp_itt.h. 1050 #endif /* USE_ITT_BUILD */ 1051 1052 // 1053 // Use a cmpxchg instruction to slam the start of the critical 1054 // section with the lock pointer. If another thread beat us 1055 // to it, deallocate the lock, and use the lock that the other 1056 // thread allocated. 1057 // 1058 int status = KMP_COMPARE_AND_STORE_PTR( lck_pp, 0, lck ); 1059 1060 if ( status == 0 ) { 1061 // Deallocate the lock and reload the value. 1062 #if USE_ITT_BUILD 1063 __kmp_itt_critical_destroyed( lck ); 1064 // Let ITT know the lock is destroyed and the same memory location may be reused for 1065 // another purpose. 1066 #endif /* USE_ITT_BUILD */ 1067 __kmp_destroy_user_lock_with_checks( lck ); 1068 __kmp_user_lock_free( &idx, gtid, lck ); 1069 lck = (kmp_user_lock_p)TCR_PTR( *lck_pp ); 1070 KMP_DEBUG_ASSERT( lck != NULL ); 1071 } 1072 } 1073 return lck; 1074 } 1075 1076 #endif // KMP_USE_DYNAMIC_LOCK 1077 1078 /*! 1079 @ingroup WORK_SHARING 1080 @param loc source location information. 1081 @param global_tid global thread number . 1082 @param crit identity of the critical section. This could be a pointer to a lock associated with the critical section, or 1083 some other suitably unique value. 1084 1085 Enter code protected by a `critical` construct. 1086 This function blocks until the executing thread can enter the critical section. 
1087 */ 1088 void 1089 __kmpc_critical( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit ) 1090 { 1091 #if KMP_USE_DYNAMIC_LOCK 1092 __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none); 1093 #else 1094 KMP_COUNT_BLOCK(OMP_CRITICAL); 1095 KMP_TIME_PARTITIONED_BLOCK(OMP_critical_wait); /* Time spent waiting to enter the critical section */ 1096 kmp_user_lock_p lck; 1097 1098 KC_TRACE( 10, ("__kmpc_critical: called T#%d\n", global_tid ) ); 1099 1100 //TODO: add THR_OVHD_STATE 1101 1102 KMP_CHECK_USER_LOCK_INIT(); 1103 1104 if ( ( __kmp_user_lock_kind == lk_tas ) 1105 && ( sizeof( lck->tas.lk.poll ) <= OMP_CRITICAL_SIZE ) ) { 1106 lck = (kmp_user_lock_p)crit; 1107 } 1108 #if KMP_USE_FUTEX 1109 else if ( ( __kmp_user_lock_kind == lk_futex ) 1110 && ( sizeof( lck->futex.lk.poll ) <= OMP_CRITICAL_SIZE ) ) { 1111 lck = (kmp_user_lock_p)crit; 1112 } 1113 #endif 1114 else { // ticket, queuing or drdpa 1115 lck = __kmp_get_critical_section_ptr( crit, loc, global_tid ); 1116 } 1117 1118 if ( __kmp_env_consistency_check ) 1119 __kmp_push_sync( global_tid, ct_critical, loc, lck ); 1120 1121 /* since the critical directive binds to all threads, not just 1122 * the current team we have to check this even if we are in a 1123 * serialized team */ 1124 /* also, even if we are the uber thread, we still have to conduct the lock, 1125 * as we have to contend with sibling threads */ 1126 1127 #if USE_ITT_BUILD 1128 __kmp_itt_critical_acquiring( lck ); 1129 #endif /* USE_ITT_BUILD */ 1130 // Value of 'crit' should be good for using as a critical_id of the critical section directive. 1131 __kmp_acquire_user_lock_with_checks( lck, global_tid ); 1132 1133 #if USE_ITT_BUILD 1134 __kmp_itt_critical_acquired( lck ); 1135 #endif /* USE_ITT_BUILD */ 1136 1137 KMP_START_EXPLICIT_TIMER(OMP_critical); 1138 KA_TRACE( 15, ("__kmpc_critical: done T#%d\n", global_tid )); 1139 #endif // KMP_USE_DYNAMIC_LOCK 1140 } 1141 1142 #if KMP_USE_DYNAMIC_LOCK 1143 1144 // Converts the given hint to an internal lock implementation 1145 static __forceinline kmp_dyna_lockseq_t 1146 __kmp_map_hint_to_lock(uintptr_t hint) 1147 { 1148 #if KMP_USE_TSX 1149 # define KMP_TSX_LOCK(seq) lockseq_##seq 1150 #else 1151 # define KMP_TSX_LOCK(seq) __kmp_user_lock_seq 1152 #endif 1153 1154 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 1155 # define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm) 1156 #else 1157 # define KMP_CPUINFO_RTM 0 1158 #endif 1159 1160 // Hints that do not require further logic 1161 if (hint & kmp_lock_hint_hle) 1162 return KMP_TSX_LOCK(hle); 1163 if (hint & kmp_lock_hint_rtm) 1164 return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm): __kmp_user_lock_seq; 1165 if (hint & kmp_lock_hint_adaptive) 1166 return KMP_CPUINFO_RTM ? 
KMP_TSX_LOCK(adaptive): __kmp_user_lock_seq; 1167 1168 // Rule out conflicting hints first by returning the default lock 1169 if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended)) 1170 return __kmp_user_lock_seq; 1171 if ((hint & omp_lock_hint_speculative) && (hint & omp_lock_hint_nonspeculative)) 1172 return __kmp_user_lock_seq; 1173 1174 // Do not even consider speculation when it appears to be contended 1175 if (hint & omp_lock_hint_contended) 1176 return lockseq_queuing; 1177 1178 // Uncontended lock without speculation 1179 if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative)) 1180 return lockseq_tas; 1181 1182 // HLE lock for speculation 1183 if (hint & omp_lock_hint_speculative) 1184 return KMP_TSX_LOCK(hle); 1185 1186 return __kmp_user_lock_seq; 1187 } 1188 1189 /*! 1190 @ingroup WORK_SHARING 1191 @param loc source location information. 1192 @param global_tid global thread number. 1193 @param crit identity of the critical section. This could be a pointer to a lock associated with the critical section, 1194 or some other suitably unique value. 1195 @param hint the lock hint. 1196 1197 Enter code protected by a `critical` construct with a hint. The hint value is used to suggest a lock implementation. 1198 This function blocks until the executing thread can enter the critical section unless the hint suggests use of 1199 speculative execution and the hardware supports it. 1200 */ 1201 void 1202 __kmpc_critical_with_hint( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit, uintptr_t hint ) 1203 { 1204 KMP_COUNT_BLOCK(OMP_CRITICAL); 1205 kmp_user_lock_p lck; 1206 1207 KC_TRACE( 10, ("__kmpc_critical: called T#%d\n", global_tid ) ); 1208 1209 kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit; 1210 // Check if it is initialized. 1211 if (*lk == 0) { 1212 kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint); 1213 if (KMP_IS_D_LOCK(lckseq)) { 1214 KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0, KMP_GET_D_TAG(lckseq)); 1215 } else { 1216 __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq)); 1217 } 1218 } 1219 // Branch for accessing the actual lock object and set operation. This branching is inevitable since 1220 // this lock initialization does not follow the normal dispatch path (lock table is not used). 
1221 if (KMP_EXTRACT_D_TAG(lk) != 0) { 1222 lck = (kmp_user_lock_p)lk; 1223 if (__kmp_env_consistency_check) { 1224 __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_map_hint_to_lock(hint)); 1225 } 1226 # if USE_ITT_BUILD 1227 __kmp_itt_critical_acquiring(lck); 1228 # endif 1229 # if KMP_USE_INLINED_TAS 1230 if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) { 1231 KMP_ACQUIRE_TAS_LOCK(lck, global_tid); 1232 } else 1233 # elif KMP_USE_INLINED_FUTEX 1234 if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) { 1235 KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid); 1236 } else 1237 # endif 1238 { 1239 KMP_D_LOCK_FUNC(lk, set)(lk, global_tid); 1240 } 1241 } else { 1242 kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk); 1243 lck = ilk->lock; 1244 if (__kmp_env_consistency_check) { 1245 __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_map_hint_to_lock(hint)); 1246 } 1247 # if USE_ITT_BUILD 1248 __kmp_itt_critical_acquiring(lck); 1249 # endif 1250 KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid); 1251 } 1252 1253 #if USE_ITT_BUILD 1254 __kmp_itt_critical_acquired( lck ); 1255 #endif /* USE_ITT_BUILD */ 1256 1257 KMP_PUSH_PARTITIONED_TIMER(OMP_critical); 1258 KA_TRACE( 15, ("__kmpc_critical: done T#%d\n", global_tid )); 1259 } // __kmpc_critical_with_hint 1260 1261 #endif // KMP_USE_DYNAMIC_LOCK 1262 1263 /*! 1264 @ingroup WORK_SHARING 1265 @param loc source location information. 1266 @param global_tid global thread number . 1267 @param crit identity of the critical section. This could be a pointer to a lock associated with the critical section, or 1268 some other suitably unique value. 1269 1270 Leave a critical section, releasing any lock that was held during its execution. 1271 */ 1272 void 1273 __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid, kmp_critical_name *crit) 1274 { 1275 kmp_user_lock_p lck; 1276 1277 KC_TRACE( 10, ("__kmpc_end_critical: called T#%d\n", global_tid )); 1278 1279 #if KMP_USE_DYNAMIC_LOCK 1280 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) { 1281 lck = (kmp_user_lock_p)crit; 1282 KMP_ASSERT(lck != NULL); 1283 if (__kmp_env_consistency_check) { 1284 __kmp_pop_sync(global_tid, ct_critical, loc); 1285 } 1286 # if USE_ITT_BUILD 1287 __kmp_itt_critical_releasing( lck ); 1288 # endif 1289 # if KMP_USE_INLINED_TAS 1290 if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) { 1291 KMP_RELEASE_TAS_LOCK(lck, global_tid); 1292 } else 1293 # elif KMP_USE_INLINED_FUTEX 1294 if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) { 1295 KMP_RELEASE_FUTEX_LOCK(lck, global_tid); 1296 } else 1297 # endif 1298 { 1299 KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid); 1300 } 1301 } else { 1302 kmp_indirect_lock_t *ilk = (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit)); 1303 KMP_ASSERT(ilk != NULL); 1304 lck = ilk->lock; 1305 if (__kmp_env_consistency_check) { 1306 __kmp_pop_sync(global_tid, ct_critical, loc); 1307 } 1308 # if USE_ITT_BUILD 1309 __kmp_itt_critical_releasing( lck ); 1310 # endif 1311 KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid); 1312 } 1313 1314 #else // KMP_USE_DYNAMIC_LOCK 1315 1316 if ( ( __kmp_user_lock_kind == lk_tas ) 1317 && ( sizeof( lck->tas.lk.poll ) <= OMP_CRITICAL_SIZE ) ) { 1318 lck = (kmp_user_lock_p)crit; 1319 } 1320 #if KMP_USE_FUTEX 1321 else if ( ( __kmp_user_lock_kind == lk_futex ) 1322 && ( sizeof( lck->futex.lk.poll ) <= OMP_CRITICAL_SIZE ) ) { 1323 lck = (kmp_user_lock_p)crit; 1324 } 1325 #endif 1326 else { // ticket, queuing or drdpa 1327 lck 
= (kmp_user_lock_p) TCR_PTR(*((kmp_user_lock_p *)crit)); 1328 } 1329 1330 KMP_ASSERT(lck != NULL); 1331 1332 if ( __kmp_env_consistency_check ) 1333 __kmp_pop_sync( global_tid, ct_critical, loc ); 1334 1335 #if USE_ITT_BUILD 1336 __kmp_itt_critical_releasing( lck ); 1337 #endif /* USE_ITT_BUILD */ 1338 // Value of 'crit' should be good for using as a critical_id of the critical section directive. 1339 __kmp_release_user_lock_with_checks( lck, global_tid ); 1340 1341 #if OMPT_SUPPORT && OMPT_BLAME 1342 if (ompt_enabled && 1343 ompt_callbacks.ompt_callback(ompt_event_release_critical)) { 1344 ompt_callbacks.ompt_callback(ompt_event_release_critical)( 1345 (uint64_t) lck); 1346 } 1347 #endif 1348 1349 #endif // KMP_USE_DYNAMIC_LOCK 1350 KMP_POP_PARTITIONED_TIMER(); 1351 KA_TRACE( 15, ("__kmpc_end_critical: done T#%d\n", global_tid )); 1352 } 1353 1354 /*! 1355 @ingroup SYNCHRONIZATION 1356 @param loc source location information 1357 @param global_tid thread id. 1358 @return one if the thread should execute the master block, zero otherwise 1359 1360 Start execution of a combined barrier and master. The barrier is executed inside this function. 1361 */ 1362 kmp_int32 1363 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) 1364 { 1365 int status; 1366 1367 KC_TRACE( 10, ("__kmpc_barrier_master: called T#%d\n", global_tid ) ); 1368 1369 if (! TCR_4(__kmp_init_parallel)) 1370 __kmp_parallel_initialize(); 1371 1372 if ( __kmp_env_consistency_check ) 1373 __kmp_check_barrier( global_tid, ct_barrier, loc ); 1374 1375 #if USE_ITT_NOTIFY 1376 __kmp_threads[global_tid]->th.th_ident = loc; 1377 #endif 1378 status = __kmp_barrier( bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL ); 1379 1380 return (status != 0) ? 0 : 1; 1381 } 1382 1383 /*! 1384 @ingroup SYNCHRONIZATION 1385 @param loc source location information 1386 @param global_tid thread id. 1387 1388 Complete the execution of a combined barrier and master. This function should 1389 only be called at the completion of the <tt>master</tt> code. Other threads will 1390 still be waiting at the barrier and this call releases them. 1391 */ 1392 void 1393 __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) 1394 { 1395 KC_TRACE( 10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid )); 1396 1397 __kmp_end_split_barrier ( bs_plain_barrier, global_tid ); 1398 } 1399 1400 /*! 1401 @ingroup SYNCHRONIZATION 1402 @param loc source location information 1403 @param global_tid thread id. 1404 @return one if the thread should execute the master block, zero otherwise 1405 1406 Start execution of a combined barrier and master(nowait) construct. 1407 The barrier is executed inside this function. 1408 There is no equivalent "end" function, since the 1409 */ 1410 kmp_int32 1411 __kmpc_barrier_master_nowait( ident_t * loc, kmp_int32 global_tid ) 1412 { 1413 kmp_int32 ret; 1414 1415 KC_TRACE( 10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid )); 1416 1417 if (! TCR_4(__kmp_init_parallel)) 1418 __kmp_parallel_initialize(); 1419 1420 if ( __kmp_env_consistency_check ) { 1421 if ( loc == 0 ) { 1422 KMP_WARNING( ConstructIdentInvalid ); // ??? What does it mean for the user? 
1423 } 1424 __kmp_check_barrier( global_tid, ct_barrier, loc ); 1425 } 1426 1427 #if USE_ITT_NOTIFY 1428 __kmp_threads[global_tid]->th.th_ident = loc; 1429 #endif 1430 __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL ); 1431 1432 ret = __kmpc_master (loc, global_tid); 1433 1434 if ( __kmp_env_consistency_check ) { 1435 /* there's no __kmpc_end_master called; so the (stats) */ 1436 /* actions of __kmpc_end_master are done here */ 1437 1438 if ( global_tid < 0 ) { 1439 KMP_WARNING( ThreadIdentInvalid ); 1440 } 1441 if (ret) { 1442 /* only one thread should do the pop since only */ 1443 /* one did the push (see __kmpc_master()) */ 1444 1445 __kmp_pop_sync( global_tid, ct_master, loc ); 1446 } 1447 } 1448 1449 return (ret); 1450 } 1451 1452 /* The BARRIER for a SINGLE process section is always explicit */ 1453 /*! 1454 @ingroup WORK_SHARING 1455 @param loc source location information 1456 @param global_tid global thread number 1457 @return One if this thread should execute the single construct, zero otherwise. 1458 1459 Test whether to execute a <tt>single</tt> construct. 1460 There are no implicit barriers in the two "single" calls, rather the compiler should 1461 introduce an explicit barrier if it is required. 1462 */ 1463 1464 kmp_int32 1465 __kmpc_single(ident_t *loc, kmp_int32 global_tid) 1466 { 1467 kmp_int32 rc = __kmp_enter_single( global_tid, loc, TRUE ); 1468 1469 if (rc) { 1470 // We are going to execute the single statement, so we should count it. 1471 KMP_COUNT_BLOCK(OMP_SINGLE); 1472 KMP_PUSH_PARTITIONED_TIMER(OMP_single); 1473 } 1474 1475 #if OMPT_SUPPORT && OMPT_TRACE 1476 kmp_info_t *this_thr = __kmp_threads[ global_tid ]; 1477 kmp_team_t *team = this_thr -> th.th_team; 1478 int tid = __kmp_tid_from_gtid( global_tid ); 1479 1480 if (ompt_enabled) { 1481 if (rc) { 1482 if (ompt_callbacks.ompt_callback(ompt_event_single_in_block_begin)) { 1483 ompt_callbacks.ompt_callback(ompt_event_single_in_block_begin)( 1484 team->t.ompt_team_info.parallel_id, 1485 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id, 1486 team->t.ompt_team_info.microtask); 1487 } 1488 } else { 1489 if (ompt_callbacks.ompt_callback(ompt_event_single_others_begin)) { 1490 ompt_callbacks.ompt_callback(ompt_event_single_others_begin)( 1491 team->t.ompt_team_info.parallel_id, 1492 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id); 1493 } 1494 this_thr->th.ompt_thread_info.state = ompt_state_wait_single; 1495 } 1496 } 1497 #endif 1498 1499 return rc; 1500 } 1501 1502 /*! 1503 @ingroup WORK_SHARING 1504 @param loc source location information 1505 @param global_tid global thread number 1506 1507 Mark the end of a <tt>single</tt> construct. This function should 1508 only be called by the thread that executed the block of code protected 1509 by the `single` construct. 1510 */ 1511 void 1512 __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) 1513 { 1514 __kmp_exit_single( global_tid ); 1515 KMP_POP_PARTITIONED_TIMER(); 1516 1517 #if OMPT_SUPPORT && OMPT_TRACE 1518 kmp_info_t *this_thr = __kmp_threads[ global_tid ]; 1519 kmp_team_t *team = this_thr -> th.th_team; 1520 int tid = __kmp_tid_from_gtid( global_tid ); 1521 1522 if (ompt_enabled && 1523 ompt_callbacks.ompt_callback(ompt_event_single_in_block_end)) { 1524 ompt_callbacks.ompt_callback(ompt_event_single_in_block_end)( 1525 team->t.ompt_team_info.parallel_id, 1526 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id); 1527 } 1528 #endif 1529 } 1530 1531 /*! 
1532 @ingroup WORK_SHARING 1533 @param loc Source location 1534 @param global_tid Global thread id 1535 1536 Mark the end of a statically scheduled loop. 1537 */ 1538 void 1539 __kmpc_for_static_fini( ident_t *loc, kmp_int32 global_tid ) 1540 { 1541 KE_TRACE( 10, ("__kmpc_for_static_fini called T#%d\n", global_tid)); 1542 1543 #if OMPT_SUPPORT && OMPT_TRACE 1544 if (ompt_enabled && 1545 ompt_callbacks.ompt_callback(ompt_event_loop_end)) { 1546 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 1547 ompt_task_info_t *task_info = __ompt_get_taskinfo(0); 1548 ompt_callbacks.ompt_callback(ompt_event_loop_end)( 1549 team_info->parallel_id, task_info->task_id); 1550 } 1551 #endif 1552 1553 if ( __kmp_env_consistency_check ) 1554 __kmp_pop_workshare( global_tid, ct_pdo, loc ); 1555 } 1556 1557 /* 1558 * User routines which take C-style arguments (call by value) 1559 * different from the Fortran equivalent routines 1560 */ 1561 1562 void 1563 ompc_set_num_threads( int arg ) 1564 { 1565 // !!!!! TODO: check the per-task binding 1566 __kmp_set_num_threads( arg, __kmp_entry_gtid() ); 1567 } 1568 1569 void 1570 ompc_set_dynamic( int flag ) 1571 { 1572 kmp_info_t *thread; 1573 1574 /* For the thread-private implementation of the internal controls */ 1575 thread = __kmp_entry_thread(); 1576 1577 __kmp_save_internal_controls( thread ); 1578 1579 set__dynamic( thread, flag ? TRUE : FALSE ); 1580 } 1581 1582 void 1583 ompc_set_nested( int flag ) 1584 { 1585 kmp_info_t *thread; 1586 1587 /* For the thread-private internal controls implementation */ 1588 thread = __kmp_entry_thread(); 1589 1590 __kmp_save_internal_controls( thread ); 1591 1592 set__nested( thread, flag ? TRUE : FALSE ); 1593 } 1594 1595 void 1596 ompc_set_max_active_levels( int max_active_levels ) 1597 { 1598 /* TO DO */ 1599 /* we want per-task implementation of this internal control */ 1600 1601 /* For the per-thread internal controls implementation */ 1602 __kmp_set_max_active_levels( __kmp_entry_gtid(), max_active_levels ); 1603 } 1604 1605 void 1606 ompc_set_schedule( omp_sched_t kind, int modifier ) 1607 { 1608 // !!!!! 
TODO: check the per-task binding 1609 __kmp_set_schedule( __kmp_entry_gtid(), ( kmp_sched_t ) kind, modifier ); 1610 } 1611 1612 int 1613 ompc_get_ancestor_thread_num( int level ) 1614 { 1615 return __kmp_get_ancestor_thread_num( __kmp_entry_gtid(), level ); 1616 } 1617 1618 int 1619 ompc_get_team_size( int level ) 1620 { 1621 return __kmp_get_team_size( __kmp_entry_gtid(), level ); 1622 } 1623 1624 void 1625 kmpc_set_stacksize( int arg ) 1626 { 1627 // __kmp_aux_set_stacksize initializes the library if needed 1628 __kmp_aux_set_stacksize( arg ); 1629 } 1630 1631 void 1632 kmpc_set_stacksize_s( size_t arg ) 1633 { 1634 // __kmp_aux_set_stacksize initializes the library if needed 1635 __kmp_aux_set_stacksize( arg ); 1636 } 1637 1638 void 1639 kmpc_set_blocktime( int arg ) 1640 { 1641 int gtid, tid; 1642 kmp_info_t *thread; 1643 1644 gtid = __kmp_entry_gtid(); 1645 tid = __kmp_tid_from_gtid(gtid); 1646 thread = __kmp_thread_from_gtid(gtid); 1647 1648 __kmp_aux_set_blocktime( arg, thread, tid ); 1649 } 1650 1651 void 1652 kmpc_set_library( int arg ) 1653 { 1654 // __kmp_user_set_library initializes the library if needed 1655 __kmp_user_set_library( (enum library_type)arg ); 1656 } 1657 1658 void 1659 kmpc_set_defaults( char const * str ) 1660 { 1661 // __kmp_aux_set_defaults initializes the library if needed 1662 __kmp_aux_set_defaults( str, KMP_STRLEN( str ) ); 1663 } 1664 1665 void 1666 kmpc_set_disp_num_buffers( int arg ) 1667 { 1668 // ignore after initialization because some teams have already 1669 // allocated dispatch buffers 1670 if( __kmp_init_serial == 0 && arg > 0 ) 1671 __kmp_dispatch_num_buffers = arg; 1672 } 1673 1674 int 1675 kmpc_set_affinity_mask_proc( int proc, void **mask ) 1676 { 1677 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED 1678 return -1; 1679 #else 1680 if ( ! TCR_4(__kmp_init_middle) ) { 1681 __kmp_middle_initialize(); 1682 } 1683 return __kmp_aux_set_affinity_mask_proc( proc, mask ); 1684 #endif 1685 } 1686 1687 int 1688 kmpc_unset_affinity_mask_proc( int proc, void **mask ) 1689 { 1690 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED 1691 return -1; 1692 #else 1693 if ( ! TCR_4(__kmp_init_middle) ) { 1694 __kmp_middle_initialize(); 1695 } 1696 return __kmp_aux_unset_affinity_mask_proc( proc, mask ); 1697 #endif 1698 } 1699 1700 int 1701 kmpc_get_affinity_mask_proc( int proc, void **mask ) 1702 { 1703 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED 1704 return -1; 1705 #else 1706 if ( ! TCR_4(__kmp_init_middle) ) { 1707 __kmp_middle_initialize(); 1708 } 1709 return __kmp_aux_get_affinity_mask_proc( proc, mask ); 1710 #endif 1711 } 1712 1713 1714 /* -------------------------------------------------------------------------- */ 1715 /*! 1716 @ingroup THREADPRIVATE 1717 @param loc source location information 1718 @param gtid global thread number 1719 @param cpy_size size of the cpy_data buffer 1720 @param cpy_data pointer to data to be copied 1721 @param cpy_func helper function to call for copying data 1722 @param didit flag variable: 1=single thread; 0=not single thread 1723 1724 __kmpc_copyprivate implements the interface for the private data broadcast needed for 1725 the copyprivate clause associated with a single region in an OpenMP<sup>*</sup> program (both C and Fortran). 1726 All threads participating in the parallel region call this routine. 1727 One of the threads (called the single thread) should have the <tt>didit</tt> variable set to 1 1728 and all other threads should have that variable set to 0. 
1729 All threads pass a pointer to a data buffer (cpy_data) that they have built. 1730 1731 The OpenMP specification forbids the use of nowait on the single region when a copyprivate 1732 clause is present. However, @ref __kmpc_copyprivate implements a barrier internally to avoid 1733 race conditions, so the code generation for the single region should avoid generating a barrier 1734 after the call to @ref __kmpc_copyprivate. 1735 1736 The <tt>gtid</tt> parameter is the global thread id for the current thread. 1737 The <tt>loc</tt> parameter is a pointer to source location information. 1738 1739 Internal implementation: The single thread will first copy its descriptor address (cpy_data) 1740 to a team-private location, then the other threads will each call the function pointed to by 1741 the parameter cpy_func, which carries out the copy by copying the data using the cpy_data buffer. 1742 1743 The cpy_func routine used for the copy and the contents of the data area defined by cpy_data 1744 and cpy_size may be built in any fashion that will allow the copy to be done. For instance, 1745 the cpy_data buffer can hold the actual data to be copied or it may hold a list of pointers 1746 to the data. The cpy_func routine must interpret the cpy_data buffer appropriately. 1747 1748 The interface to cpy_func is as follows: 1749 @code 1750 void cpy_func( void *destination, void *source ) 1751 @endcode 1752 where void *destination is the cpy_data pointer for the thread being copied to 1753 and void *source is the cpy_data pointer for the thread being copied from. 1754 */ 1755 void 1756 __kmpc_copyprivate( ident_t *loc, kmp_int32 gtid, size_t cpy_size, void *cpy_data, void(*cpy_func)(void*,void*), kmp_int32 didit ) 1757 { 1758 void **data_ptr; 1759 1760 KC_TRACE( 10, ("__kmpc_copyprivate: called T#%d\n", gtid )); 1761 1762 KMP_MB(); 1763 1764 data_ptr = & __kmp_team_from_gtid( gtid )->t.t_copypriv_data; 1765 1766 if ( __kmp_env_consistency_check ) { 1767 if ( loc == 0 ) { 1768 KMP_WARNING( ConstructIdentInvalid ); 1769 } 1770 } 1771 1772 /* ToDo: Optimize the following two barriers into some kind of split barrier */ 1773 1774 if (didit) *data_ptr = cpy_data; 1775 1776 /* This barrier is not a barrier region boundary */ 1777 #if USE_ITT_NOTIFY 1778 __kmp_threads[gtid]->th.th_ident = loc; 1779 #endif 1780 __kmp_barrier( bs_plain_barrier, gtid, FALSE , 0, NULL, NULL ); 1781 1782 if (! didit) (*cpy_func)( cpy_data, *data_ptr ); 1783 1784 /* Consider next barrier the user-visible barrier for barrier region boundaries */ 1785 /* Nesting checks are already handled by the single construct checks */ 1786 1787 #if USE_ITT_NOTIFY 1788 __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g. 
tasks can overwrite the location) 1789 #endif 1790 __kmp_barrier( bs_plain_barrier, gtid, FALSE , 0, NULL, NULL ); 1791 } 1792 1793 /* -------------------------------------------------------------------------- */ 1794 1795 #define INIT_LOCK __kmp_init_user_lock_with_checks 1796 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks 1797 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks 1798 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed 1799 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks 1800 #define ACQUIRE_NESTED_LOCK_TIMED __kmp_acquire_nested_user_lock_with_checks_timed 1801 #define RELEASE_LOCK __kmp_release_user_lock_with_checks 1802 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks 1803 #define TEST_LOCK __kmp_test_user_lock_with_checks 1804 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks 1805 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks 1806 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks 1807 1808 1809 /* 1810 * TODO: Make check abort messages use location info & pass it 1811 * into with_checks routines 1812 */ 1813 1814 #if KMP_USE_DYNAMIC_LOCK 1815 1816 // internal lock initializer 1817 static __forceinline void 1818 __kmp_init_lock_with_hint(ident_t *loc, void **lock, kmp_dyna_lockseq_t seq) 1819 { 1820 if (KMP_IS_D_LOCK(seq)) { 1821 KMP_INIT_D_LOCK(lock, seq); 1822 #if USE_ITT_BUILD 1823 __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL); 1824 #endif 1825 } else { 1826 KMP_INIT_I_LOCK(lock, seq); 1827 #if USE_ITT_BUILD 1828 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock); 1829 __kmp_itt_lock_creating(ilk->lock, loc); 1830 #endif 1831 } 1832 } 1833 1834 // internal nest lock initializer 1835 static __forceinline void 1836 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock, kmp_dyna_lockseq_t seq) 1837 { 1838 #if KMP_USE_TSX 1839 // Don't have nested lock implementation for speculative locks 1840 if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive) 1841 seq = __kmp_user_lock_seq; 1842 #endif 1843 switch (seq) { 1844 case lockseq_tas: 1845 seq = lockseq_nested_tas; 1846 break; 1847 #if KMP_USE_FUTEX 1848 case lockseq_futex: 1849 seq = lockseq_nested_futex; 1850 break; 1851 #endif 1852 case lockseq_ticket: 1853 seq = lockseq_nested_ticket; 1854 break; 1855 case lockseq_queuing: 1856 seq = lockseq_nested_queuing; 1857 break; 1858 case lockseq_drdpa: 1859 seq = lockseq_nested_drdpa; 1860 break; 1861 default: 1862 seq = lockseq_nested_queuing; 1863 } 1864 KMP_INIT_I_LOCK(lock, seq); 1865 #if USE_ITT_BUILD 1866 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock); 1867 __kmp_itt_lock_creating(ilk->lock, loc); 1868 #endif 1869 } 1870 1871 /* initialize the lock with a hint */ 1872 void 1873 __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock, uintptr_t hint) 1874 { 1875 KMP_DEBUG_ASSERT(__kmp_init_serial); 1876 if (__kmp_env_consistency_check && user_lock == NULL) { 1877 KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint"); 1878 } 1879 1880 __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint)); 1881 } 1882 1883 /* initialize the lock with a hint */ 1884 void 1885 __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock, uintptr_t hint) 1886 { 1887 KMP_DEBUG_ASSERT(__kmp_init_serial); 1888 if (__kmp_env_consistency_check && user_lock == NULL) { 1889 KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint"); 1890 } 1891 1892 
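    // Map the user-supplied hint to a lock sequence; the internal initializer above then
    // selects the corresponding nested variant of that sequence (e.g. lockseq_nested_ticket)
    // before creating the lock.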
__kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint)); 1893 } 1894 1895 #endif // KMP_USE_DYNAMIC_LOCK 1896 1897 /* initialize the lock */ 1898 void 1899 __kmpc_init_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) { 1900 #if KMP_USE_DYNAMIC_LOCK 1901 KMP_DEBUG_ASSERT(__kmp_init_serial); 1902 if (__kmp_env_consistency_check && user_lock == NULL) { 1903 KMP_FATAL(LockIsUninitialized, "omp_init_lock"); 1904 } 1905 __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq); 1906 1907 #else // KMP_USE_DYNAMIC_LOCK 1908 1909 static char const * const func = "omp_init_lock"; 1910 kmp_user_lock_p lck; 1911 KMP_DEBUG_ASSERT( __kmp_init_serial ); 1912 1913 if ( __kmp_env_consistency_check ) { 1914 if ( user_lock == NULL ) { 1915 KMP_FATAL( LockIsUninitialized, func ); 1916 } 1917 } 1918 1919 KMP_CHECK_USER_LOCK_INIT(); 1920 1921 if ( ( __kmp_user_lock_kind == lk_tas ) 1922 && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) { 1923 lck = (kmp_user_lock_p)user_lock; 1924 } 1925 #if KMP_USE_FUTEX 1926 else if ( ( __kmp_user_lock_kind == lk_futex ) 1927 && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) { 1928 lck = (kmp_user_lock_p)user_lock; 1929 } 1930 #endif 1931 else { 1932 lck = __kmp_user_lock_allocate( user_lock, gtid, 0 ); 1933 } 1934 INIT_LOCK( lck ); 1935 __kmp_set_user_lock_location( lck, loc ); 1936 1937 #if OMPT_SUPPORT && OMPT_TRACE 1938 if (ompt_enabled && 1939 ompt_callbacks.ompt_callback(ompt_event_init_lock)) { 1940 ompt_callbacks.ompt_callback(ompt_event_init_lock)((uint64_t) lck); 1941 } 1942 #endif 1943 1944 #if USE_ITT_BUILD 1945 __kmp_itt_lock_creating( lck ); 1946 #endif /* USE_ITT_BUILD */ 1947 1948 #endif // KMP_USE_DYNAMIC_LOCK 1949 } // __kmpc_init_lock 1950 1951 /* initialize the lock */ 1952 void 1953 __kmpc_init_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) { 1954 #if KMP_USE_DYNAMIC_LOCK 1955 1956 KMP_DEBUG_ASSERT(__kmp_init_serial); 1957 if (__kmp_env_consistency_check && user_lock == NULL) { 1958 KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock"); 1959 } 1960 __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq); 1961 1962 #else // KMP_USE_DYNAMIC_LOCK 1963 1964 static char const * const func = "omp_init_nest_lock"; 1965 kmp_user_lock_p lck; 1966 KMP_DEBUG_ASSERT( __kmp_init_serial ); 1967 1968 if ( __kmp_env_consistency_check ) { 1969 if ( user_lock == NULL ) { 1970 KMP_FATAL( LockIsUninitialized, func ); 1971 } 1972 } 1973 1974 KMP_CHECK_USER_LOCK_INIT(); 1975 1976 if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll ) 1977 + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) { 1978 lck = (kmp_user_lock_p)user_lock; 1979 } 1980 #if KMP_USE_FUTEX 1981 else if ( ( __kmp_user_lock_kind == lk_futex ) 1982 && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked ) 1983 <= OMP_NEST_LOCK_T_SIZE ) ) { 1984 lck = (kmp_user_lock_p)user_lock; 1985 } 1986 #endif 1987 else { 1988 lck = __kmp_user_lock_allocate( user_lock, gtid, 0 ); 1989 } 1990 1991 INIT_NESTED_LOCK( lck ); 1992 __kmp_set_user_lock_location( lck, loc ); 1993 1994 #if OMPT_SUPPORT && OMPT_TRACE 1995 if (ompt_enabled && 1996 ompt_callbacks.ompt_callback(ompt_event_init_nest_lock)) { 1997 ompt_callbacks.ompt_callback(ompt_event_init_nest_lock)((uint64_t) lck); 1998 } 1999 #endif 2000 2001 #if USE_ITT_BUILD 2002 __kmp_itt_lock_creating( lck ); 2003 #endif /* USE_ITT_BUILD */ 2004 2005 #endif // KMP_USE_DYNAMIC_LOCK 2006 } // __kmpc_init_nest_lock 2007 2008 void 2009 __kmpc_destroy_lock( ident_t * 
loc, kmp_int32 gtid, void ** user_lock ) { 2010 #if KMP_USE_DYNAMIC_LOCK 2011 2012 # if USE_ITT_BUILD 2013 kmp_user_lock_p lck; 2014 if (KMP_EXTRACT_D_TAG(user_lock) == 0) { 2015 lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock; 2016 } else { 2017 lck = (kmp_user_lock_p)user_lock; 2018 } 2019 __kmp_itt_lock_destroyed(lck); 2020 # endif 2021 KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock); 2022 #else 2023 kmp_user_lock_p lck; 2024 2025 if ( ( __kmp_user_lock_kind == lk_tas ) 2026 && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) { 2027 lck = (kmp_user_lock_p)user_lock; 2028 } 2029 #if KMP_USE_FUTEX 2030 else if ( ( __kmp_user_lock_kind == lk_futex ) 2031 && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) { 2032 lck = (kmp_user_lock_p)user_lock; 2033 } 2034 #endif 2035 else { 2036 lck = __kmp_lookup_user_lock( user_lock, "omp_destroy_lock" ); 2037 } 2038 2039 #if OMPT_SUPPORT && OMPT_TRACE 2040 if (ompt_enabled && 2041 ompt_callbacks.ompt_callback(ompt_event_destroy_lock)) { 2042 ompt_callbacks.ompt_callback(ompt_event_destroy_lock)((uint64_t) lck); 2043 } 2044 #endif 2045 2046 #if USE_ITT_BUILD 2047 __kmp_itt_lock_destroyed( lck ); 2048 #endif /* USE_ITT_BUILD */ 2049 DESTROY_LOCK( lck ); 2050 2051 if ( ( __kmp_user_lock_kind == lk_tas ) 2052 && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) { 2053 ; 2054 } 2055 #if KMP_USE_FUTEX 2056 else if ( ( __kmp_user_lock_kind == lk_futex ) 2057 && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) { 2058 ; 2059 } 2060 #endif 2061 else { 2062 __kmp_user_lock_free( user_lock, gtid, lck ); 2063 } 2064 #endif // KMP_USE_DYNAMIC_LOCK 2065 } // __kmpc_destroy_lock 2066 2067 /* destroy the lock */ 2068 void 2069 __kmpc_destroy_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) { 2070 #if KMP_USE_DYNAMIC_LOCK 2071 2072 # if USE_ITT_BUILD 2073 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock); 2074 __kmp_itt_lock_destroyed(ilk->lock); 2075 # endif 2076 KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock); 2077 2078 #else // KMP_USE_DYNAMIC_LOCK 2079 2080 kmp_user_lock_p lck; 2081 2082 if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll ) 2083 + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) { 2084 lck = (kmp_user_lock_p)user_lock; 2085 } 2086 #if KMP_USE_FUTEX 2087 else if ( ( __kmp_user_lock_kind == lk_futex ) 2088 && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked ) 2089 <= OMP_NEST_LOCK_T_SIZE ) ) { 2090 lck = (kmp_user_lock_p)user_lock; 2091 } 2092 #endif 2093 else { 2094 lck = __kmp_lookup_user_lock( user_lock, "omp_destroy_nest_lock" ); 2095 } 2096 2097 #if OMPT_SUPPORT && OMPT_TRACE 2098 if (ompt_enabled && 2099 ompt_callbacks.ompt_callback(ompt_event_destroy_nest_lock)) { 2100 ompt_callbacks.ompt_callback(ompt_event_destroy_nest_lock)((uint64_t) lck); 2101 } 2102 #endif 2103 2104 #if USE_ITT_BUILD 2105 __kmp_itt_lock_destroyed( lck ); 2106 #endif /* USE_ITT_BUILD */ 2107 2108 DESTROY_NESTED_LOCK( lck ); 2109 2110 if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll ) 2111 + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) { 2112 ; 2113 } 2114 #if KMP_USE_FUTEX 2115 else if ( ( __kmp_user_lock_kind == lk_futex ) 2116 && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked ) 2117 <= OMP_NEST_LOCK_T_SIZE ) ) { 2118 ; 2119 } 2120 #endif 2121 else { 2122 __kmp_user_lock_free( user_lock, gtid, lck ); 2123 } 2124 #endif // KMP_USE_DYNAMIC_LOCK 2125 } // 
__kmpc_destroy_nest_lock 2126 2127 void 2128 __kmpc_set_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) { 2129 KMP_COUNT_BLOCK(OMP_set_lock); 2130 #if KMP_USE_DYNAMIC_LOCK 2131 int tag = KMP_EXTRACT_D_TAG(user_lock); 2132 # if USE_ITT_BUILD 2133 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); // itt function will get to the right lock object. 2134 # endif 2135 # if KMP_USE_INLINED_TAS 2136 if (tag == locktag_tas && !__kmp_env_consistency_check) { 2137 KMP_ACQUIRE_TAS_LOCK(user_lock, gtid); 2138 } else 2139 # elif KMP_USE_INLINED_FUTEX 2140 if (tag == locktag_futex && !__kmp_env_consistency_check) { 2141 KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid); 2142 } else 2143 # endif 2144 { 2145 __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid); 2146 } 2147 # if USE_ITT_BUILD 2148 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 2149 # endif 2150 2151 #else // KMP_USE_DYNAMIC_LOCK 2152 2153 kmp_user_lock_p lck; 2154 2155 if ( ( __kmp_user_lock_kind == lk_tas ) 2156 && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) { 2157 lck = (kmp_user_lock_p)user_lock; 2158 } 2159 #if KMP_USE_FUTEX 2160 else if ( ( __kmp_user_lock_kind == lk_futex ) 2161 && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) { 2162 lck = (kmp_user_lock_p)user_lock; 2163 } 2164 #endif 2165 else { 2166 lck = __kmp_lookup_user_lock( user_lock, "omp_set_lock" ); 2167 } 2168 2169 #if USE_ITT_BUILD 2170 __kmp_itt_lock_acquiring( lck ); 2171 #endif /* USE_ITT_BUILD */ 2172 2173 ACQUIRE_LOCK( lck, gtid ); 2174 2175 #if USE_ITT_BUILD 2176 __kmp_itt_lock_acquired( lck ); 2177 #endif /* USE_ITT_BUILD */ 2178 2179 #if OMPT_SUPPORT && OMPT_TRACE 2180 if (ompt_enabled && 2181 ompt_callbacks.ompt_callback(ompt_event_acquired_lock)) { 2182 ompt_callbacks.ompt_callback(ompt_event_acquired_lock)((uint64_t) lck); 2183 } 2184 #endif 2185 2186 #endif // KMP_USE_DYNAMIC_LOCK 2187 } 2188 2189 void 2190 __kmpc_set_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) { 2191 #if KMP_USE_DYNAMIC_LOCK 2192 2193 # if USE_ITT_BUILD 2194 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); 2195 # endif 2196 KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid); 2197 # if USE_ITT_BUILD 2198 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 2199 #endif 2200 2201 #if OMPT_SUPPORT && OMPT_TRACE 2202 if (ompt_enabled) { 2203 // missing support here: need to know whether acquired first or not 2204 } 2205 #endif 2206 2207 #else // KMP_USE_DYNAMIC_LOCK 2208 int acquire_status; 2209 kmp_user_lock_p lck; 2210 2211 if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll ) 2212 + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) { 2213 lck = (kmp_user_lock_p)user_lock; 2214 } 2215 #if KMP_USE_FUTEX 2216 else if ( ( __kmp_user_lock_kind == lk_futex ) 2217 && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked ) 2218 <= OMP_NEST_LOCK_T_SIZE ) ) { 2219 lck = (kmp_user_lock_p)user_lock; 2220 } 2221 #endif 2222 else { 2223 lck = __kmp_lookup_user_lock( user_lock, "omp_set_nest_lock" ); 2224 } 2225 2226 #if USE_ITT_BUILD 2227 __kmp_itt_lock_acquiring( lck ); 2228 #endif /* USE_ITT_BUILD */ 2229 2230 ACQUIRE_NESTED_LOCK( lck, gtid, &acquire_status ); 2231 2232 #if USE_ITT_BUILD 2233 __kmp_itt_lock_acquired( lck ); 2234 #endif /* USE_ITT_BUILD */ 2235 2236 #if OMPT_SUPPORT && OMPT_TRACE 2237 if (ompt_enabled) { 2238 if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) { 2239 if(ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_first)) 2240 
ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_first)((uint64_t) lck); 2241 } else { 2242 if(ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_next)) 2243 ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_next)((uint64_t) lck); 2244 } 2245 } 2246 #endif 2247 2248 #endif // KMP_USE_DYNAMIC_LOCK 2249 } 2250 2251 void 2252 __kmpc_unset_lock( ident_t *loc, kmp_int32 gtid, void **user_lock ) 2253 { 2254 #if KMP_USE_DYNAMIC_LOCK 2255 2256 int tag = KMP_EXTRACT_D_TAG(user_lock); 2257 # if USE_ITT_BUILD 2258 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2259 # endif 2260 # if KMP_USE_INLINED_TAS 2261 if (tag == locktag_tas && !__kmp_env_consistency_check) { 2262 KMP_RELEASE_TAS_LOCK(user_lock, gtid); 2263 } else 2264 # elif KMP_USE_INLINED_FUTEX 2265 if (tag == locktag_futex && !__kmp_env_consistency_check) { 2266 KMP_RELEASE_FUTEX_LOCK(user_lock, gtid); 2267 } else 2268 # endif 2269 { 2270 __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid); 2271 } 2272 2273 #else // KMP_USE_DYNAMIC_LOCK 2274 2275 kmp_user_lock_p lck; 2276 2277 /* Can't use serial interval since not block structured */ 2278 /* release the lock */ 2279 2280 if ( ( __kmp_user_lock_kind == lk_tas ) 2281 && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) { 2282 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 2283 // "fast" path implemented to fix customer performance issue 2284 #if USE_ITT_BUILD 2285 __kmp_itt_lock_releasing( (kmp_user_lock_p)user_lock ); 2286 #endif /* USE_ITT_BUILD */ 2287 TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0); 2288 KMP_MB(); 2289 return; 2290 #else 2291 lck = (kmp_user_lock_p)user_lock; 2292 #endif 2293 } 2294 #if KMP_USE_FUTEX 2295 else if ( ( __kmp_user_lock_kind == lk_futex ) 2296 && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) { 2297 lck = (kmp_user_lock_p)user_lock; 2298 } 2299 #endif 2300 else { 2301 lck = __kmp_lookup_user_lock( user_lock, "omp_unset_lock" ); 2302 } 2303 2304 #if USE_ITT_BUILD 2305 __kmp_itt_lock_releasing( lck ); 2306 #endif /* USE_ITT_BUILD */ 2307 2308 RELEASE_LOCK( lck, gtid ); 2309 2310 #if OMPT_SUPPORT && OMPT_BLAME 2311 if (ompt_enabled && 2312 ompt_callbacks.ompt_callback(ompt_event_release_lock)) { 2313 ompt_callbacks.ompt_callback(ompt_event_release_lock)((uint64_t) lck); 2314 } 2315 #endif 2316 2317 #endif // KMP_USE_DYNAMIC_LOCK 2318 } 2319 2320 /* release the lock */ 2321 void 2322 __kmpc_unset_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock ) 2323 { 2324 #if KMP_USE_DYNAMIC_LOCK 2325 2326 # if USE_ITT_BUILD 2327 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2328 # endif 2329 KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid); 2330 2331 #else // KMP_USE_DYNAMIC_LOCK 2332 2333 kmp_user_lock_p lck; 2334 2335 /* Can't use serial interval since not block structured */ 2336 2337 if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll ) 2338 + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) { 2339 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 2340 // "fast" path implemented to fix customer performance issue 2341 kmp_tas_lock_t *tl = (kmp_tas_lock_t*)user_lock; 2342 #if USE_ITT_BUILD 2343 __kmp_itt_lock_releasing( (kmp_user_lock_p)user_lock ); 2344 #endif /* USE_ITT_BUILD */ 2345 if ( --(tl->lk.depth_locked) == 0 ) { 2346 TCW_4(tl->lk.poll, 0); 2347 } 2348 KMP_MB(); 2349 return; 2350 #else 2351 lck = (kmp_user_lock_p)user_lock; 2352 #endif 2353 } 2354 #if 
KMP_USE_FUTEX 2355 else if ( ( __kmp_user_lock_kind == lk_futex ) 2356 && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked ) 2357 <= OMP_NEST_LOCK_T_SIZE ) ) { 2358 lck = (kmp_user_lock_p)user_lock; 2359 } 2360 #endif 2361 else { 2362 lck = __kmp_lookup_user_lock( user_lock, "omp_unset_nest_lock" ); 2363 } 2364 2365 #if USE_ITT_BUILD 2366 __kmp_itt_lock_releasing( lck ); 2367 #endif /* USE_ITT_BUILD */ 2368 2369 int release_status; 2370 release_status = RELEASE_NESTED_LOCK( lck, gtid ); 2371 #if OMPT_SUPPORT && OMPT_BLAME 2372 if (ompt_enabled) { 2373 if (release_status == KMP_LOCK_RELEASED) { 2374 if (ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_last)) { 2375 ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_last)( 2376 (uint64_t) lck); 2377 } 2378 } else if (ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_prev)) { 2379 ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_prev)( 2380 (uint64_t) lck); 2381 } 2382 } 2383 #endif 2384 2385 #endif // KMP_USE_DYNAMIC_LOCK 2386 } 2387 2388 /* try to acquire the lock */ 2389 int 2390 __kmpc_test_lock( ident_t *loc, kmp_int32 gtid, void **user_lock ) 2391 { 2392 KMP_COUNT_BLOCK(OMP_test_lock); 2393 2394 #if KMP_USE_DYNAMIC_LOCK 2395 int rc; 2396 int tag = KMP_EXTRACT_D_TAG(user_lock); 2397 # if USE_ITT_BUILD 2398 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); 2399 # endif 2400 # if KMP_USE_INLINED_TAS 2401 if (tag == locktag_tas && !__kmp_env_consistency_check) { 2402 KMP_TEST_TAS_LOCK(user_lock, gtid, rc); 2403 } else 2404 # elif KMP_USE_INLINED_FUTEX 2405 if (tag == locktag_futex && !__kmp_env_consistency_check) { 2406 KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc); 2407 } else 2408 # endif 2409 { 2410 rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid); 2411 } 2412 if (rc) { 2413 # if USE_ITT_BUILD 2414 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 2415 # endif 2416 return FTN_TRUE; 2417 } else { 2418 # if USE_ITT_BUILD 2419 __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock); 2420 # endif 2421 return FTN_FALSE; 2422 } 2423 2424 #else // KMP_USE_DYNAMIC_LOCK 2425 2426 kmp_user_lock_p lck; 2427 int rc; 2428 2429 if ( ( __kmp_user_lock_kind == lk_tas ) 2430 && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) { 2431 lck = (kmp_user_lock_p)user_lock; 2432 } 2433 #if KMP_USE_FUTEX 2434 else if ( ( __kmp_user_lock_kind == lk_futex ) 2435 && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) { 2436 lck = (kmp_user_lock_p)user_lock; 2437 } 2438 #endif 2439 else { 2440 lck = __kmp_lookup_user_lock( user_lock, "omp_test_lock" ); 2441 } 2442 2443 #if USE_ITT_BUILD 2444 __kmp_itt_lock_acquiring( lck ); 2445 #endif /* USE_ITT_BUILD */ 2446 2447 rc = TEST_LOCK( lck, gtid ); 2448 #if USE_ITT_BUILD 2449 if ( rc ) { 2450 __kmp_itt_lock_acquired( lck ); 2451 } else { 2452 __kmp_itt_lock_cancelled( lck ); 2453 } 2454 #endif /* USE_ITT_BUILD */ 2455 return ( rc ? 
FTN_TRUE : FTN_FALSE ); 2456 2457 /* Can't use serial interval since not block structured */ 2458 2459 #endif // KMP_USE_DYNAMIC_LOCK 2460 } 2461 2462 /* try to acquire the lock */ 2463 int 2464 __kmpc_test_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock ) 2465 { 2466 #if KMP_USE_DYNAMIC_LOCK 2467 int rc; 2468 # if USE_ITT_BUILD 2469 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); 2470 # endif 2471 rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid); 2472 # if USE_ITT_BUILD 2473 if (rc) { 2474 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 2475 } else { 2476 __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock); 2477 } 2478 # endif 2479 return rc; 2480 2481 #else // KMP_USE_DYNAMIC_LOCK 2482 2483 kmp_user_lock_p lck; 2484 int rc; 2485 2486 if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll ) 2487 + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) { 2488 lck = (kmp_user_lock_p)user_lock; 2489 } 2490 #if KMP_USE_FUTEX 2491 else if ( ( __kmp_user_lock_kind == lk_futex ) 2492 && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked ) 2493 <= OMP_NEST_LOCK_T_SIZE ) ) { 2494 lck = (kmp_user_lock_p)user_lock; 2495 } 2496 #endif 2497 else { 2498 lck = __kmp_lookup_user_lock( user_lock, "omp_test_nest_lock" ); 2499 } 2500 2501 #if USE_ITT_BUILD 2502 __kmp_itt_lock_acquiring( lck ); 2503 #endif /* USE_ITT_BUILD */ 2504 2505 rc = TEST_NESTED_LOCK( lck, gtid ); 2506 #if USE_ITT_BUILD 2507 if ( rc ) { 2508 __kmp_itt_lock_acquired( lck ); 2509 } else { 2510 __kmp_itt_lock_cancelled( lck ); 2511 } 2512 #endif /* USE_ITT_BUILD */ 2513 return rc; 2514 2515 /* Can't use serial interval since not block structured */ 2516 2517 #endif // KMP_USE_DYNAMIC_LOCK 2518 } 2519 2520 2521 /*--------------------------------------------------------------------------------------------------------------------*/ 2522 2523 /* 2524 * Interface to fast scalable reduce methods routines 2525 */ 2526 2527 // keep the selected method in a thread local structure for cross-function usage: will be used in __kmpc_end_reduce* functions; 2528 // another solution: to re-determine the method one more time in __kmpc_end_reduce* functions (new prototype required then) 2529 // AT: which solution is better? 2530 #define __KMP_SET_REDUCTION_METHOD(gtid,rmethod) \ 2531 ( ( __kmp_threads[ ( gtid ) ] -> th.th_local.packed_reduction_method ) = ( rmethod ) ) 2532 2533 #define __KMP_GET_REDUCTION_METHOD(gtid) \ 2534 ( __kmp_threads[ ( gtid ) ] -> th.th_local.packed_reduction_method ) 2535 2536 // description of the packed_reduction_method variable: look at the macros in kmp.h 2537 2538 2539 // used in a critical section reduce block 2540 static __forceinline void 2541 __kmp_enter_critical_section_reduce_block( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit ) { 2542 2543 // this lock was visible to a customer and to the threading profile tool as a serial overhead span 2544 // (although it's used for an internal purpose only) 2545 // why was it visible in previous implementation? 2546 // should we keep it visible in new reduce block? 2547 kmp_user_lock_p lck; 2548 2549 #if KMP_USE_DYNAMIC_LOCK 2550 2551 kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit; 2552 // Check if it is initialized. 
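    // A zero value means no lock has been bound to this critical name yet; the first thread
    // to arrive installs either a direct-lock tag via compare-and-store or a pointer to an
    // indirect lock object.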
2553 if (*lk == 0) { 2554 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) { 2555 KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0, KMP_GET_D_TAG(__kmp_user_lock_seq)); 2556 } else { 2557 __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(__kmp_user_lock_seq)); 2558 } 2559 } 2560 // Branch for accessing the actual lock object and set operation. This branching is inevitable since 2561 // this lock initialization does not follow the normal dispatch path (lock table is not used). 2562 if (KMP_EXTRACT_D_TAG(lk) != 0) { 2563 lck = (kmp_user_lock_p)lk; 2564 KMP_DEBUG_ASSERT(lck != NULL); 2565 if (__kmp_env_consistency_check) { 2566 __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq); 2567 } 2568 KMP_D_LOCK_FUNC(lk, set)(lk, global_tid); 2569 } else { 2570 kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk); 2571 lck = ilk->lock; 2572 KMP_DEBUG_ASSERT(lck != NULL); 2573 if (__kmp_env_consistency_check) { 2574 __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq); 2575 } 2576 KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid); 2577 } 2578 2579 #else // KMP_USE_DYNAMIC_LOCK 2580 2581 // We know that the fast reduction code is only emitted by Intel compilers 2582 // with 32 byte critical sections. If there isn't enough space, then we 2583 // have to use a pointer. 2584 if ( __kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE ) { 2585 lck = (kmp_user_lock_p)crit; 2586 } 2587 else { 2588 lck = __kmp_get_critical_section_ptr( crit, loc, global_tid ); 2589 } 2590 KMP_DEBUG_ASSERT( lck != NULL ); 2591 2592 if ( __kmp_env_consistency_check ) 2593 __kmp_push_sync( global_tid, ct_critical, loc, lck ); 2594 2595 __kmp_acquire_user_lock_with_checks( lck, global_tid ); 2596 2597 #endif // KMP_USE_DYNAMIC_LOCK 2598 } 2599 2600 // used in a critical section reduce block 2601 static __forceinline void 2602 __kmp_end_critical_section_reduce_block( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit ) { 2603 2604 kmp_user_lock_p lck; 2605 2606 #if KMP_USE_DYNAMIC_LOCK 2607 2608 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) { 2609 lck = (kmp_user_lock_p)crit; 2610 if (__kmp_env_consistency_check) 2611 __kmp_pop_sync(global_tid, ct_critical, loc); 2612 KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid); 2613 } else { 2614 kmp_indirect_lock_t *ilk = (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit)); 2615 if (__kmp_env_consistency_check) 2616 __kmp_pop_sync(global_tid, ct_critical, loc); 2617 KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid); 2618 } 2619 2620 #else // KMP_USE_DYNAMIC_LOCK 2621 2622 // We know that the fast reduction code is only emitted by Intel compilers with 32 byte critical 2623 // sections. If there isn't enough space, then we have to use a pointer. 2624 if ( __kmp_base_user_lock_size > 32 ) { 2625 lck = *( (kmp_user_lock_p *) crit ); 2626 KMP_ASSERT( lck != NULL ); 2627 } else { 2628 lck = (kmp_user_lock_p) crit; 2629 } 2630 2631 if ( __kmp_env_consistency_check ) 2632 __kmp_pop_sync( global_tid, ct_critical, loc ); 2633 2634 __kmp_release_user_lock_with_checks( lck, global_tid ); 2635 2636 #endif // KMP_USE_DYNAMIC_LOCK 2637 } // __kmp_end_critical_section_reduce_block 2638 2639 2640 /* 2.a.i. Reduce Block without a terminating barrier */ 2641 /*! 
2642 @ingroup SYNCHRONIZATION 2643 @param loc source location information 2644 @param global_tid global thread number 2645 @param num_vars number of items (variables) to be reduced 2646 @param reduce_size size of data in bytes to be reduced 2647 @param reduce_data pointer to data to be reduced 2648 @param reduce_func callback function providing reduction operation on two operands and returning result of reduction in lhs_data 2649 @param lck pointer to the unique lock data structure 2650 @result 1 for the master thread, 0 for all other team threads, 2 for all team threads if atomic reduction needed 2651 2652 The nowait version is used for a reduce clause with the nowait argument. 2653 */ 2654 kmp_int32 2655 __kmpc_reduce_nowait( 2656 ident_t *loc, kmp_int32 global_tid, 2657 kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data), 2658 kmp_critical_name *lck ) { 2659 2660 KMP_COUNT_BLOCK(REDUCE_nowait); 2661 int retval = 0; 2662 PACKED_REDUCTION_METHOD_T packed_reduction_method; 2663 #if OMP_40_ENABLED 2664 kmp_team_t *team; 2665 kmp_info_t *th; 2666 int teams_swapped = 0, task_state; 2667 #endif 2668 KA_TRACE( 10, ( "__kmpc_reduce_nowait() enter: called T#%d\n", global_tid ) ); 2669 2670 // why do we need this initialization here at all? 2671 // Reduction clause can not be used as a stand-alone directive. 2672 2673 // do not call __kmp_serial_initialize(), it will be called by __kmp_parallel_initialize() if needed 2674 // possible detection of false-positive race by the threadchecker ??? 2675 if( ! TCR_4( __kmp_init_parallel ) ) 2676 __kmp_parallel_initialize(); 2677 2678 // check correctness of reduce block nesting 2679 #if KMP_USE_DYNAMIC_LOCK 2680 if ( __kmp_env_consistency_check ) 2681 __kmp_push_sync( global_tid, ct_reduce, loc, NULL, 0 ); 2682 #else 2683 if ( __kmp_env_consistency_check ) 2684 __kmp_push_sync( global_tid, ct_reduce, loc, NULL ); 2685 #endif 2686 2687 #if OMP_40_ENABLED 2688 th = __kmp_thread_from_gtid(global_tid); 2689 if( th->th.th_teams_microtask ) { // AC: check if we are inside the teams construct? 2690 team = th->th.th_team; 2691 if( team->t.t_level == th->th.th_teams_level ) { 2692 // this is reduction at teams construct 2693 KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0 2694 // Let's swap teams temporarily for the reduction barrier 2695 teams_swapped = 1; 2696 th->th.th_info.ds.ds_tid = team->t.t_master_tid; 2697 th->th.th_team = team->t.t_parent; 2698 th->th.th_team_nproc = th->th.th_team->t.t_nproc; 2699 th->th.th_task_team = th->th.th_team->t.t_task_team[0]; 2700 task_state = th->th.th_task_state; 2701 th->th.th_task_state = 0; 2702 } 2703 } 2704 #endif // OMP_40_ENABLED 2705 2706 // packed_reduction_method value will be reused by __kmp_end_reduce* function, the value should be kept in a variable 2707 // the variable should be either a construct-specific or thread-specific property, not a team specific property 2708 // (a thread can reach the next reduce block on the next construct, reduce method may differ on the next construct) 2709 // an ident_t "loc" parameter could be used as a construct-specific property (what if loc == 0?) 
2710 // (if both construct-specific and team-specific variables were shared, then unness extra syncs should be needed) 2711 // a thread-specific variable is better regarding two issues above (next construct and extra syncs) 2712 // a thread-specific "th_local.reduction_method" variable is used currently 2713 // each thread executes 'determine' and 'set' lines (no need to execute by one thread, to avoid unness extra syncs) 2714 2715 packed_reduction_method = __kmp_determine_reduction_method( loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck ); 2716 __KMP_SET_REDUCTION_METHOD( global_tid, packed_reduction_method ); 2717 2718 if( packed_reduction_method == critical_reduce_block ) { 2719 2720 __kmp_enter_critical_section_reduce_block( loc, global_tid, lck ); 2721 retval = 1; 2722 2723 } else if( packed_reduction_method == empty_reduce_block ) { 2724 2725 // usage: if team size == 1, no synchronization is required ( Intel platforms only ) 2726 retval = 1; 2727 2728 } else if( packed_reduction_method == atomic_reduce_block ) { 2729 2730 retval = 2; 2731 2732 // all threads should do this pop here (because __kmpc_end_reduce_nowait() won't be called by the code gen) 2733 // (it's not quite good, because the checking block has been closed by this 'pop', 2734 // but atomic operation has not been executed yet, will be executed slightly later, literally on next instruction) 2735 if ( __kmp_env_consistency_check ) 2736 __kmp_pop_sync( global_tid, ct_reduce, loc ); 2737 2738 } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) { 2739 2740 //AT: performance issue: a real barrier here 2741 //AT: (if master goes slow, other threads are blocked here waiting for the master to come and release them) 2742 //AT: (it's not what a customer might expect specifying NOWAIT clause) 2743 //AT: (specifying NOWAIT won't result in improvement of performance, it'll be confusing to a customer) 2744 //AT: another implementation of *barrier_gather*nowait() (or some other design) might go faster 2745 // and be more in line with sense of NOWAIT 2746 //AT: TO DO: do epcc test and compare times 2747 2748 // this barrier should be invisible to a customer and to the threading profile tool 2749 // (it's neither a terminating barrier nor customer's code, it's used for an internal purpose) 2750 #if USE_ITT_NOTIFY 2751 __kmp_threads[global_tid]->th.th_ident = loc; 2752 #endif 2753 retval = __kmp_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid, FALSE, reduce_size, reduce_data, reduce_func ); 2754 retval = ( retval != 0 ) ? ( 0 ) : ( 1 ); 2755 2756 // all other workers except master should do this pop here 2757 // ( none of other workers will get to __kmpc_end_reduce_nowait() ) 2758 if ( __kmp_env_consistency_check ) { 2759 if( retval == 0 ) { 2760 __kmp_pop_sync( global_tid, ct_reduce, loc ); 2761 } 2762 } 2763 2764 } else { 2765 2766 // should never reach this block 2767 KMP_ASSERT( 0 ); // "unexpected method" 2768 2769 } 2770 #if OMP_40_ENABLED 2771 if( teams_swapped ) { 2772 // Restore thread structure 2773 th->th.th_info.ds.ds_tid = 0; 2774 th->th.th_team = team; 2775 th->th.th_team_nproc = team->t.t_nproc; 2776 th->th.th_task_team = team->t.t_task_team[task_state]; 2777 th->th.th_task_state = task_state; 2778 } 2779 #endif 2780 KA_TRACE( 10, ( "__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n", global_tid, packed_reduction_method, retval ) ); 2781 2782 return retval; 2783 } 2784 2785 /*! 
2786 @ingroup SYNCHRONIZATION 2787 @param loc source location information 2788 @param global_tid global thread id. 2789 @param lck pointer to the unique lock data structure 2790 2791 Finish the execution of a reduce nowait. 2792 */ 2793 void 2794 __kmpc_end_reduce_nowait( ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck ) { 2795 2796 PACKED_REDUCTION_METHOD_T packed_reduction_method; 2797 2798 KA_TRACE( 10, ( "__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid ) ); 2799 2800 packed_reduction_method = __KMP_GET_REDUCTION_METHOD( global_tid ); 2801 2802 if( packed_reduction_method == critical_reduce_block ) { 2803 2804 __kmp_end_critical_section_reduce_block( loc, global_tid, lck ); 2805 2806 } else if( packed_reduction_method == empty_reduce_block ) { 2807 2808 // usage: if team size == 1, no synchronization is required ( on Intel platforms only ) 2809 2810 } else if( packed_reduction_method == atomic_reduce_block ) { 2811 2812 // neither master nor other workers should get here 2813 // (code gen does not generate this call in case 2: atomic reduce block) 2814 // actually it's better to remove this elseif at all; 2815 // after removal this value will checked by the 'else' and will assert 2816 2817 } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) { 2818 2819 // only master gets here 2820 2821 } else { 2822 2823 // should never reach this block 2824 KMP_ASSERT( 0 ); // "unexpected method" 2825 2826 } 2827 2828 if ( __kmp_env_consistency_check ) 2829 __kmp_pop_sync( global_tid, ct_reduce, loc ); 2830 2831 KA_TRACE( 10, ( "__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n", global_tid, packed_reduction_method ) ); 2832 2833 return; 2834 } 2835 2836 /* 2.a.ii. Reduce Block with a terminating barrier */ 2837 2838 /*! 2839 @ingroup SYNCHRONIZATION 2840 @param loc source location information 2841 @param global_tid global thread number 2842 @param num_vars number of items (variables) to be reduced 2843 @param reduce_size size of data in bytes to be reduced 2844 @param reduce_data pointer to data to be reduced 2845 @param reduce_func callback function providing reduction operation on two operands and returning result of reduction in lhs_data 2846 @param lck pointer to the unique lock data structure 2847 @result 1 for the master thread, 0 for all other team threads, 2 for all team threads if atomic reduction needed 2848 2849 A blocking reduce that includes an implicit barrier. 2850 */ 2851 kmp_int32 2852 __kmpc_reduce( 2853 ident_t *loc, kmp_int32 global_tid, 2854 kmp_int32 num_vars, size_t reduce_size, void *reduce_data, 2855 void (*reduce_func)(void *lhs_data, void *rhs_data), 2856 kmp_critical_name *lck ) 2857 { 2858 KMP_COUNT_BLOCK(REDUCE_wait); 2859 int retval = 0; 2860 PACKED_REDUCTION_METHOD_T packed_reduction_method; 2861 2862 KA_TRACE( 10, ( "__kmpc_reduce() enter: called T#%d\n", global_tid ) ); 2863 2864 // why do we need this initialization here at all? 2865 // Reduction clause can not be a stand-alone directive. 2866 2867 // do not call __kmp_serial_initialize(), it will be called by __kmp_parallel_initialize() if needed 2868 // possible detection of false-positive race by the threadchecker ??? 2869 if( ! 
TCR_4( __kmp_init_parallel ) ) 2870 __kmp_parallel_initialize(); 2871 2872 // check correctness of reduce block nesting 2873 #if KMP_USE_DYNAMIC_LOCK 2874 if ( __kmp_env_consistency_check ) 2875 __kmp_push_sync( global_tid, ct_reduce, loc, NULL, 0 ); 2876 #else 2877 if ( __kmp_env_consistency_check ) 2878 __kmp_push_sync( global_tid, ct_reduce, loc, NULL ); 2879 #endif 2880 2881 packed_reduction_method = __kmp_determine_reduction_method( loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck ); 2882 __KMP_SET_REDUCTION_METHOD( global_tid, packed_reduction_method ); 2883 2884 if( packed_reduction_method == critical_reduce_block ) { 2885 2886 __kmp_enter_critical_section_reduce_block( loc, global_tid, lck ); 2887 retval = 1; 2888 2889 } else if( packed_reduction_method == empty_reduce_block ) { 2890 2891 // usage: if team size == 1, no synchronization is required ( Intel platforms only ) 2892 retval = 1; 2893 2894 } else if( packed_reduction_method == atomic_reduce_block ) { 2895 2896 retval = 2; 2897 2898 } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) { 2899 2900 //case tree_reduce_block: 2901 // this barrier should be visible to a customer and to the threading profile tool 2902 // (it's a terminating barrier on constructs if NOWAIT not specified) 2903 #if USE_ITT_NOTIFY 2904 __kmp_threads[global_tid]->th.th_ident = loc; // needed for correct notification of frames 2905 #endif 2906 retval = __kmp_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid, TRUE, reduce_size, reduce_data, reduce_func ); 2907 retval = ( retval != 0 ) ? ( 0 ) : ( 1 ); 2908 2909 // all other workers except master should do this pop here 2910 // ( none of other workers except master will enter __kmpc_end_reduce() ) 2911 if ( __kmp_env_consistency_check ) { 2912 if( retval == 0 ) { // 0: all other workers; 1: master 2913 __kmp_pop_sync( global_tid, ct_reduce, loc ); 2914 } 2915 } 2916 2917 } else { 2918 2919 // should never reach this block 2920 KMP_ASSERT( 0 ); // "unexpected method" 2921 2922 } 2923 2924 KA_TRACE( 10, ( "__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n", global_tid, packed_reduction_method, retval ) ); 2925 2926 return retval; 2927 } 2928 2929 /*! 2930 @ingroup SYNCHRONIZATION 2931 @param loc source location information 2932 @param global_tid global thread id. 2933 @param lck pointer to the unique lock data structure 2934 2935 Finish the execution of a blocking reduce. 2936 The <tt>lck</tt> pointer must be the same as that used in the corresponding start function. 
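As an informal illustration only (a sketch, not the exact code any particular compiler emits), a blocking '+' reduction on a variable <tt>sum</tt> might be lowered into a sequence of the following shape, where <tt>my_data</tt>, <tt>my_reduce_func</tt> and <tt>crit</tt> are hypothetical names for the per-thread partial result, the combiner callback and the critical name:
@code
int ret = __kmpc_reduce(loc, gtid, 1, sizeof(my_data), &my_data, my_reduce_func, &crit);
if (ret == 1) {          // master thread: combine the partial results directly
    sum += my_data.sum;
    __kmpc_end_reduce(loc, gtid, &crit);
} else if (ret == 2) {   // atomic path: every thread adds its own contribution
    /* atomic update of sum with my_data.sum */
    __kmpc_end_reduce(loc, gtid, &crit);
}
// ret == 0: remaining workers after a tree reduction; they do not enter __kmpc_end_reduce().
@endcode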
2937 */ 2938 void 2939 __kmpc_end_reduce( ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck ) { 2940 2941 PACKED_REDUCTION_METHOD_T packed_reduction_method; 2942 2943 KA_TRACE( 10, ( "__kmpc_end_reduce() enter: called T#%d\n", global_tid ) ); 2944 2945 packed_reduction_method = __KMP_GET_REDUCTION_METHOD( global_tid ); 2946 2947 // this barrier should be visible to a customer and to the threading profile tool 2948 // (it's a terminating barrier on constructs if NOWAIT not specified) 2949 2950 if( packed_reduction_method == critical_reduce_block ) { 2951 2952 __kmp_end_critical_section_reduce_block( loc, global_tid, lck ); 2953 2954 // TODO: implicit barrier: should be exposed 2955 #if USE_ITT_NOTIFY 2956 __kmp_threads[global_tid]->th.th_ident = loc; 2957 #endif 2958 __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL ); 2959 2960 } else if( packed_reduction_method == empty_reduce_block ) { 2961 2962 // usage: if team size == 1, no synchronization is required ( Intel platforms only ) 2963 2964 // TODO: implicit barrier: should be exposed 2965 #if USE_ITT_NOTIFY 2966 __kmp_threads[global_tid]->th.th_ident = loc; 2967 #endif 2968 __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL ); 2969 2970 } else if( packed_reduction_method == atomic_reduce_block ) { 2971 2972 // TODO: implicit barrier: should be exposed 2973 #if USE_ITT_NOTIFY 2974 __kmp_threads[global_tid]->th.th_ident = loc; 2975 #endif 2976 __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL ); 2977 2978 } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) { 2979 2980 // only master executes here (master releases all other workers) 2981 __kmp_end_split_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid ); 2982 2983 } else { 2984 2985 // should never reach this block 2986 KMP_ASSERT( 0 ); // "unexpected method" 2987 2988 } 2989 2990 if ( __kmp_env_consistency_check ) 2991 __kmp_pop_sync( global_tid, ct_reduce, loc ); 2992 2993 KA_TRACE( 10, ( "__kmpc_end_reduce() exit: called T#%d: method %08x\n", global_tid, packed_reduction_method ) ); 2994 2995 return; 2996 } 2997 2998 #undef __KMP_GET_REDUCTION_METHOD 2999 #undef __KMP_SET_REDUCTION_METHOD 3000 3001 /*-- end of interface to fast scalable reduce routines ---------------------------------------------------------------*/ 3002 3003 kmp_uint64 3004 __kmpc_get_taskid() { 3005 3006 kmp_int32 gtid; 3007 kmp_info_t * thread; 3008 3009 gtid = __kmp_get_gtid(); 3010 if ( gtid < 0 ) { 3011 return 0; 3012 }; // if 3013 thread = __kmp_thread_from_gtid( gtid ); 3014 return thread->th.th_current_task->td_task_id; 3015 3016 } // __kmpc_get_taskid 3017 3018 3019 kmp_uint64 3020 __kmpc_get_parent_taskid() { 3021 3022 kmp_int32 gtid; 3023 kmp_info_t * thread; 3024 kmp_taskdata_t * parent_task; 3025 3026 gtid = __kmp_get_gtid(); 3027 if ( gtid < 0 ) { 3028 return 0; 3029 }; // if 3030 thread = __kmp_thread_from_gtid( gtid ); 3031 parent_task = thread->th.th_current_task->td_parent; 3032 return ( parent_task == NULL ? 0 : parent_task->td_task_id ); 3033 3034 } // __kmpc_get_parent_taskid 3035 3036 void __kmpc_place_threads(int nS, int sO, int nC, int cO, int nT) 3037 { 3038 if ( ! __kmp_init_serial ) { 3039 __kmp_serial_initialize(); 3040 } 3041 __kmp_place_num_sockets = nS; 3042 __kmp_place_socket_offset = sO; 3043 __kmp_place_num_cores = nC; 3044 __kmp_place_core_offset = cO; 3045 __kmp_place_num_threads_per_core = nT; 3046 } 3047 3048 #if OMP_45_ENABLED 3049 /*! 
3050 @ingroup WORK_SHARING 3051 @param loc source location information. 3052 @param gtid global thread number. 3053 @param num_dims number of associated doacross loops. 3054 @param dims info on loops bounds. 3055 3056 Initialize doacross loop information. 3057 Expect compiler send us inclusive bounds, 3058 e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2. 3059 */ 3060 void 3061 __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims, struct kmp_dim * dims) 3062 { 3063 int j, idx; 3064 kmp_int64 last, trace_count; 3065 kmp_info_t *th = __kmp_threads[gtid]; 3066 kmp_team_t *team = th->th.th_team; 3067 kmp_uint32 *flags; 3068 kmp_disp_t *pr_buf = th->th.th_dispatch; 3069 dispatch_shared_info_t *sh_buf; 3070 3071 KA_TRACE(20,("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n", 3072 gtid, num_dims, !team->t.t_serialized)); 3073 KMP_DEBUG_ASSERT(dims != NULL); 3074 KMP_DEBUG_ASSERT(num_dims > 0); 3075 3076 if( team->t.t_serialized ) { 3077 KA_TRACE(20,("__kmpc_doacross_init() exit: serialized team\n")); 3078 return; // no dependencies if team is serialized 3079 } 3080 KMP_DEBUG_ASSERT(team->t.t_nproc > 1); 3081 idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for the next loop 3082 sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers]; 3083 3084 // Save bounds info into allocated private buffer 3085 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL); 3086 pr_buf->th_doacross_info = 3087 (kmp_int64*)__kmp_thread_malloc(th, sizeof(kmp_int64)*(4 * num_dims + 1)); 3088 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL); 3089 pr_buf->th_doacross_info[0] = (kmp_int64)num_dims; // first element is number of dimensions 3090 // Save also address of num_done in order to access it later without knowing the buffer index 3091 pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done; 3092 pr_buf->th_doacross_info[2] = dims[0].lo; 3093 pr_buf->th_doacross_info[3] = dims[0].up; 3094 pr_buf->th_doacross_info[4] = dims[0].st; 3095 last = 5; 3096 for( j = 1; j < num_dims; ++j ) { 3097 kmp_int64 range_length; // To keep ranges of all dimensions but the first dims[0] 3098 if( dims[j].st == 1 ) { // most common case 3099 // AC: should we care of ranges bigger than LLONG_MAX? (not for now) 3100 range_length = dims[j].up - dims[j].lo + 1; 3101 } else { 3102 if( dims[j].st > 0 ) { 3103 KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo); 3104 range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1; 3105 } else { // negative increment 3106 KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up); 3107 range_length = (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1; 3108 } 3109 } 3110 pr_buf->th_doacross_info[last++] = range_length; 3111 pr_buf->th_doacross_info[last++] = dims[j].lo; 3112 pr_buf->th_doacross_info[last++] = dims[j].up; 3113 pr_buf->th_doacross_info[last++] = dims[j].st; 3114 } 3115 3116 // Compute total trip count. 3117 // Start with range of dims[0] which we don't need to keep in the buffer. 
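    // For the example in the function comment, for(i=2;i<9;i+=2): lo=2, up=8, st=2,
    // so the trip count of dims[0] is (8-2)/2 + 1 = 4 iterations.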
3118 if( dims[0].st == 1 ) { // most common case 3119 trace_count = dims[0].up - dims[0].lo + 1; 3120 } else if( dims[0].st > 0 ) { 3121 KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo); 3122 trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1; 3123 } else { // negative increment 3124 KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up); 3125 trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1; 3126 } 3127 for( j = 1; j < num_dims; ++j ) { 3128 trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges 3129 } 3130 KMP_DEBUG_ASSERT(trace_count > 0); 3131 3132 // Check if shared buffer is not occupied by other loop (idx - __kmp_dispatch_num_buffers) 3133 if( idx != sh_buf->doacross_buf_idx ) { 3134 // Shared buffer is occupied, wait for it to be free 3135 __kmp_wait_yield_4( (kmp_uint32*)&sh_buf->doacross_buf_idx, idx, __kmp_eq_4, NULL ); 3136 } 3137 // Check if we are the first thread. After the CAS the first thread gets 0, 3138 // others get 1 if initialization is in progress, allocated pointer otherwise. 3139 flags = (kmp_uint32*)KMP_COMPARE_AND_STORE_RET64( 3140 (kmp_int64*)&sh_buf->doacross_flags,NULL,(kmp_int64)1); 3141 if( flags == NULL ) { 3142 // we are the first thread, allocate the array of flags 3143 kmp_int64 size = trace_count / 8 + 8; // in bytes, use single bit per iteration 3144 sh_buf->doacross_flags = (kmp_uint32*)__kmp_thread_calloc(th, size, 1); 3145 } else if( (kmp_int64)flags == 1 ) { 3146 // initialization is still in progress, need to wait 3147 while( (volatile kmp_int64)sh_buf->doacross_flags == 1 ) { 3148 KMP_YIELD(TRUE); 3149 } 3150 } 3151 KMP_DEBUG_ASSERT((kmp_int64)sh_buf->doacross_flags > 1); // check value of pointer 3152 pr_buf->th_doacross_flags = sh_buf->doacross_flags; // save private copy in order to not 3153 // touch shared buffer on each iteration 3154 KA_TRACE(20,("__kmpc_doacross_init() exit: T#%d\n", gtid)); 3155 } 3156 3157 void 3158 __kmpc_doacross_wait(ident_t *loc, int gtid, long long *vec) 3159 { 3160 kmp_int32 shft, num_dims, i; 3161 kmp_uint32 flag; 3162 kmp_int64 iter_number; // iteration number of "collapsed" loop nest 3163 kmp_info_t *th = __kmp_threads[gtid]; 3164 kmp_team_t *team = th->th.th_team; 3165 kmp_disp_t *pr_buf; 3166 kmp_int64 lo, up, st; 3167 3168 KA_TRACE(20,("__kmpc_doacross_wait() enter: called T#%d\n", gtid)); 3169 if( team->t.t_serialized ) { 3170 KA_TRACE(20,("__kmpc_doacross_wait() exit: serialized team\n")); 3171 return; // no dependencies if team is serialized 3172 } 3173 3174 // calculate sequential iteration number and check out-of-bounds condition 3175 pr_buf = th->th.th_dispatch; 3176 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL); 3177 num_dims = pr_buf->th_doacross_info[0]; 3178 lo = pr_buf->th_doacross_info[2]; 3179 up = pr_buf->th_doacross_info[3]; 3180 st = pr_buf->th_doacross_info[4]; 3181 if( st == 1 ) { // most common case 3182 if( vec[0] < lo || vec[0] > up ) { 3183 KA_TRACE(20,( 3184 "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n", 3185 gtid, vec[0], lo, up)); 3186 return; 3187 } 3188 iter_number = vec[0] - lo; 3189 } else if( st > 0 ) { 3190 if( vec[0] < lo || vec[0] > up ) { 3191 KA_TRACE(20,( 3192 "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n", 3193 gtid, vec[0], lo, up)); 3194 return; 3195 } 3196 iter_number = (kmp_uint64)(vec[0] - lo) / st; 3197 } else { // negative increment 3198 if( vec[0] > lo || vec[0] < up ) { 3199 KA_TRACE(20,( 3200 "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds 
[%lld,%lld]\n", 3201 gtid, vec[0], lo, up)); 3202 return; 3203 } 3204 iter_number = (kmp_uint64)(lo - vec[0]) / (-st); 3205 } 3206 for( i = 1; i < num_dims; ++i ) { 3207 kmp_int64 iter, ln; 3208 kmp_int32 j = i * 4; 3209 ln = pr_buf->th_doacross_info[j + 1]; 3210 lo = pr_buf->th_doacross_info[j + 2]; 3211 up = pr_buf->th_doacross_info[j + 3]; 3212 st = pr_buf->th_doacross_info[j + 4]; 3213 if( st == 1 ) { 3214 if( vec[i] < lo || vec[i] > up ) { 3215 KA_TRACE(20,( 3216 "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n", 3217 gtid, vec[i], lo, up)); 3218 return; 3219 } 3220 iter = vec[i] - lo; 3221 } else if( st > 0 ) { 3222 if( vec[i] < lo || vec[i] > up ) { 3223 KA_TRACE(20,( 3224 "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n", 3225 gtid, vec[i], lo, up)); 3226 return; 3227 } 3228 iter = (kmp_uint64)(vec[i] - lo) / st; 3229 } else { // st < 0 3230 if( vec[i] > lo || vec[i] < up ) { 3231 KA_TRACE(20,( 3232 "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n", 3233 gtid, vec[i], lo, up)); 3234 return; 3235 } 3236 iter = (kmp_uint64)(lo - vec[i]) / (-st); 3237 } 3238 iter_number = iter + ln * iter_number; 3239 } 3240 shft = iter_number % 32; // use 32-bit granularity 3241 iter_number >>= 5; // divided by 32 3242 flag = 1 << shft; 3243 while( (flag & pr_buf->th_doacross_flags[iter_number]) == 0 ) { 3244 KMP_YIELD(TRUE); 3245 } 3246 KA_TRACE(20,("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n", 3247 gtid, (iter_number<<5)+shft)); 3248 } 3249 3250 void 3251 __kmpc_doacross_post(ident_t *loc, int gtid, long long *vec) 3252 { 3253 kmp_int32 shft, num_dims, i; 3254 kmp_uint32 flag; 3255 kmp_int64 iter_number; // iteration number of "collapsed" loop nest 3256 kmp_info_t *th = __kmp_threads[gtid]; 3257 kmp_team_t *team = th->th.th_team; 3258 kmp_disp_t *pr_buf; 3259 kmp_int64 lo, st; 3260 3261 KA_TRACE(20,("__kmpc_doacross_post() enter: called T#%d\n", gtid)); 3262 if( team->t.t_serialized ) { 3263 KA_TRACE(20,("__kmpc_doacross_post() exit: serialized team\n")); 3264 return; // no dependencies if team is serialized 3265 } 3266 3267 // calculate sequential iteration number (same as in "wait" but no out-of-bounds checks) 3268 pr_buf = th->th.th_dispatch; 3269 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL); 3270 num_dims = pr_buf->th_doacross_info[0]; 3271 lo = pr_buf->th_doacross_info[2]; 3272 st = pr_buf->th_doacross_info[4]; 3273 if( st == 1 ) { // most common case 3274 iter_number = vec[0] - lo; 3275 } else if( st > 0 ) { 3276 iter_number = (kmp_uint64)(vec[0] - lo) / st; 3277 } else { // negative increment 3278 iter_number = (kmp_uint64)(lo - vec[0]) / (-st); 3279 } 3280 for( i = 1; i < num_dims; ++i ) { 3281 kmp_int64 iter, ln; 3282 kmp_int32 j = i * 4; 3283 ln = pr_buf->th_doacross_info[j + 1]; 3284 lo = pr_buf->th_doacross_info[j + 2]; 3285 st = pr_buf->th_doacross_info[j + 4]; 3286 if( st == 1 ) { 3287 iter = vec[i] - lo; 3288 } else if( st > 0 ) { 3289 iter = (kmp_uint64)(vec[i] - lo) / st; 3290 } else { // st < 0 3291 iter = (kmp_uint64)(lo - vec[i]) / (-st); 3292 } 3293 iter_number = iter + ln * iter_number; 3294 } 3295 shft = iter_number % 32; // use 32-bit granularity 3296 iter_number >>= 5; // divided by 32 3297 flag = 1 << shft; 3298 if( (flag & pr_buf->th_doacross_flags[iter_number]) == 0 ) 3299 KMP_TEST_THEN_OR32( (kmp_int32*)&pr_buf->th_doacross_flags[iter_number], (kmp_int32)flag ); 3300 KA_TRACE(20,("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", 3301 gtid, 
(iter_number<<5)+shft)); 3302 } 3303 3304 void 3305 __kmpc_doacross_fini(ident_t *loc, int gtid) 3306 { 3307 kmp_int64 num_done; 3308 kmp_info_t *th = __kmp_threads[gtid]; 3309 kmp_team_t *team = th->th.th_team; 3310 kmp_disp_t *pr_buf = th->th.th_dispatch; 3311 3312 KA_TRACE(20,("__kmpc_doacross_fini() enter: called T#%d\n", gtid)); 3313 if( team->t.t_serialized ) { 3314 KA_TRACE(20,("__kmpc_doacross_fini() exit: serialized team %p\n", team)); 3315 return; // nothing to do 3316 } 3317 num_done = KMP_TEST_THEN_INC64((kmp_int64*)pr_buf->th_doacross_info[1]) + 1; 3318 if( num_done == th->th.th_team_nproc ) { 3319 // we are the last thread, need to free shared resources 3320 int idx = pr_buf->th_doacross_buf_idx - 1; 3321 dispatch_shared_info_t *sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers]; 3322 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] == (kmp_int64)&sh_buf->doacross_num_done); 3323 KMP_DEBUG_ASSERT(num_done == (kmp_int64)sh_buf->doacross_num_done); 3324 KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx); 3325 __kmp_thread_free(th, (void*)sh_buf->doacross_flags); 3326 sh_buf->doacross_flags = NULL; 3327 sh_buf->doacross_num_done = 0; 3328 sh_buf->doacross_buf_idx += __kmp_dispatch_num_buffers; // free buffer for future re-use 3329 } 3330 // free private resources (need to keep buffer index forever) 3331 __kmp_thread_free(th, (void*)pr_buf->th_doacross_info); 3332 pr_buf->th_doacross_info = NULL; 3333 KA_TRACE(20,("__kmpc_doacross_fini() exit: T#%d\n", gtid)); 3334 } 3335 #endif 3336 3337 // end of file // 3338 3339