1 /* 2 * kmp_csupport.cpp -- kfront linkage support for OpenMP. 3 */ 4 5 6 //===----------------------------------------------------------------------===// 7 // 8 // The LLVM Compiler Infrastructure 9 // 10 // This file is dual licensed under the MIT and the University of Illinois Open 11 // Source Licenses. See LICENSE.txt for details. 12 // 13 //===----------------------------------------------------------------------===// 14 15 16 #include "omp.h" /* extern "C" declarations of user-visible routines */ 17 #include "kmp.h" 18 #include "kmp_i18n.h" 19 #include "kmp_itt.h" 20 #include "kmp_lock.h" 21 #include "kmp_error.h" 22 #include "kmp_stats.h" 23 24 #if OMPT_SUPPORT 25 #include "ompt-internal.h" 26 #include "ompt-specific.h" 27 #endif 28 29 #define MAX_MESSAGE 512 30 31 /* ------------------------------------------------------------------------ */ 32 /* ------------------------------------------------------------------------ */ 33 34 /* flags will be used in future, e.g., to implement */ 35 /* openmp_strict library restrictions */ 36 37 /*! 38 * @ingroup STARTUP_SHUTDOWN 39 * @param loc in source location information 40 * @param flags in for future use (currently ignored) 41 * 42 * Initialize the runtime library. This call is optional; if it is not made then 43 * it will be implicitly called by attempts to use other library functions. 44 * 45 */ 46 void 47 __kmpc_begin(ident_t *loc, kmp_int32 flags) 48 { 49 // By default __kmpc_begin() is no-op. 50 char *env; 51 if ((env = getenv( "KMP_INITIAL_THREAD_BIND" )) != NULL && 52 __kmp_str_match_true( env )) { 53 __kmp_middle_initialize(); 54 KC_TRACE(10, ("__kmpc_begin: middle initialization called\n" )); 55 } else if (__kmp_ignore_mppbeg() == FALSE) { 56 // By default __kmp_ignore_mppbeg() returns TRUE. 57 __kmp_internal_begin(); 58 KC_TRACE( 10, ("__kmpc_begin: called\n" ) ); 59 } 60 } 61 62 /*! 63 * @ingroup STARTUP_SHUTDOWN 64 * @param loc source location information 65 * 66 * Shutdown the runtime library. This is also optional, and even if called will not 67 * do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to zero. 68 */ 69 void 70 __kmpc_end(ident_t *loc) 71 { 72 // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end() call no-op. 73 // However, this can be overridden with KMP_IGNORE_MPPEND environment variable. 74 // If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend() returns FALSE and __kmpc_end() 75 // will unregister this root (it can cause library shut down). 76 if (__kmp_ignore_mppend() == FALSE) { 77 KC_TRACE( 10, ("__kmpc_end: called\n" ) ); 78 KA_TRACE( 30, ("__kmpc_end\n" )); 79 80 __kmp_internal_end_thread( -1 ); 81 } 82 } 83 84 /*! 85 @ingroup THREAD_STATES 86 @param loc Source location information. 87 @return The global thread index of the active thread. 88 89 This function can be called in any context. 90 91 If the runtime has ony been entered at the outermost level from a 92 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is that 93 which would be returned by omp_get_thread_num() in the outermost 94 active parallel construct. (Or zero if there is no active parallel 95 construct, since the master thread is necessarily thread zero). 96 97 If multiple non-OpenMP threads all enter an OpenMP construct then this 98 will be a unique thread identifier among all the threads created by 99 the OpenMP runtime (but the value cannote be defined in terms of 100 OpenMP thread ids returned by omp_get_thread_num()). 
101 102 */ 103 kmp_int32 104 __kmpc_global_thread_num(ident_t *loc) 105 { 106 kmp_int32 gtid = __kmp_entry_gtid(); 107 108 KC_TRACE( 10, ("__kmpc_global_thread_num: T#%d\n", gtid ) ); 109 110 return gtid; 111 } 112 113 /*! 114 @ingroup THREAD_STATES 115 @param loc Source location information. 116 @return The number of threads under control of the OpenMP<sup>*</sup> runtime 117 118 This function can be called in any context. 119 It returns the total number of threads under the control of the OpenMP runtime. That is 120 not a number that can be determined by any OpenMP standard calls, since the library may be 121 called from more than one non-OpenMP thread, and this reflects the total over all such calls. 122 Similarly the runtime maintains underlying threads even when they are not active (since the cost 123 of creating and destroying OS threads is high), this call counts all such threads even if they are not 124 waiting for work. 125 */ 126 kmp_int32 127 __kmpc_global_num_threads(ident_t *loc) 128 { 129 KC_TRACE(10,("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth)); 130 131 return TCR_4(__kmp_all_nth); 132 } 133 134 /*! 135 @ingroup THREAD_STATES 136 @param loc Source location information. 137 @return The thread number of the calling thread in the innermost active parallel construct. 138 139 */ 140 kmp_int32 141 __kmpc_bound_thread_num(ident_t *loc) 142 { 143 KC_TRACE( 10, ("__kmpc_bound_thread_num: called\n" ) ); 144 return __kmp_tid_from_gtid( __kmp_entry_gtid() ); 145 } 146 147 /*! 148 @ingroup THREAD_STATES 149 @param loc Source location information. 150 @return The number of threads in the innermost active parallel construct. 151 */ 152 kmp_int32 153 __kmpc_bound_num_threads(ident_t *loc) 154 { 155 KC_TRACE( 10, ("__kmpc_bound_num_threads: called\n" ) ); 156 157 return __kmp_entry_thread() -> th.th_team -> t.t_nproc; 158 } 159 160 /*! 161 * @ingroup DEPRECATED 162 * @param loc location description 163 * 164 * This function need not be called. It always returns TRUE. 165 */ 166 kmp_int32 167 __kmpc_ok_to_fork(ident_t *loc) 168 { 169 #ifndef KMP_DEBUG 170 171 return TRUE; 172 173 #else 174 175 const char *semi2; 176 const char *semi3; 177 int line_no; 178 179 if (__kmp_par_range == 0) { 180 return TRUE; 181 } 182 semi2 = loc->psource; 183 if (semi2 == NULL) { 184 return TRUE; 185 } 186 semi2 = strchr(semi2, ';'); 187 if (semi2 == NULL) { 188 return TRUE; 189 } 190 semi2 = strchr(semi2 + 1, ';'); 191 if (semi2 == NULL) { 192 return TRUE; 193 } 194 if (__kmp_par_range_filename[0]) { 195 const char *name = semi2 - 1; 196 while ((name > loc->psource) && (*name != '/') && (*name != ';')) { 197 name--; 198 } 199 if ((*name == '/') || (*name == ';')) { 200 name++; 201 } 202 if (strncmp(__kmp_par_range_filename, name, semi2 - name)) { 203 return __kmp_par_range < 0; 204 } 205 } 206 semi3 = strchr(semi2 + 1, ';'); 207 if (__kmp_par_range_routine[0]) { 208 if ((semi3 != NULL) && (semi3 > semi2) 209 && (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) { 210 return __kmp_par_range < 0; 211 } 212 } 213 if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) { 214 if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) { 215 return __kmp_par_range > 0; 216 } 217 return __kmp_par_range < 0; 218 } 219 return TRUE; 220 221 #endif /* KMP_DEBUG */ 222 223 } 224 225 /*! 226 @ingroup THREAD_STATES 227 @param loc Source location information. 228 @return 1 if this thread is executing inside an active parallel region, zero if not. 
229 */ 230 kmp_int32 231 __kmpc_in_parallel( ident_t *loc ) 232 { 233 return __kmp_entry_thread() -> th.th_root -> r.r_active; 234 } 235 236 /*! 237 @ingroup PARALLEL 238 @param loc source location information 239 @param global_tid global thread number 240 @param num_threads number of threads requested for this parallel construct 241 242 Set the number of threads to be used by the next fork spawned by this thread. 243 This call is only required if the parallel construct has a `num_threads` clause. 244 */ 245 void 246 __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads ) 247 { 248 KA_TRACE( 20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n", 249 global_tid, num_threads ) ); 250 251 __kmp_push_num_threads( loc, global_tid, num_threads ); 252 } 253 254 void 255 __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid ) 256 { 257 KA_TRACE( 20, ("__kmpc_pop_num_threads: enter\n" ) ); 258 259 /* the num_threads are automatically popped */ 260 } 261 262 263 #if OMP_40_ENABLED 264 265 void 266 __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, kmp_int32 proc_bind ) 267 { 268 KA_TRACE( 20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", 269 global_tid, proc_bind ) ); 270 271 __kmp_push_proc_bind( loc, global_tid, (kmp_proc_bind_t)proc_bind ); 272 } 273 274 #endif /* OMP_40_ENABLED */ 275 276 277 /*! 278 @ingroup PARALLEL 279 @param loc source location information 280 @param argc total number of arguments in the ellipsis 281 @param microtask pointer to callback routine consisting of outlined parallel construct 282 @param ... pointers to shared variables that aren't global 283 284 Do the actual fork and call the microtask in the relevant number of threads. 285 */ 286 void 287 __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) 288 { 289 int gtid = __kmp_entry_gtid(); 290 291 #if (KMP_STATS_ENABLED) 292 int inParallel = __kmpc_in_parallel(loc); 293 if (inParallel) 294 { 295 KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL); 296 } 297 else 298 { 299 KMP_COUNT_BLOCK(OMP_PARALLEL); 300 } 301 #endif 302 303 // maybe to save thr_state is enough here 304 { 305 va_list ap; 306 va_start( ap, microtask ); 307 308 #if OMPT_SUPPORT 309 ompt_frame_t* ompt_frame; 310 if (ompt_enabled) { 311 kmp_info_t *master_th = __kmp_threads[ gtid ]; 312 kmp_team_t *parent_team = master_th->th.th_team; 313 ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info; 314 if (lwt) 315 ompt_frame = &(lwt->ompt_task_info.frame); 316 else 317 { 318 int tid = __kmp_tid_from_gtid( gtid ); 319 ompt_frame = &(parent_team->t.t_implicit_task_taskdata[tid]. 320 ompt_task_info.frame); 321 } 322 ompt_frame->reenter_runtime_frame = __builtin_frame_address(1); 323 } 324 #endif 325 326 #if INCLUDE_SSC_MARKS 327 SSC_MARK_FORKING(); 328 #endif 329 __kmp_fork_call( loc, gtid, fork_context_intel, 330 argc, 331 #if OMPT_SUPPORT 332 VOLATILE_CAST(void *) microtask, // "unwrapped" task 333 #endif 334 VOLATILE_CAST(microtask_t) microtask, // "wrapped" task 335 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, 336 /* TODO: revert workaround for Intel(R) 64 tracker #96 */ 337 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX 338 &ap 339 #else 340 ap 341 #endif 342 ); 343 #if INCLUDE_SSC_MARKS 344 SSC_MARK_JOINING(); 345 #endif 346 __kmp_join_call( loc, gtid 347 #if OMPT_SUPPORT 348 , fork_context_intel 349 #endif 350 ); 351 352 va_end( ap ); 353 354 } 355 } 356 357 #if OMP_40_ENABLED 358 /*! 
359 @ingroup PARALLEL 360 @param loc source location information 361 @param global_tid global thread number 362 @param num_teams number of teams requested for the teams construct 363 @param num_threads number of threads per team requested for the teams construct 364 365 Set the number of teams to be used by the teams construct. 366 This call is only required if the teams construct has a `num_teams` clause 367 or a `thread_limit` clause (or both). 368 */ 369 void 370 __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_teams, kmp_int32 num_threads ) 371 { 372 KA_TRACE( 20, ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n", 373 global_tid, num_teams, num_threads ) ); 374 375 __kmp_push_num_teams( loc, global_tid, num_teams, num_threads ); 376 } 377 378 /*! 379 @ingroup PARALLEL 380 @param loc source location information 381 @param argc total number of arguments in the ellipsis 382 @param microtask pointer to callback routine consisting of outlined teams construct 383 @param ... pointers to shared variables that aren't global 384 385 Do the actual fork and call the microtask in the relevant number of threads. 386 */ 387 void 388 __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) 389 { 390 int gtid = __kmp_entry_gtid(); 391 kmp_info_t *this_thr = __kmp_threads[ gtid ]; 392 va_list ap; 393 va_start( ap, microtask ); 394 395 KMP_COUNT_BLOCK(OMP_TEAMS); 396 397 // remember teams entry point and nesting level 398 this_thr->th.th_teams_microtask = microtask; 399 this_thr->th.th_teams_level = this_thr->th.th_team->t.t_level; // AC: can be >0 on host 400 401 #if OMPT_SUPPORT 402 kmp_team_t *parent_team = this_thr->th.th_team; 403 int tid = __kmp_tid_from_gtid( gtid ); 404 if (ompt_enabled) { 405 parent_team->t.t_implicit_task_taskdata[tid]. 406 ompt_task_info.frame.reenter_runtime_frame = __builtin_frame_address(1); 407 } 408 #endif 409 410 // check if __kmpc_push_num_teams called, set default number of teams otherwise 411 if ( this_thr->th.th_teams_size.nteams == 0 ) { 412 __kmp_push_num_teams( loc, gtid, 0, 0 ); 413 } 414 KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1); 415 KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1); 416 KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1); 417 418 __kmp_fork_call( loc, gtid, fork_context_intel, 419 argc, 420 #if OMPT_SUPPORT 421 VOLATILE_CAST(void *) microtask, // "unwrapped" task 422 #endif 423 VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task 424 VOLATILE_CAST(launch_t) __kmp_invoke_teams_master, 425 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX 426 &ap 427 #else 428 ap 429 #endif 430 ); 431 __kmp_join_call( loc, gtid 432 #if OMPT_SUPPORT 433 , fork_context_intel 434 #endif 435 ); 436 437 this_thr->th.th_teams_microtask = NULL; 438 this_thr->th.th_teams_level = 0; 439 *(kmp_int64*)(&this_thr->th.th_teams_size) = 0L; 440 va_end( ap ); 441 } 442 #endif /* OMP_40_ENABLED */ 443 444 445 // 446 // I don't think this function should ever have been exported. 447 // The __kmpc_ prefix was misapplied. I'm fairly certain that no generated 448 // openmp code ever called it, but it's been exported from the RTL for so 449 // long that I'm afraid to remove the definition. 450 // 451 int 452 __kmpc_invoke_task_func( int gtid ) 453 { 454 return __kmp_invoke_task_func( gtid ); 455 } 456 457 /*! 458 @ingroup PARALLEL 459 @param loc source location information 460 @param global_tid global thread number 461 462 Enter a serialized parallel construct. 
This interface is used to handle a 463 conditional parallel region, like this, 464 @code 465 #pragma omp parallel if (condition) 466 @endcode 467 when the condition is false. 468 */ 469 void 470 __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) 471 { 472 // The implementation is now in kmp_runtime.cpp so that it can share static 473 // functions with kmp_fork_call since the tasks to be done are similar in 474 // each case. 475 __kmp_serialized_parallel(loc, global_tid); 476 } 477 478 /*! 479 @ingroup PARALLEL 480 @param loc source location information 481 @param global_tid global thread number 482 483 Leave a serialized parallel construct. 484 */ 485 void 486 __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) 487 { 488 kmp_internal_control_t *top; 489 kmp_info_t *this_thr; 490 kmp_team_t *serial_team; 491 492 KC_TRACE( 10, ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid ) ); 493 494 /* skip all this code for autopar serialized loops since it results in 495 unacceptable overhead */ 496 if( loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR ) ) 497 return; 498 499 // Not autopar code 500 if( ! TCR_4( __kmp_init_parallel ) ) 501 __kmp_parallel_initialize(); 502 503 this_thr = __kmp_threads[ global_tid ]; 504 serial_team = this_thr->th.th_serial_team; 505 506 #if OMP_45_ENABLED 507 kmp_task_team_t * task_team = this_thr->th.th_task_team; 508 509 // we need to wait for the proxy tasks before finishing the thread 510 if ( task_team != NULL && task_team->tt.tt_found_proxy_tasks ) 511 __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL) ); // is an ITT object needed here? 512 #endif 513 514 KMP_MB(); 515 KMP_DEBUG_ASSERT( serial_team ); 516 KMP_ASSERT( serial_team -> t.t_serialized ); 517 KMP_DEBUG_ASSERT( this_thr -> th.th_team == serial_team ); 518 KMP_DEBUG_ASSERT( serial_team != this_thr->th.th_root->r.r_root_team ); 519 KMP_DEBUG_ASSERT( serial_team -> t.t_threads ); 520 KMP_DEBUG_ASSERT( serial_team -> t.t_threads[0] == this_thr ); 521 522 /* If necessary, pop the internal control stack values and replace the team values */ 523 top = serial_team -> t.t_control_stack_top; 524 if ( top && top -> serial_nesting_level == serial_team -> t.t_serialized ) { 525 copy_icvs( &serial_team -> t.t_threads[0] -> th.th_current_task -> td_icvs, top ); 526 serial_team -> t.t_control_stack_top = top -> next; 527 __kmp_free(top); 528 } 529 530 //if( serial_team -> t.t_serialized > 1 ) 531 serial_team -> t.t_level--; 532 533 /* pop dispatch buffers stack */ 534 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer); 535 { 536 dispatch_private_info_t * disp_buffer = serial_team->t.t_dispatch->th_disp_buffer; 537 serial_team->t.t_dispatch->th_disp_buffer = 538 serial_team->t.t_dispatch->th_disp_buffer->next; 539 __kmp_free( disp_buffer ); 540 } 541 542 -- serial_team -> t.t_serialized; 543 if ( serial_team -> t.t_serialized == 0 ) { 544 545 /* return to the parallel section */ 546 547 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 548 if ( __kmp_inherit_fp_control && serial_team->t.t_fp_control_saved ) { 549 __kmp_clear_x87_fpu_status_word(); 550 __kmp_load_x87_fpu_control_word( &serial_team->t.t_x87_fpu_control_word ); 551 __kmp_load_mxcsr( &serial_team->t.t_mxcsr ); 552 } 553 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 554 555 this_thr -> th.th_team = serial_team -> t.t_parent; 556 this_thr -> th.th_info.ds.ds_tid = serial_team -> t.t_master_tid; 557 558 /* restore values cached in the thread */ 559 this_thr -> th.th_team_nproc = serial_team -> t.t_parent -> 
t.t_nproc; /* JPH */ 560 this_thr -> th.th_team_master = serial_team -> t.t_parent -> t.t_threads[0]; /* JPH */ 561 this_thr -> th.th_team_serialized = this_thr -> th.th_team -> t.t_serialized; 562 563 /* TODO the below shouldn't need to be adjusted for serialized teams */ 564 this_thr -> th.th_dispatch = & this_thr -> th.th_team -> 565 t.t_dispatch[ serial_team -> t.t_master_tid ]; 566 567 __kmp_pop_current_task_from_thread( this_thr ); 568 569 KMP_ASSERT( this_thr -> th.th_current_task -> td_flags.executing == 0 ); 570 this_thr -> th.th_current_task -> td_flags.executing = 1; 571 572 if ( __kmp_tasking_mode != tskm_immediate_exec ) { 573 // Copy the task team from the new child / old parent team to the thread. 574 this_thr->th.th_task_team = this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]; 575 KA_TRACE( 20, ( "__kmpc_end_serialized_parallel: T#%d restoring task_team %p / team %p\n", 576 global_tid, this_thr -> th.th_task_team, this_thr -> th.th_team ) ); 577 } 578 } else { 579 if ( __kmp_tasking_mode != tskm_immediate_exec ) { 580 KA_TRACE( 20, ( "__kmpc_end_serialized_parallel: T#%d decreasing nesting depth of serial team %p to %d\n", 581 global_tid, serial_team, serial_team -> t.t_serialized ) ); 582 } 583 } 584 585 if ( __kmp_env_consistency_check ) 586 __kmp_pop_parallel( global_tid, NULL ); 587 } 588 589 /*! 590 @ingroup SYNCHRONIZATION 591 @param loc source location information. 592 593 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though 594 depending on the memory ordering convention obeyed by the compiler 595 even that may not be necessary). 596 */ 597 void 598 __kmpc_flush(ident_t *loc) 599 { 600 KC_TRACE( 10, ("__kmpc_flush: called\n" ) ); 601 602 /* need explicit __mf() here since use volatile instead in library */ 603 KMP_MB(); /* Flush all pending memory write invalidates. */ 604 605 #if ( KMP_ARCH_X86 || KMP_ARCH_X86_64 ) 606 #if KMP_MIC 607 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used. 608 // We shouldn't need it, though, since the ABI rules require that 609 // * If the compiler generates NGO stores it also generates the fence 610 // * If users hand-code NGO stores they should insert the fence 611 // therefore no incomplete unordered stores should be visible. 612 #else 613 // C74404 614 // This is to address non-temporal store instructions (sfence needed). 615 // The clflush instruction is addressed either (mfence needed). 616 // Probably the non-temporal load monvtdqa instruction should also be addressed. 617 // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2. 618 if ( ! __kmp_cpuinfo.initialized ) { 619 __kmp_query_cpuid( & __kmp_cpuinfo ); 620 }; // if 621 if ( ! __kmp_cpuinfo.sse2 ) { 622 // CPU cannot execute SSE2 instructions. 623 } else { 624 #if KMP_COMPILER_ICC 625 _mm_mfence(); 626 #elif KMP_COMPILER_MSVC 627 MemoryBarrier(); 628 #else 629 __sync_synchronize(); 630 #endif // KMP_COMPILER_ICC 631 }; // if 632 #endif // KMP_MIC 633 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64) 634 // Nothing to see here move along 635 #elif KMP_ARCH_PPC64 636 // Nothing needed here (we have a real MB above). 637 #if KMP_OS_CNK 638 // The flushing thread needs to yield here; this prevents a 639 // busy-waiting thread from saturating the pipeline. 
flush is 640 // often used in loops like this: 641 // while (!flag) { 642 // #pragma omp flush(flag) 643 // } 644 // and adding the yield here is good for at least a 10x speedup 645 // when running >2 threads per core (on the NAS LU benchmark). 646 __kmp_yield(TRUE); 647 #endif 648 #else 649 #error Unknown or unsupported architecture 650 #endif 651 652 } 653 654 /* -------------------------------------------------------------------------- */ 655 656 /* -------------------------------------------------------------------------- */ 657 658 /*! 659 @ingroup SYNCHRONIZATION 660 @param loc source location information 661 @param global_tid thread id. 662 663 Execute a barrier. 664 */ 665 void 666 __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) 667 { 668 KMP_COUNT_BLOCK(OMP_BARRIER); 669 KC_TRACE( 10, ("__kmpc_barrier: called T#%d\n", global_tid ) ); 670 671 if (! TCR_4(__kmp_init_parallel)) 672 __kmp_parallel_initialize(); 673 674 if ( __kmp_env_consistency_check ) { 675 if ( loc == 0 ) { 676 KMP_WARNING( ConstructIdentInvalid ); // ??? What does it mean for the user? 677 }; // if 678 679 __kmp_check_barrier( global_tid, ct_barrier, loc ); 680 } 681 682 #if OMPT_SUPPORT && OMPT_TRACE 683 ompt_frame_t * ompt_frame; 684 if (ompt_enabled ) { 685 ompt_frame = __ompt_get_task_frame_internal(0); 686 if ( ompt_frame->reenter_runtime_frame == NULL ) 687 ompt_frame->reenter_runtime_frame = __builtin_frame_address(1); 688 } 689 #endif 690 __kmp_threads[ global_tid ]->th.th_ident = loc; 691 // TODO: explicit barrier_wait_id: 692 // this function is called when 'barrier' directive is present or 693 // implicit barrier at the end of a worksharing construct. 694 // 1) better to add a per-thread barrier counter to a thread data structure 695 // 2) set to 0 when a new team is created 696 // 4) no sync is required 697 698 __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL ); 699 #if OMPT_SUPPORT && OMPT_TRACE 700 if (ompt_enabled ) { 701 ompt_frame->reenter_runtime_frame = NULL; 702 } 703 #endif 704 } 705 706 /* The BARRIER for a MASTER section is always explicit */ 707 /*! 708 @ingroup WORK_SHARING 709 @param loc source location information. 710 @param global_tid global thread number . 711 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise. 712 */ 713 kmp_int32 714 __kmpc_master(ident_t *loc, kmp_int32 global_tid) 715 { 716 int status = 0; 717 718 KC_TRACE( 10, ("__kmpc_master: called T#%d\n", global_tid ) ); 719 720 if( ! 
TCR_4( __kmp_init_parallel ) ) 721 __kmp_parallel_initialize(); 722 723 if( KMP_MASTER_GTID( global_tid )) { 724 KMP_COUNT_BLOCK(OMP_MASTER); 725 KMP_PUSH_PARTITIONED_TIMER(OMP_master); 726 status = 1; 727 } 728 729 #if OMPT_SUPPORT && OMPT_TRACE 730 if (status) { 731 if (ompt_enabled && 732 ompt_callbacks.ompt_callback(ompt_event_master_begin)) { 733 kmp_info_t *this_thr = __kmp_threads[ global_tid ]; 734 kmp_team_t *team = this_thr -> th.th_team; 735 736 int tid = __kmp_tid_from_gtid( global_tid ); 737 ompt_callbacks.ompt_callback(ompt_event_master_begin)( 738 team->t.ompt_team_info.parallel_id, 739 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id); 740 } 741 } 742 #endif 743 744 if ( __kmp_env_consistency_check ) { 745 #if KMP_USE_DYNAMIC_LOCK 746 if (status) 747 __kmp_push_sync( global_tid, ct_master, loc, NULL, 0 ); 748 else 749 __kmp_check_sync( global_tid, ct_master, loc, NULL, 0 ); 750 #else 751 if (status) 752 __kmp_push_sync( global_tid, ct_master, loc, NULL ); 753 else 754 __kmp_check_sync( global_tid, ct_master, loc, NULL ); 755 #endif 756 } 757 758 return status; 759 } 760 761 /*! 762 @ingroup WORK_SHARING 763 @param loc source location information. 764 @param global_tid global thread number . 765 766 Mark the end of a <tt>master</tt> region. This should only be called by the thread 767 that executes the <tt>master</tt> region. 768 */ 769 void 770 __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) 771 { 772 KC_TRACE( 10, ("__kmpc_end_master: called T#%d\n", global_tid ) ); 773 774 KMP_DEBUG_ASSERT( KMP_MASTER_GTID( global_tid )); 775 KMP_POP_PARTITIONED_TIMER(); 776 777 #if OMPT_SUPPORT && OMPT_TRACE 778 kmp_info_t *this_thr = __kmp_threads[ global_tid ]; 779 kmp_team_t *team = this_thr -> th.th_team; 780 if (ompt_enabled && 781 ompt_callbacks.ompt_callback(ompt_event_master_end)) { 782 int tid = __kmp_tid_from_gtid( global_tid ); 783 ompt_callbacks.ompt_callback(ompt_event_master_end)( 784 team->t.ompt_team_info.parallel_id, 785 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id); 786 } 787 #endif 788 789 if ( __kmp_env_consistency_check ) { 790 if( global_tid < 0 ) 791 KMP_WARNING( ThreadIdentInvalid ); 792 793 if( KMP_MASTER_GTID( global_tid )) 794 __kmp_pop_sync( global_tid, ct_master, loc ); 795 } 796 } 797 798 /*! 799 @ingroup WORK_SHARING 800 @param loc source location information. 801 @param gtid global thread number. 802 803 Start execution of an <tt>ordered</tt> construct. 804 */ 805 void 806 __kmpc_ordered( ident_t * loc, kmp_int32 gtid ) 807 { 808 int cid = 0; 809 kmp_info_t *th; 810 KMP_DEBUG_ASSERT( __kmp_init_serial ); 811 812 KC_TRACE( 10, ("__kmpc_ordered: called T#%d\n", gtid )); 813 814 if (! 
TCR_4(__kmp_init_parallel)) 815 __kmp_parallel_initialize(); 816 817 #if USE_ITT_BUILD 818 __kmp_itt_ordered_prep( gtid ); 819 // TODO: ordered_wait_id 820 #endif /* USE_ITT_BUILD */ 821 822 th = __kmp_threads[ gtid ]; 823 824 #if OMPT_SUPPORT && OMPT_TRACE 825 if (ompt_enabled) { 826 /* OMPT state update */ 827 th->th.ompt_thread_info.wait_id = (uint64_t) loc; 828 th->th.ompt_thread_info.state = ompt_state_wait_ordered; 829 830 /* OMPT event callback */ 831 if (ompt_callbacks.ompt_callback(ompt_event_wait_ordered)) { 832 ompt_callbacks.ompt_callback(ompt_event_wait_ordered)( 833 th->th.ompt_thread_info.wait_id); 834 } 835 } 836 #endif 837 838 if ( th -> th.th_dispatch -> th_deo_fcn != 0 ) 839 (*th->th.th_dispatch->th_deo_fcn)( & gtid, & cid, loc ); 840 else 841 __kmp_parallel_deo( & gtid, & cid, loc ); 842 843 #if OMPT_SUPPORT && OMPT_TRACE 844 if (ompt_enabled) { 845 /* OMPT state update */ 846 th->th.ompt_thread_info.state = ompt_state_work_parallel; 847 th->th.ompt_thread_info.wait_id = 0; 848 849 /* OMPT event callback */ 850 if (ompt_callbacks.ompt_callback(ompt_event_acquired_ordered)) { 851 ompt_callbacks.ompt_callback(ompt_event_acquired_ordered)( 852 th->th.ompt_thread_info.wait_id); 853 } 854 } 855 #endif 856 857 #if USE_ITT_BUILD 858 __kmp_itt_ordered_start( gtid ); 859 #endif /* USE_ITT_BUILD */ 860 } 861 862 /*! 863 @ingroup WORK_SHARING 864 @param loc source location information. 865 @param gtid global thread number. 866 867 End execution of an <tt>ordered</tt> construct. 868 */ 869 void 870 __kmpc_end_ordered( ident_t * loc, kmp_int32 gtid ) 871 { 872 int cid = 0; 873 kmp_info_t *th; 874 875 KC_TRACE( 10, ("__kmpc_end_ordered: called T#%d\n", gtid ) ); 876 877 #if USE_ITT_BUILD 878 __kmp_itt_ordered_end( gtid ); 879 // TODO: ordered_wait_id 880 #endif /* USE_ITT_BUILD */ 881 882 th = __kmp_threads[ gtid ]; 883 884 if ( th -> th.th_dispatch -> th_dxo_fcn != 0 ) 885 (*th->th.th_dispatch->th_dxo_fcn)( & gtid, & cid, loc ); 886 else 887 __kmp_parallel_dxo( & gtid, & cid, loc ); 888 889 #if OMPT_SUPPORT && OMPT_BLAME 890 if (ompt_enabled && 891 ompt_callbacks.ompt_callback(ompt_event_release_ordered)) { 892 ompt_callbacks.ompt_callback(ompt_event_release_ordered)( 893 th->th.ompt_thread_info.wait_id); 894 } 895 #endif 896 } 897 898 #if KMP_USE_DYNAMIC_LOCK 899 900 static __forceinline void 901 __kmp_init_indirect_csptr(kmp_critical_name * crit, ident_t const * loc, kmp_int32 gtid, kmp_indirect_locktag_t tag) 902 { 903 // Pointer to the allocated indirect lock is written to crit, while indexing is ignored. 904 void *idx; 905 kmp_indirect_lock_t **lck; 906 lck = (kmp_indirect_lock_t **)crit; 907 kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag); 908 KMP_I_LOCK_FUNC(ilk, init)(ilk->lock); 909 KMP_SET_I_LOCK_LOCATION(ilk, loc); 910 KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section); 911 KA_TRACE(20, ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag)); 912 #if USE_ITT_BUILD 913 __kmp_itt_critical_creating(ilk->lock, loc); 914 #endif 915 int status = KMP_COMPARE_AND_STORE_PTR(lck, 0, ilk); 916 if (status == 0) { 917 #if USE_ITT_BUILD 918 __kmp_itt_critical_destroyed(ilk->lock); 919 #endif 920 // We don't really need to destroy the unclaimed lock here since it will be cleaned up at program exit. 
921 //KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx); 922 } 923 KMP_DEBUG_ASSERT(*lck != NULL); 924 } 925 926 // Fast-path acquire tas lock 927 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid) { \ 928 kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \ 929 if (l->lk.poll != KMP_LOCK_FREE(tas) || \ 930 ! KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas), KMP_LOCK_BUSY(gtid+1, tas))) { \ 931 kmp_uint32 spins; \ 932 KMP_FSYNC_PREPARE(l); \ 933 KMP_INIT_YIELD(spins); \ 934 if (TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \ 935 KMP_YIELD(TRUE); \ 936 } else { \ 937 KMP_YIELD_SPIN(spins); \ 938 } \ 939 kmp_backoff_t backoff = __kmp_spin_backoff_params; \ 940 while (l->lk.poll != KMP_LOCK_FREE(tas) || \ 941 ! KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas), KMP_LOCK_BUSY(gtid+1, tas))) { \ 942 __kmp_spin_backoff(&backoff); \ 943 if (TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \ 944 KMP_YIELD(TRUE); \ 945 } else { \ 946 KMP_YIELD_SPIN(spins); \ 947 } \ 948 } \ 949 } \ 950 KMP_FSYNC_ACQUIRED(l); \ 951 } 952 953 // Fast-path test tas lock 954 #define KMP_TEST_TAS_LOCK(lock, gtid, rc) { \ 955 kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \ 956 rc = l->lk.poll == KMP_LOCK_FREE(tas) && \ 957 KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas), KMP_LOCK_BUSY(gtid+1, tas)); \ 958 } 959 960 // Fast-path release tas lock 961 #define KMP_RELEASE_TAS_LOCK(lock, gtid) { \ 962 TCW_4(((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); \ 963 KMP_MB(); \ 964 } 965 966 #if KMP_USE_FUTEX 967 968 # include <unistd.h> 969 # include <sys/syscall.h> 970 # ifndef FUTEX_WAIT 971 # define FUTEX_WAIT 0 972 # endif 973 # ifndef FUTEX_WAKE 974 # define FUTEX_WAKE 1 975 # endif 976 977 // Fast-path acquire futex lock 978 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid) { \ 979 kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \ 980 kmp_int32 gtid_code = (gtid+1) << 1; \ 981 KMP_MB(); \ 982 KMP_FSYNC_PREPARE(ftx); \ 983 kmp_int32 poll_val; \ 984 while ((poll_val = KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), KMP_LOCK_FREE(futex), \ 985 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) { \ 986 kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1; \ 987 if (!cond) { \ 988 if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val, poll_val | KMP_LOCK_BUSY(1, futex))) { \ 989 continue; \ 990 } \ 991 poll_val |= KMP_LOCK_BUSY(1, futex); \ 992 } \ 993 kmp_int32 rc; \ 994 if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val, NULL, NULL, 0)) != 0) { \ 995 continue; \ 996 } \ 997 gtid_code |= 1; \ 998 } \ 999 KMP_FSYNC_ACQUIRED(ftx); \ 1000 } 1001 1002 // Fast-path test futex lock 1003 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc) { \ 1004 kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \ 1005 if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex), KMP_LOCK_BUSY(gtid+1 << 1, futex))) { \ 1006 KMP_FSYNC_ACQUIRED(ftx); \ 1007 rc = TRUE; \ 1008 } else { \ 1009 rc = FALSE; \ 1010 } \ 1011 } 1012 1013 // Fast-path release futex lock 1014 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid) { \ 1015 kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \ 1016 KMP_MB(); \ 1017 KMP_FSYNC_RELEASING(ftx); \ 1018 kmp_int32 poll_val = KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex)); \ 1019 if (KMP_LOCK_STRIP(poll_val) & 1) { \ 1020 syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE, KMP_LOCK_BUSY(1, futex), NULL, NULL, 0); \ 1021 } \ 1022 KMP_MB(); \ 1023 KMP_YIELD(TCR_4(__kmp_nth) > (__kmp_avail_proc ? 
__kmp_avail_proc : __kmp_xproc)); \ 1024 } 1025 1026 #endif // KMP_USE_FUTEX 1027 1028 #else // KMP_USE_DYNAMIC_LOCK 1029 1030 static kmp_user_lock_p 1031 __kmp_get_critical_section_ptr( kmp_critical_name * crit, ident_t const * loc, kmp_int32 gtid ) 1032 { 1033 kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit; 1034 1035 // 1036 // Because of the double-check, the following load 1037 // doesn't need to be volatile. 1038 // 1039 kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR( *lck_pp ); 1040 1041 if ( lck == NULL ) { 1042 void * idx; 1043 1044 // Allocate & initialize the lock. 1045 // Remember allocated locks in table in order to free them in __kmp_cleanup() 1046 lck = __kmp_user_lock_allocate( &idx, gtid, kmp_lf_critical_section ); 1047 __kmp_init_user_lock_with_checks( lck ); 1048 __kmp_set_user_lock_location( lck, loc ); 1049 #if USE_ITT_BUILD 1050 __kmp_itt_critical_creating( lck ); 1051 // __kmp_itt_critical_creating() should be called *before* the first usage of underlying 1052 // lock. It is the only place where we can guarantee it. There are chances the lock will 1053 // destroyed with no usage, but it is not a problem, because this is not real event seen 1054 // by user but rather setting name for object (lock). See more details in kmp_itt.h. 1055 #endif /* USE_ITT_BUILD */ 1056 1057 // 1058 // Use a cmpxchg instruction to slam the start of the critical 1059 // section with the lock pointer. If another thread beat us 1060 // to it, deallocate the lock, and use the lock that the other 1061 // thread allocated. 1062 // 1063 int status = KMP_COMPARE_AND_STORE_PTR( lck_pp, 0, lck ); 1064 1065 if ( status == 0 ) { 1066 // Deallocate the lock and reload the value. 1067 #if USE_ITT_BUILD 1068 __kmp_itt_critical_destroyed( lck ); 1069 // Let ITT know the lock is destroyed and the same memory location may be reused for 1070 // another purpose. 1071 #endif /* USE_ITT_BUILD */ 1072 __kmp_destroy_user_lock_with_checks( lck ); 1073 __kmp_user_lock_free( &idx, gtid, lck ); 1074 lck = (kmp_user_lock_p)TCR_PTR( *lck_pp ); 1075 KMP_DEBUG_ASSERT( lck != NULL ); 1076 } 1077 } 1078 return lck; 1079 } 1080 1081 #endif // KMP_USE_DYNAMIC_LOCK 1082 1083 /*! 1084 @ingroup WORK_SHARING 1085 @param loc source location information. 1086 @param global_tid global thread number . 1087 @param crit identity of the critical section. This could be a pointer to a lock associated with the critical section, or 1088 some other suitably unique value. 1089 1090 Enter code protected by a `critical` construct. 1091 This function blocks until the executing thread can enter the critical section. 
1092 */ 1093 void 1094 __kmpc_critical( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit ) 1095 { 1096 #if KMP_USE_DYNAMIC_LOCK 1097 __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none); 1098 #else 1099 KMP_COUNT_BLOCK(OMP_CRITICAL); 1100 KMP_TIME_PARTITIONED_BLOCK(OMP_critical_wait); /* Time spent waiting to enter the critical section */ 1101 kmp_user_lock_p lck; 1102 1103 KC_TRACE( 10, ("__kmpc_critical: called T#%d\n", global_tid ) ); 1104 1105 //TODO: add THR_OVHD_STATE 1106 1107 KMP_CHECK_USER_LOCK_INIT(); 1108 1109 if ( ( __kmp_user_lock_kind == lk_tas ) 1110 && ( sizeof( lck->tas.lk.poll ) <= OMP_CRITICAL_SIZE ) ) { 1111 lck = (kmp_user_lock_p)crit; 1112 } 1113 #if KMP_USE_FUTEX 1114 else if ( ( __kmp_user_lock_kind == lk_futex ) 1115 && ( sizeof( lck->futex.lk.poll ) <= OMP_CRITICAL_SIZE ) ) { 1116 lck = (kmp_user_lock_p)crit; 1117 } 1118 #endif 1119 else { // ticket, queuing or drdpa 1120 lck = __kmp_get_critical_section_ptr( crit, loc, global_tid ); 1121 } 1122 1123 if ( __kmp_env_consistency_check ) 1124 __kmp_push_sync( global_tid, ct_critical, loc, lck ); 1125 1126 /* since the critical directive binds to all threads, not just 1127 * the current team we have to check this even if we are in a 1128 * serialized team */ 1129 /* also, even if we are the uber thread, we still have to conduct the lock, 1130 * as we have to contend with sibling threads */ 1131 1132 #if USE_ITT_BUILD 1133 __kmp_itt_critical_acquiring( lck ); 1134 #endif /* USE_ITT_BUILD */ 1135 // Value of 'crit' should be good for using as a critical_id of the critical section directive. 1136 __kmp_acquire_user_lock_with_checks( lck, global_tid ); 1137 1138 #if USE_ITT_BUILD 1139 __kmp_itt_critical_acquired( lck ); 1140 #endif /* USE_ITT_BUILD */ 1141 1142 KMP_START_EXPLICIT_TIMER(OMP_critical); 1143 KA_TRACE( 15, ("__kmpc_critical: done T#%d\n", global_tid )); 1144 #endif // KMP_USE_DYNAMIC_LOCK 1145 } 1146 1147 #if KMP_USE_DYNAMIC_LOCK 1148 1149 // Converts the given hint to an internal lock implementation 1150 static __forceinline kmp_dyna_lockseq_t 1151 __kmp_map_hint_to_lock(uintptr_t hint) 1152 { 1153 #if KMP_USE_TSX 1154 # define KMP_TSX_LOCK(seq) lockseq_##seq 1155 #else 1156 # define KMP_TSX_LOCK(seq) __kmp_user_lock_seq 1157 #endif 1158 1159 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 1160 # define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm) 1161 #else 1162 # define KMP_CPUINFO_RTM 0 1163 #endif 1164 1165 // Hints that do not require further logic 1166 if (hint & kmp_lock_hint_hle) 1167 return KMP_TSX_LOCK(hle); 1168 if (hint & kmp_lock_hint_rtm) 1169 return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm): __kmp_user_lock_seq; 1170 if (hint & kmp_lock_hint_adaptive) 1171 return KMP_CPUINFO_RTM ? 
KMP_TSX_LOCK(adaptive): __kmp_user_lock_seq; 1172 1173 // Rule out conflicting hints first by returning the default lock 1174 if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended)) 1175 return __kmp_user_lock_seq; 1176 if ((hint & omp_lock_hint_speculative) && (hint & omp_lock_hint_nonspeculative)) 1177 return __kmp_user_lock_seq; 1178 1179 // Do not even consider speculation when it appears to be contended 1180 if (hint & omp_lock_hint_contended) 1181 return lockseq_queuing; 1182 1183 // Uncontended lock without speculation 1184 if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative)) 1185 return lockseq_tas; 1186 1187 // HLE lock for speculation 1188 if (hint & omp_lock_hint_speculative) 1189 return KMP_TSX_LOCK(hle); 1190 1191 return __kmp_user_lock_seq; 1192 } 1193 1194 /*! 1195 @ingroup WORK_SHARING 1196 @param loc source location information. 1197 @param global_tid global thread number. 1198 @param crit identity of the critical section. This could be a pointer to a lock associated with the critical section, 1199 or some other suitably unique value. 1200 @param hint the lock hint. 1201 1202 Enter code protected by a `critical` construct with a hint. The hint value is used to suggest a lock implementation. 1203 This function blocks until the executing thread can enter the critical section unless the hint suggests use of 1204 speculative execution and the hardware supports it. 1205 */ 1206 void 1207 __kmpc_critical_with_hint( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit, uintptr_t hint ) 1208 { 1209 KMP_COUNT_BLOCK(OMP_CRITICAL); 1210 kmp_user_lock_p lck; 1211 1212 KC_TRACE( 10, ("__kmpc_critical: called T#%d\n", global_tid ) ); 1213 1214 kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit; 1215 // Check if it is initialized. 1216 if (*lk == 0) { 1217 kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint); 1218 if (KMP_IS_D_LOCK(lckseq)) { 1219 KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0, KMP_GET_D_TAG(lckseq)); 1220 } else { 1221 __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq)); 1222 } 1223 } 1224 // Branch for accessing the actual lock object and set operation. This branching is inevitable since 1225 // this lock initialization does not follow the normal dispatch path (lock table is not used). 
1226 if (KMP_EXTRACT_D_TAG(lk) != 0) { 1227 lck = (kmp_user_lock_p)lk; 1228 if (__kmp_env_consistency_check) { 1229 __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_map_hint_to_lock(hint)); 1230 } 1231 # if USE_ITT_BUILD 1232 __kmp_itt_critical_acquiring(lck); 1233 # endif 1234 # if KMP_USE_INLINED_TAS 1235 if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) { 1236 KMP_ACQUIRE_TAS_LOCK(lck, global_tid); 1237 } else 1238 # elif KMP_USE_INLINED_FUTEX 1239 if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) { 1240 KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid); 1241 } else 1242 # endif 1243 { 1244 KMP_D_LOCK_FUNC(lk, set)(lk, global_tid); 1245 } 1246 } else { 1247 kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk); 1248 lck = ilk->lock; 1249 if (__kmp_env_consistency_check) { 1250 __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_map_hint_to_lock(hint)); 1251 } 1252 # if USE_ITT_BUILD 1253 __kmp_itt_critical_acquiring(lck); 1254 # endif 1255 KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid); 1256 } 1257 1258 #if USE_ITT_BUILD 1259 __kmp_itt_critical_acquired( lck ); 1260 #endif /* USE_ITT_BUILD */ 1261 1262 KMP_PUSH_PARTITIONED_TIMER(OMP_critical); 1263 KA_TRACE( 15, ("__kmpc_critical: done T#%d\n", global_tid )); 1264 } // __kmpc_critical_with_hint 1265 1266 #endif // KMP_USE_DYNAMIC_LOCK 1267 1268 /*! 1269 @ingroup WORK_SHARING 1270 @param loc source location information. 1271 @param global_tid global thread number . 1272 @param crit identity of the critical section. This could be a pointer to a lock associated with the critical section, or 1273 some other suitably unique value. 1274 1275 Leave a critical section, releasing any lock that was held during its execution. 1276 */ 1277 void 1278 __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid, kmp_critical_name *crit) 1279 { 1280 kmp_user_lock_p lck; 1281 1282 KC_TRACE( 10, ("__kmpc_end_critical: called T#%d\n", global_tid )); 1283 1284 #if KMP_USE_DYNAMIC_LOCK 1285 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) { 1286 lck = (kmp_user_lock_p)crit; 1287 KMP_ASSERT(lck != NULL); 1288 if (__kmp_env_consistency_check) { 1289 __kmp_pop_sync(global_tid, ct_critical, loc); 1290 } 1291 # if USE_ITT_BUILD 1292 __kmp_itt_critical_releasing( lck ); 1293 # endif 1294 # if KMP_USE_INLINED_TAS 1295 if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) { 1296 KMP_RELEASE_TAS_LOCK(lck, global_tid); 1297 } else 1298 # elif KMP_USE_INLINED_FUTEX 1299 if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) { 1300 KMP_RELEASE_FUTEX_LOCK(lck, global_tid); 1301 } else 1302 # endif 1303 { 1304 KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid); 1305 } 1306 } else { 1307 kmp_indirect_lock_t *ilk = (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit)); 1308 KMP_ASSERT(ilk != NULL); 1309 lck = ilk->lock; 1310 if (__kmp_env_consistency_check) { 1311 __kmp_pop_sync(global_tid, ct_critical, loc); 1312 } 1313 # if USE_ITT_BUILD 1314 __kmp_itt_critical_releasing( lck ); 1315 # endif 1316 KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid); 1317 } 1318 1319 #else // KMP_USE_DYNAMIC_LOCK 1320 1321 if ( ( __kmp_user_lock_kind == lk_tas ) 1322 && ( sizeof( lck->tas.lk.poll ) <= OMP_CRITICAL_SIZE ) ) { 1323 lck = (kmp_user_lock_p)crit; 1324 } 1325 #if KMP_USE_FUTEX 1326 else if ( ( __kmp_user_lock_kind == lk_futex ) 1327 && ( sizeof( lck->futex.lk.poll ) <= OMP_CRITICAL_SIZE ) ) { 1328 lck = (kmp_user_lock_p)crit; 1329 } 1330 #endif 1331 else { // ticket, queuing or drdpa 1332 lck 
= (kmp_user_lock_p) TCR_PTR(*((kmp_user_lock_p *)crit)); 1333 } 1334 1335 KMP_ASSERT(lck != NULL); 1336 1337 if ( __kmp_env_consistency_check ) 1338 __kmp_pop_sync( global_tid, ct_critical, loc ); 1339 1340 #if USE_ITT_BUILD 1341 __kmp_itt_critical_releasing( lck ); 1342 #endif /* USE_ITT_BUILD */ 1343 // Value of 'crit' should be good for using as a critical_id of the critical section directive. 1344 __kmp_release_user_lock_with_checks( lck, global_tid ); 1345 1346 #if OMPT_SUPPORT && OMPT_BLAME 1347 if (ompt_enabled && 1348 ompt_callbacks.ompt_callback(ompt_event_release_critical)) { 1349 ompt_callbacks.ompt_callback(ompt_event_release_critical)( 1350 (uint64_t) lck); 1351 } 1352 #endif 1353 1354 #endif // KMP_USE_DYNAMIC_LOCK 1355 KMP_POP_PARTITIONED_TIMER(); 1356 KA_TRACE( 15, ("__kmpc_end_critical: done T#%d\n", global_tid )); 1357 } 1358 1359 /*! 1360 @ingroup SYNCHRONIZATION 1361 @param loc source location information 1362 @param global_tid thread id. 1363 @return one if the thread should execute the master block, zero otherwise 1364 1365 Start execution of a combined barrier and master. The barrier is executed inside this function. 1366 */ 1367 kmp_int32 1368 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) 1369 { 1370 int status; 1371 1372 KC_TRACE( 10, ("__kmpc_barrier_master: called T#%d\n", global_tid ) ); 1373 1374 if (! TCR_4(__kmp_init_parallel)) 1375 __kmp_parallel_initialize(); 1376 1377 if ( __kmp_env_consistency_check ) 1378 __kmp_check_barrier( global_tid, ct_barrier, loc ); 1379 1380 #if USE_ITT_NOTIFY 1381 __kmp_threads[global_tid]->th.th_ident = loc; 1382 #endif 1383 status = __kmp_barrier( bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL ); 1384 1385 return (status != 0) ? 0 : 1; 1386 } 1387 1388 /*! 1389 @ingroup SYNCHRONIZATION 1390 @param loc source location information 1391 @param global_tid thread id. 1392 1393 Complete the execution of a combined barrier and master. This function should 1394 only be called at the completion of the <tt>master</tt> code. Other threads will 1395 still be waiting at the barrier and this call releases them. 1396 */ 1397 void 1398 __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) 1399 { 1400 KC_TRACE( 10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid )); 1401 1402 __kmp_end_split_barrier ( bs_plain_barrier, global_tid ); 1403 } 1404 1405 /*! 1406 @ingroup SYNCHRONIZATION 1407 @param loc source location information 1408 @param global_tid thread id. 1409 @return one if the thread should execute the master block, zero otherwise 1410 1411 Start execution of a combined barrier and master(nowait) construct. 1412 The barrier is executed inside this function. 1413 There is no equivalent "end" function, since the 1414 */ 1415 kmp_int32 1416 __kmpc_barrier_master_nowait( ident_t * loc, kmp_int32 global_tid ) 1417 { 1418 kmp_int32 ret; 1419 1420 KC_TRACE( 10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid )); 1421 1422 if (! TCR_4(__kmp_init_parallel)) 1423 __kmp_parallel_initialize(); 1424 1425 if ( __kmp_env_consistency_check ) { 1426 if ( loc == 0 ) { 1427 KMP_WARNING( ConstructIdentInvalid ); // ??? What does it mean for the user? 
1428 } 1429 __kmp_check_barrier( global_tid, ct_barrier, loc ); 1430 } 1431 1432 #if USE_ITT_NOTIFY 1433 __kmp_threads[global_tid]->th.th_ident = loc; 1434 #endif 1435 __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL ); 1436 1437 ret = __kmpc_master (loc, global_tid); 1438 1439 if ( __kmp_env_consistency_check ) { 1440 /* there's no __kmpc_end_master called; so the (stats) */ 1441 /* actions of __kmpc_end_master are done here */ 1442 1443 if ( global_tid < 0 ) { 1444 KMP_WARNING( ThreadIdentInvalid ); 1445 } 1446 if (ret) { 1447 /* only one thread should do the pop since only */ 1448 /* one did the push (see __kmpc_master()) */ 1449 1450 __kmp_pop_sync( global_tid, ct_master, loc ); 1451 } 1452 } 1453 1454 return (ret); 1455 } 1456 1457 /* The BARRIER for a SINGLE process section is always explicit */ 1458 /*! 1459 @ingroup WORK_SHARING 1460 @param loc source location information 1461 @param global_tid global thread number 1462 @return One if this thread should execute the single construct, zero otherwise. 1463 1464 Test whether to execute a <tt>single</tt> construct. 1465 There are no implicit barriers in the two "single" calls, rather the compiler should 1466 introduce an explicit barrier if it is required. 1467 */ 1468 1469 kmp_int32 1470 __kmpc_single(ident_t *loc, kmp_int32 global_tid) 1471 { 1472 kmp_int32 rc = __kmp_enter_single( global_tid, loc, TRUE ); 1473 1474 if (rc) { 1475 // We are going to execute the single statement, so we should count it. 1476 KMP_COUNT_BLOCK(OMP_SINGLE); 1477 KMP_PUSH_PARTITIONED_TIMER(OMP_single); 1478 } 1479 1480 #if OMPT_SUPPORT && OMPT_TRACE 1481 kmp_info_t *this_thr = __kmp_threads[ global_tid ]; 1482 kmp_team_t *team = this_thr -> th.th_team; 1483 int tid = __kmp_tid_from_gtid( global_tid ); 1484 1485 if (ompt_enabled) { 1486 if (rc) { 1487 if (ompt_callbacks.ompt_callback(ompt_event_single_in_block_begin)) { 1488 ompt_callbacks.ompt_callback(ompt_event_single_in_block_begin)( 1489 team->t.ompt_team_info.parallel_id, 1490 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id, 1491 team->t.ompt_team_info.microtask); 1492 } 1493 } else { 1494 if (ompt_callbacks.ompt_callback(ompt_event_single_others_begin)) { 1495 ompt_callbacks.ompt_callback(ompt_event_single_others_begin)( 1496 team->t.ompt_team_info.parallel_id, 1497 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id); 1498 } 1499 this_thr->th.ompt_thread_info.state = ompt_state_wait_single; 1500 } 1501 } 1502 #endif 1503 1504 return rc; 1505 } 1506 1507 /*! 1508 @ingroup WORK_SHARING 1509 @param loc source location information 1510 @param global_tid global thread number 1511 1512 Mark the end of a <tt>single</tt> construct. This function should 1513 only be called by the thread that executed the block of code protected 1514 by the `single` construct. 1515 */ 1516 void 1517 __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) 1518 { 1519 __kmp_exit_single( global_tid ); 1520 KMP_POP_PARTITIONED_TIMER(); 1521 1522 #if OMPT_SUPPORT && OMPT_TRACE 1523 kmp_info_t *this_thr = __kmp_threads[ global_tid ]; 1524 kmp_team_t *team = this_thr -> th.th_team; 1525 int tid = __kmp_tid_from_gtid( global_tid ); 1526 1527 if (ompt_enabled && 1528 ompt_callbacks.ompt_callback(ompt_event_single_in_block_end)) { 1529 ompt_callbacks.ompt_callback(ompt_event_single_in_block_end)( 1530 team->t.ompt_team_info.parallel_id, 1531 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id); 1532 } 1533 #endif 1534 } 1535 1536 /*! 
1537 @ingroup WORK_SHARING 1538 @param loc Source location 1539 @param global_tid Global thread id 1540 1541 Mark the end of a statically scheduled loop. 1542 */ 1543 void 1544 __kmpc_for_static_fini( ident_t *loc, kmp_int32 global_tid ) 1545 { 1546 KE_TRACE( 10, ("__kmpc_for_static_fini called T#%d\n", global_tid)); 1547 1548 #if OMPT_SUPPORT && OMPT_TRACE 1549 if (ompt_enabled && 1550 ompt_callbacks.ompt_callback(ompt_event_loop_end)) { 1551 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 1552 ompt_task_info_t *task_info = __ompt_get_taskinfo(0); 1553 ompt_callbacks.ompt_callback(ompt_event_loop_end)( 1554 team_info->parallel_id, task_info->task_id); 1555 } 1556 #endif 1557 1558 if ( __kmp_env_consistency_check ) 1559 __kmp_pop_workshare( global_tid, ct_pdo, loc ); 1560 } 1561 1562 /* 1563 * User routines which take C-style arguments (call by value) 1564 * different from the Fortran equivalent routines 1565 */ 1566 1567 void 1568 ompc_set_num_threads( int arg ) 1569 { 1570 // !!!!! TODO: check the per-task binding 1571 __kmp_set_num_threads( arg, __kmp_entry_gtid() ); 1572 } 1573 1574 void 1575 ompc_set_dynamic( int flag ) 1576 { 1577 kmp_info_t *thread; 1578 1579 /* For the thread-private implementation of the internal controls */ 1580 thread = __kmp_entry_thread(); 1581 1582 __kmp_save_internal_controls( thread ); 1583 1584 set__dynamic( thread, flag ? TRUE : FALSE ); 1585 } 1586 1587 void 1588 ompc_set_nested( int flag ) 1589 { 1590 kmp_info_t *thread; 1591 1592 /* For the thread-private internal controls implementation */ 1593 thread = __kmp_entry_thread(); 1594 1595 __kmp_save_internal_controls( thread ); 1596 1597 set__nested( thread, flag ? TRUE : FALSE ); 1598 } 1599 1600 void 1601 ompc_set_max_active_levels( int max_active_levels ) 1602 { 1603 /* TO DO */ 1604 /* we want per-task implementation of this internal control */ 1605 1606 /* For the per-thread internal controls implementation */ 1607 __kmp_set_max_active_levels( __kmp_entry_gtid(), max_active_levels ); 1608 } 1609 1610 void 1611 ompc_set_schedule( omp_sched_t kind, int modifier ) 1612 { 1613 // !!!!! 
TODO: check the per-task binding 1614 __kmp_set_schedule( __kmp_entry_gtid(), ( kmp_sched_t ) kind, modifier ); 1615 } 1616 1617 int 1618 ompc_get_ancestor_thread_num( int level ) 1619 { 1620 return __kmp_get_ancestor_thread_num( __kmp_entry_gtid(), level ); 1621 } 1622 1623 int 1624 ompc_get_team_size( int level ) 1625 { 1626 return __kmp_get_team_size( __kmp_entry_gtid(), level ); 1627 } 1628 1629 void 1630 kmpc_set_stacksize( int arg ) 1631 { 1632 // __kmp_aux_set_stacksize initializes the library if needed 1633 __kmp_aux_set_stacksize( arg ); 1634 } 1635 1636 void 1637 kmpc_set_stacksize_s( size_t arg ) 1638 { 1639 // __kmp_aux_set_stacksize initializes the library if needed 1640 __kmp_aux_set_stacksize( arg ); 1641 } 1642 1643 void 1644 kmpc_set_blocktime( int arg ) 1645 { 1646 int gtid, tid; 1647 kmp_info_t *thread; 1648 1649 gtid = __kmp_entry_gtid(); 1650 tid = __kmp_tid_from_gtid(gtid); 1651 thread = __kmp_thread_from_gtid(gtid); 1652 1653 __kmp_aux_set_blocktime( arg, thread, tid ); 1654 } 1655 1656 void 1657 kmpc_set_library( int arg ) 1658 { 1659 // __kmp_user_set_library initializes the library if needed 1660 __kmp_user_set_library( (enum library_type)arg ); 1661 } 1662 1663 void 1664 kmpc_set_defaults( char const * str ) 1665 { 1666 // __kmp_aux_set_defaults initializes the library if needed 1667 __kmp_aux_set_defaults( str, KMP_STRLEN( str ) ); 1668 } 1669 1670 void 1671 kmpc_set_disp_num_buffers( int arg ) 1672 { 1673 // ignore after initialization because some teams have already 1674 // allocated dispatch buffers 1675 if( __kmp_init_serial == 0 && arg > 0 ) 1676 __kmp_dispatch_num_buffers = arg; 1677 } 1678 1679 int 1680 kmpc_set_affinity_mask_proc( int proc, void **mask ) 1681 { 1682 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED 1683 return -1; 1684 #else 1685 if ( ! TCR_4(__kmp_init_middle) ) { 1686 __kmp_middle_initialize(); 1687 } 1688 return __kmp_aux_set_affinity_mask_proc( proc, mask ); 1689 #endif 1690 } 1691 1692 int 1693 kmpc_unset_affinity_mask_proc( int proc, void **mask ) 1694 { 1695 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED 1696 return -1; 1697 #else 1698 if ( ! TCR_4(__kmp_init_middle) ) { 1699 __kmp_middle_initialize(); 1700 } 1701 return __kmp_aux_unset_affinity_mask_proc( proc, mask ); 1702 #endif 1703 } 1704 1705 int 1706 kmpc_get_affinity_mask_proc( int proc, void **mask ) 1707 { 1708 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED 1709 return -1; 1710 #else 1711 if ( ! TCR_4(__kmp_init_middle) ) { 1712 __kmp_middle_initialize(); 1713 } 1714 return __kmp_aux_get_affinity_mask_proc( proc, mask ); 1715 #endif 1716 } 1717 1718 1719 /* -------------------------------------------------------------------------- */ 1720 /*! 1721 @ingroup THREADPRIVATE 1722 @param loc source location information 1723 @param gtid global thread number 1724 @param cpy_size size of the cpy_data buffer 1725 @param cpy_data pointer to data to be copied 1726 @param cpy_func helper function to call for copying data 1727 @param didit flag variable: 1=single thread; 0=not single thread 1728 1729 __kmpc_copyprivate implements the interface for the private data broadcast needed for 1730 the copyprivate clause associated with a single region in an OpenMP<sup>*</sup> program (both C and Fortran). 1731 All threads participating in the parallel region call this routine. 1732 One of the threads (called the single thread) should have the <tt>didit</tt> variable set to 1 1733 and all other threads should have that variable set to 0. 
1734 All threads pass a pointer to a data buffer (cpy_data) that they have built. 1735 1736 The OpenMP specification forbids the use of nowait on the single region when a copyprivate 1737 clause is present. However, @ref __kmpc_copyprivate implements a barrier internally to avoid 1738 race conditions, so the code generation for the single region should avoid generating a barrier 1739 after the call to @ref __kmpc_copyprivate. 1740 1741 The <tt>gtid</tt> parameter is the global thread id for the current thread. 1742 The <tt>loc</tt> parameter is a pointer to source location information. 1743 1744 Internal implementation: The single thread will first copy its descriptor address (cpy_data) 1745 to a team-private location, then the other threads will each call the function pointed to by 1746 the parameter cpy_func, which carries out the copy by copying the data using the cpy_data buffer. 1747 1748 The cpy_func routine used for the copy and the contents of the data area defined by cpy_data 1749 and cpy_size may be built in any fashion that will allow the copy to be done. For instance, 1750 the cpy_data buffer can hold the actual data to be copied or it may hold a list of pointers 1751 to the data. The cpy_func routine must interpret the cpy_data buffer appropriately. 1752 1753 The interface to cpy_func is as follows: 1754 @code 1755 void cpy_func( void *destination, void *source ) 1756 @endcode 1757 where void *destination is the cpy_data pointer for the thread being copied to 1758 and void *source is the cpy_data pointer for the thread being copied from. 1759 */ 1760 void 1761 __kmpc_copyprivate( ident_t *loc, kmp_int32 gtid, size_t cpy_size, void *cpy_data, void(*cpy_func)(void*,void*), kmp_int32 didit ) 1762 { 1763 void **data_ptr; 1764 1765 KC_TRACE( 10, ("__kmpc_copyprivate: called T#%d\n", gtid )); 1766 1767 KMP_MB(); 1768 1769 data_ptr = & __kmp_team_from_gtid( gtid )->t.t_copypriv_data; 1770 1771 if ( __kmp_env_consistency_check ) { 1772 if ( loc == 0 ) { 1773 KMP_WARNING( ConstructIdentInvalid ); 1774 } 1775 } 1776 1777 /* ToDo: Optimize the following two barriers into some kind of split barrier */ 1778 1779 if (didit) *data_ptr = cpy_data; 1780 1781 /* This barrier is not a barrier region boundary */ 1782 #if USE_ITT_NOTIFY 1783 __kmp_threads[gtid]->th.th_ident = loc; 1784 #endif 1785 __kmp_barrier( bs_plain_barrier, gtid, FALSE , 0, NULL, NULL ); 1786 1787 if (! didit) (*cpy_func)( cpy_data, *data_ptr ); 1788 1789 /* Consider next barrier the user-visible barrier for barrier region boundaries */ 1790 /* Nesting checks are already handled by the single construct checks */ 1791 1792 #if USE_ITT_NOTIFY 1793 __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g. 
tasks can overwrite the location) 1794 #endif 1795 __kmp_barrier( bs_plain_barrier, gtid, FALSE , 0, NULL, NULL ); 1796 } 1797 1798 /* -------------------------------------------------------------------------- */ 1799 1800 #define INIT_LOCK __kmp_init_user_lock_with_checks 1801 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks 1802 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks 1803 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed 1804 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks 1805 #define ACQUIRE_NESTED_LOCK_TIMED __kmp_acquire_nested_user_lock_with_checks_timed 1806 #define RELEASE_LOCK __kmp_release_user_lock_with_checks 1807 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks 1808 #define TEST_LOCK __kmp_test_user_lock_with_checks 1809 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks 1810 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks 1811 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks 1812 1813 1814 /* 1815 * TODO: Make check abort messages use location info & pass it 1816 * into with_checks routines 1817 */ 1818 1819 #if KMP_USE_DYNAMIC_LOCK 1820 1821 // internal lock initializer 1822 static __forceinline void 1823 __kmp_init_lock_with_hint(ident_t *loc, void **lock, kmp_dyna_lockseq_t seq) 1824 { 1825 if (KMP_IS_D_LOCK(seq)) { 1826 KMP_INIT_D_LOCK(lock, seq); 1827 #if USE_ITT_BUILD 1828 __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL); 1829 #endif 1830 } else { 1831 KMP_INIT_I_LOCK(lock, seq); 1832 #if USE_ITT_BUILD 1833 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock); 1834 __kmp_itt_lock_creating(ilk->lock, loc); 1835 #endif 1836 } 1837 } 1838 1839 // internal nest lock initializer 1840 static __forceinline void 1841 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock, kmp_dyna_lockseq_t seq) 1842 { 1843 #if KMP_USE_TSX 1844 // Don't have nested lock implementation for speculative locks 1845 if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive) 1846 seq = __kmp_user_lock_seq; 1847 #endif 1848 switch (seq) { 1849 case lockseq_tas: 1850 seq = lockseq_nested_tas; 1851 break; 1852 #if KMP_USE_FUTEX 1853 case lockseq_futex: 1854 seq = lockseq_nested_futex; 1855 break; 1856 #endif 1857 case lockseq_ticket: 1858 seq = lockseq_nested_ticket; 1859 break; 1860 case lockseq_queuing: 1861 seq = lockseq_nested_queuing; 1862 break; 1863 case lockseq_drdpa: 1864 seq = lockseq_nested_drdpa; 1865 break; 1866 default: 1867 seq = lockseq_nested_queuing; 1868 } 1869 KMP_INIT_I_LOCK(lock, seq); 1870 #if USE_ITT_BUILD 1871 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock); 1872 __kmp_itt_lock_creating(ilk->lock, loc); 1873 #endif 1874 } 1875 1876 /* initialize the lock with a hint */ 1877 void 1878 __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock, uintptr_t hint) 1879 { 1880 KMP_DEBUG_ASSERT(__kmp_init_serial); 1881 if (__kmp_env_consistency_check && user_lock == NULL) { 1882 KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint"); 1883 } 1884 1885 __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint)); 1886 } 1887 1888 /* initialize the lock with a hint */ 1889 void 1890 __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock, uintptr_t hint) 1891 { 1892 KMP_DEBUG_ASSERT(__kmp_init_serial); 1893 if (__kmp_env_consistency_check && user_lock == NULL) { 1894 KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint"); 1895 } 1896 1897 
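    // Translate the user-supplied hint into an internal lock sequence and initialize the
    // nested lock in place; hints with no nested implementation fall back to the default
    // user lock sequence (see __kmp_init_nest_lock_with_hint above).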
__kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint)); 1898 } 1899 1900 #endif // KMP_USE_DYNAMIC_LOCK 1901 1902 /* initialize the lock */ 1903 void 1904 __kmpc_init_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) { 1905 #if KMP_USE_DYNAMIC_LOCK 1906 KMP_DEBUG_ASSERT(__kmp_init_serial); 1907 if (__kmp_env_consistency_check && user_lock == NULL) { 1908 KMP_FATAL(LockIsUninitialized, "omp_init_lock"); 1909 } 1910 __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq); 1911 1912 #else // KMP_USE_DYNAMIC_LOCK 1913 1914 static char const * const func = "omp_init_lock"; 1915 kmp_user_lock_p lck; 1916 KMP_DEBUG_ASSERT( __kmp_init_serial ); 1917 1918 if ( __kmp_env_consistency_check ) { 1919 if ( user_lock == NULL ) { 1920 KMP_FATAL( LockIsUninitialized, func ); 1921 } 1922 } 1923 1924 KMP_CHECK_USER_LOCK_INIT(); 1925 1926 if ( ( __kmp_user_lock_kind == lk_tas ) 1927 && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) { 1928 lck = (kmp_user_lock_p)user_lock; 1929 } 1930 #if KMP_USE_FUTEX 1931 else if ( ( __kmp_user_lock_kind == lk_futex ) 1932 && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) { 1933 lck = (kmp_user_lock_p)user_lock; 1934 } 1935 #endif 1936 else { 1937 lck = __kmp_user_lock_allocate( user_lock, gtid, 0 ); 1938 } 1939 INIT_LOCK( lck ); 1940 __kmp_set_user_lock_location( lck, loc ); 1941 1942 #if OMPT_SUPPORT && OMPT_TRACE 1943 if (ompt_enabled && 1944 ompt_callbacks.ompt_callback(ompt_event_init_lock)) { 1945 ompt_callbacks.ompt_callback(ompt_event_init_lock)((uint64_t) lck); 1946 } 1947 #endif 1948 1949 #if USE_ITT_BUILD 1950 __kmp_itt_lock_creating( lck ); 1951 #endif /* USE_ITT_BUILD */ 1952 1953 #endif // KMP_USE_DYNAMIC_LOCK 1954 } // __kmpc_init_lock 1955 1956 /* initialize the lock */ 1957 void 1958 __kmpc_init_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) { 1959 #if KMP_USE_DYNAMIC_LOCK 1960 1961 KMP_DEBUG_ASSERT(__kmp_init_serial); 1962 if (__kmp_env_consistency_check && user_lock == NULL) { 1963 KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock"); 1964 } 1965 __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq); 1966 1967 #else // KMP_USE_DYNAMIC_LOCK 1968 1969 static char const * const func = "omp_init_nest_lock"; 1970 kmp_user_lock_p lck; 1971 KMP_DEBUG_ASSERT( __kmp_init_serial ); 1972 1973 if ( __kmp_env_consistency_check ) { 1974 if ( user_lock == NULL ) { 1975 KMP_FATAL( LockIsUninitialized, func ); 1976 } 1977 } 1978 1979 KMP_CHECK_USER_LOCK_INIT(); 1980 1981 if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll ) 1982 + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) { 1983 lck = (kmp_user_lock_p)user_lock; 1984 } 1985 #if KMP_USE_FUTEX 1986 else if ( ( __kmp_user_lock_kind == lk_futex ) 1987 && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked ) 1988 <= OMP_NEST_LOCK_T_SIZE ) ) { 1989 lck = (kmp_user_lock_p)user_lock; 1990 } 1991 #endif 1992 else { 1993 lck = __kmp_user_lock_allocate( user_lock, gtid, 0 ); 1994 } 1995 1996 INIT_NESTED_LOCK( lck ); 1997 __kmp_set_user_lock_location( lck, loc ); 1998 1999 #if OMPT_SUPPORT && OMPT_TRACE 2000 if (ompt_enabled && 2001 ompt_callbacks.ompt_callback(ompt_event_init_nest_lock)) { 2002 ompt_callbacks.ompt_callback(ompt_event_init_nest_lock)((uint64_t) lck); 2003 } 2004 #endif 2005 2006 #if USE_ITT_BUILD 2007 __kmp_itt_lock_creating( lck ); 2008 #endif /* USE_ITT_BUILD */ 2009 2010 #endif // KMP_USE_DYNAMIC_LOCK 2011 } // __kmpc_init_nest_lock 2012 2013 void 2014 __kmpc_destroy_lock( ident_t * 
loc, kmp_int32 gtid, void ** user_lock ) { 2015 #if KMP_USE_DYNAMIC_LOCK 2016 2017 # if USE_ITT_BUILD 2018 kmp_user_lock_p lck; 2019 if (KMP_EXTRACT_D_TAG(user_lock) == 0) { 2020 lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock; 2021 } else { 2022 lck = (kmp_user_lock_p)user_lock; 2023 } 2024 __kmp_itt_lock_destroyed(lck); 2025 # endif 2026 KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock); 2027 #else 2028 kmp_user_lock_p lck; 2029 2030 if ( ( __kmp_user_lock_kind == lk_tas ) 2031 && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) { 2032 lck = (kmp_user_lock_p)user_lock; 2033 } 2034 #if KMP_USE_FUTEX 2035 else if ( ( __kmp_user_lock_kind == lk_futex ) 2036 && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) { 2037 lck = (kmp_user_lock_p)user_lock; 2038 } 2039 #endif 2040 else { 2041 lck = __kmp_lookup_user_lock( user_lock, "omp_destroy_lock" ); 2042 } 2043 2044 #if OMPT_SUPPORT && OMPT_TRACE 2045 if (ompt_enabled && 2046 ompt_callbacks.ompt_callback(ompt_event_destroy_lock)) { 2047 ompt_callbacks.ompt_callback(ompt_event_destroy_lock)((uint64_t) lck); 2048 } 2049 #endif 2050 2051 #if USE_ITT_BUILD 2052 __kmp_itt_lock_destroyed( lck ); 2053 #endif /* USE_ITT_BUILD */ 2054 DESTROY_LOCK( lck ); 2055 2056 if ( ( __kmp_user_lock_kind == lk_tas ) 2057 && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) { 2058 ; 2059 } 2060 #if KMP_USE_FUTEX 2061 else if ( ( __kmp_user_lock_kind == lk_futex ) 2062 && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) { 2063 ; 2064 } 2065 #endif 2066 else { 2067 __kmp_user_lock_free( user_lock, gtid, lck ); 2068 } 2069 #endif // KMP_USE_DYNAMIC_LOCK 2070 } // __kmpc_destroy_lock 2071 2072 /* destroy the lock */ 2073 void 2074 __kmpc_destroy_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) { 2075 #if KMP_USE_DYNAMIC_LOCK 2076 2077 # if USE_ITT_BUILD 2078 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock); 2079 __kmp_itt_lock_destroyed(ilk->lock); 2080 # endif 2081 KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock); 2082 2083 #else // KMP_USE_DYNAMIC_LOCK 2084 2085 kmp_user_lock_p lck; 2086 2087 if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll ) 2088 + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) { 2089 lck = (kmp_user_lock_p)user_lock; 2090 } 2091 #if KMP_USE_FUTEX 2092 else if ( ( __kmp_user_lock_kind == lk_futex ) 2093 && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked ) 2094 <= OMP_NEST_LOCK_T_SIZE ) ) { 2095 lck = (kmp_user_lock_p)user_lock; 2096 } 2097 #endif 2098 else { 2099 lck = __kmp_lookup_user_lock( user_lock, "omp_destroy_nest_lock" ); 2100 } 2101 2102 #if OMPT_SUPPORT && OMPT_TRACE 2103 if (ompt_enabled && 2104 ompt_callbacks.ompt_callback(ompt_event_destroy_nest_lock)) { 2105 ompt_callbacks.ompt_callback(ompt_event_destroy_nest_lock)((uint64_t) lck); 2106 } 2107 #endif 2108 2109 #if USE_ITT_BUILD 2110 __kmp_itt_lock_destroyed( lck ); 2111 #endif /* USE_ITT_BUILD */ 2112 2113 DESTROY_NESTED_LOCK( lck ); 2114 2115 if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll ) 2116 + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) { 2117 ; 2118 } 2119 #if KMP_USE_FUTEX 2120 else if ( ( __kmp_user_lock_kind == lk_futex ) 2121 && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked ) 2122 <= OMP_NEST_LOCK_T_SIZE ) ) { 2123 ; 2124 } 2125 #endif 2126 else { 2127 __kmp_user_lock_free( user_lock, gtid, lck ); 2128 } 2129 #endif // KMP_USE_DYNAMIC_LOCK 2130 } // 
__kmpc_destroy_nest_lock 2131 2132 void 2133 __kmpc_set_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) { 2134 KMP_COUNT_BLOCK(OMP_set_lock); 2135 #if KMP_USE_DYNAMIC_LOCK 2136 int tag = KMP_EXTRACT_D_TAG(user_lock); 2137 # if USE_ITT_BUILD 2138 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); // itt function will get to the right lock object. 2139 # endif 2140 # if KMP_USE_INLINED_TAS 2141 if (tag == locktag_tas && !__kmp_env_consistency_check) { 2142 KMP_ACQUIRE_TAS_LOCK(user_lock, gtid); 2143 } else 2144 # elif KMP_USE_INLINED_FUTEX 2145 if (tag == locktag_futex && !__kmp_env_consistency_check) { 2146 KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid); 2147 } else 2148 # endif 2149 { 2150 __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid); 2151 } 2152 # if USE_ITT_BUILD 2153 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 2154 # endif 2155 2156 #else // KMP_USE_DYNAMIC_LOCK 2157 2158 kmp_user_lock_p lck; 2159 2160 if ( ( __kmp_user_lock_kind == lk_tas ) 2161 && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) { 2162 lck = (kmp_user_lock_p)user_lock; 2163 } 2164 #if KMP_USE_FUTEX 2165 else if ( ( __kmp_user_lock_kind == lk_futex ) 2166 && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) { 2167 lck = (kmp_user_lock_p)user_lock; 2168 } 2169 #endif 2170 else { 2171 lck = __kmp_lookup_user_lock( user_lock, "omp_set_lock" ); 2172 } 2173 2174 #if USE_ITT_BUILD 2175 __kmp_itt_lock_acquiring( lck ); 2176 #endif /* USE_ITT_BUILD */ 2177 2178 ACQUIRE_LOCK( lck, gtid ); 2179 2180 #if USE_ITT_BUILD 2181 __kmp_itt_lock_acquired( lck ); 2182 #endif /* USE_ITT_BUILD */ 2183 2184 #if OMPT_SUPPORT && OMPT_TRACE 2185 if (ompt_enabled && 2186 ompt_callbacks.ompt_callback(ompt_event_acquired_lock)) { 2187 ompt_callbacks.ompt_callback(ompt_event_acquired_lock)((uint64_t) lck); 2188 } 2189 #endif 2190 2191 #endif // KMP_USE_DYNAMIC_LOCK 2192 } 2193 2194 void 2195 __kmpc_set_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) { 2196 #if KMP_USE_DYNAMIC_LOCK 2197 2198 # if USE_ITT_BUILD 2199 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); 2200 # endif 2201 KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid); 2202 # if USE_ITT_BUILD 2203 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 2204 #endif 2205 2206 #if OMPT_SUPPORT && OMPT_TRACE 2207 if (ompt_enabled) { 2208 // missing support here: need to know whether acquired first or not 2209 } 2210 #endif 2211 2212 #else // KMP_USE_DYNAMIC_LOCK 2213 int acquire_status; 2214 kmp_user_lock_p lck; 2215 2216 if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll ) 2217 + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) { 2218 lck = (kmp_user_lock_p)user_lock; 2219 } 2220 #if KMP_USE_FUTEX 2221 else if ( ( __kmp_user_lock_kind == lk_futex ) 2222 && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked ) 2223 <= OMP_NEST_LOCK_T_SIZE ) ) { 2224 lck = (kmp_user_lock_p)user_lock; 2225 } 2226 #endif 2227 else { 2228 lck = __kmp_lookup_user_lock( user_lock, "omp_set_nest_lock" ); 2229 } 2230 2231 #if USE_ITT_BUILD 2232 __kmp_itt_lock_acquiring( lck ); 2233 #endif /* USE_ITT_BUILD */ 2234 2235 ACQUIRE_NESTED_LOCK( lck, gtid, &acquire_status ); 2236 2237 #if USE_ITT_BUILD 2238 __kmp_itt_lock_acquired( lck ); 2239 #endif /* USE_ITT_BUILD */ 2240 2241 #if OMPT_SUPPORT && OMPT_TRACE 2242 if (ompt_enabled) { 2243 if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) { 2244 if(ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_first)) 2245 
ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_first)((uint64_t) lck); 2246 } else { 2247 if(ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_next)) 2248 ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_next)((uint64_t) lck); 2249 } 2250 } 2251 #endif 2252 2253 #endif // KMP_USE_DYNAMIC_LOCK 2254 } 2255 2256 void 2257 __kmpc_unset_lock( ident_t *loc, kmp_int32 gtid, void **user_lock ) 2258 { 2259 #if KMP_USE_DYNAMIC_LOCK 2260 2261 int tag = KMP_EXTRACT_D_TAG(user_lock); 2262 # if USE_ITT_BUILD 2263 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2264 # endif 2265 # if KMP_USE_INLINED_TAS 2266 if (tag == locktag_tas && !__kmp_env_consistency_check) { 2267 KMP_RELEASE_TAS_LOCK(user_lock, gtid); 2268 } else 2269 # elif KMP_USE_INLINED_FUTEX 2270 if (tag == locktag_futex && !__kmp_env_consistency_check) { 2271 KMP_RELEASE_FUTEX_LOCK(user_lock, gtid); 2272 } else 2273 # endif 2274 { 2275 __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid); 2276 } 2277 2278 #else // KMP_USE_DYNAMIC_LOCK 2279 2280 kmp_user_lock_p lck; 2281 2282 /* Can't use serial interval since not block structured */ 2283 /* release the lock */ 2284 2285 if ( ( __kmp_user_lock_kind == lk_tas ) 2286 && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) { 2287 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 2288 // "fast" path implemented to fix customer performance issue 2289 #if USE_ITT_BUILD 2290 __kmp_itt_lock_releasing( (kmp_user_lock_p)user_lock ); 2291 #endif /* USE_ITT_BUILD */ 2292 TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0); 2293 KMP_MB(); 2294 return; 2295 #else 2296 lck = (kmp_user_lock_p)user_lock; 2297 #endif 2298 } 2299 #if KMP_USE_FUTEX 2300 else if ( ( __kmp_user_lock_kind == lk_futex ) 2301 && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) { 2302 lck = (kmp_user_lock_p)user_lock; 2303 } 2304 #endif 2305 else { 2306 lck = __kmp_lookup_user_lock( user_lock, "omp_unset_lock" ); 2307 } 2308 2309 #if USE_ITT_BUILD 2310 __kmp_itt_lock_releasing( lck ); 2311 #endif /* USE_ITT_BUILD */ 2312 2313 RELEASE_LOCK( lck, gtid ); 2314 2315 #if OMPT_SUPPORT && OMPT_BLAME 2316 if (ompt_enabled && 2317 ompt_callbacks.ompt_callback(ompt_event_release_lock)) { 2318 ompt_callbacks.ompt_callback(ompt_event_release_lock)((uint64_t) lck); 2319 } 2320 #endif 2321 2322 #endif // KMP_USE_DYNAMIC_LOCK 2323 } 2324 2325 /* release the lock */ 2326 void 2327 __kmpc_unset_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock ) 2328 { 2329 #if KMP_USE_DYNAMIC_LOCK 2330 2331 # if USE_ITT_BUILD 2332 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2333 # endif 2334 KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid); 2335 2336 #else // KMP_USE_DYNAMIC_LOCK 2337 2338 kmp_user_lock_p lck; 2339 2340 /* Can't use serial interval since not block structured */ 2341 2342 if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll ) 2343 + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) { 2344 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 2345 // "fast" path implemented to fix customer performance issue 2346 kmp_tas_lock_t *tl = (kmp_tas_lock_t*)user_lock; 2347 #if USE_ITT_BUILD 2348 __kmp_itt_lock_releasing( (kmp_user_lock_p)user_lock ); 2349 #endif /* USE_ITT_BUILD */ 2350 if ( --(tl->lk.depth_locked) == 0 ) { 2351 TCW_4(tl->lk.poll, 0); 2352 } 2353 KMP_MB(); 2354 return; 2355 #else 2356 lck = (kmp_user_lock_p)user_lock; 2357 #endif 2358 } 2359 #if 
KMP_USE_FUTEX 2360 else if ( ( __kmp_user_lock_kind == lk_futex ) 2361 && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked ) 2362 <= OMP_NEST_LOCK_T_SIZE ) ) { 2363 lck = (kmp_user_lock_p)user_lock; 2364 } 2365 #endif 2366 else { 2367 lck = __kmp_lookup_user_lock( user_lock, "omp_unset_nest_lock" ); 2368 } 2369 2370 #if USE_ITT_BUILD 2371 __kmp_itt_lock_releasing( lck ); 2372 #endif /* USE_ITT_BUILD */ 2373 2374 int release_status; 2375 release_status = RELEASE_NESTED_LOCK( lck, gtid ); 2376 #if OMPT_SUPPORT && OMPT_BLAME 2377 if (ompt_enabled) { 2378 if (release_status == KMP_LOCK_RELEASED) { 2379 if (ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_last)) { 2380 ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_last)( 2381 (uint64_t) lck); 2382 } 2383 } else if (ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_prev)) { 2384 ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_prev)( 2385 (uint64_t) lck); 2386 } 2387 } 2388 #endif 2389 2390 #endif // KMP_USE_DYNAMIC_LOCK 2391 } 2392 2393 /* try to acquire the lock */ 2394 int 2395 __kmpc_test_lock( ident_t *loc, kmp_int32 gtid, void **user_lock ) 2396 { 2397 KMP_COUNT_BLOCK(OMP_test_lock); 2398 2399 #if KMP_USE_DYNAMIC_LOCK 2400 int rc; 2401 int tag = KMP_EXTRACT_D_TAG(user_lock); 2402 # if USE_ITT_BUILD 2403 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); 2404 # endif 2405 # if KMP_USE_INLINED_TAS 2406 if (tag == locktag_tas && !__kmp_env_consistency_check) { 2407 KMP_TEST_TAS_LOCK(user_lock, gtid, rc); 2408 } else 2409 # elif KMP_USE_INLINED_FUTEX 2410 if (tag == locktag_futex && !__kmp_env_consistency_check) { 2411 KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc); 2412 } else 2413 # endif 2414 { 2415 rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid); 2416 } 2417 if (rc) { 2418 # if USE_ITT_BUILD 2419 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 2420 # endif 2421 return FTN_TRUE; 2422 } else { 2423 # if USE_ITT_BUILD 2424 __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock); 2425 # endif 2426 return FTN_FALSE; 2427 } 2428 2429 #else // KMP_USE_DYNAMIC_LOCK 2430 2431 kmp_user_lock_p lck; 2432 int rc; 2433 2434 if ( ( __kmp_user_lock_kind == lk_tas ) 2435 && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) { 2436 lck = (kmp_user_lock_p)user_lock; 2437 } 2438 #if KMP_USE_FUTEX 2439 else if ( ( __kmp_user_lock_kind == lk_futex ) 2440 && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) { 2441 lck = (kmp_user_lock_p)user_lock; 2442 } 2443 #endif 2444 else { 2445 lck = __kmp_lookup_user_lock( user_lock, "omp_test_lock" ); 2446 } 2447 2448 #if USE_ITT_BUILD 2449 __kmp_itt_lock_acquiring( lck ); 2450 #endif /* USE_ITT_BUILD */ 2451 2452 rc = TEST_LOCK( lck, gtid ); 2453 #if USE_ITT_BUILD 2454 if ( rc ) { 2455 __kmp_itt_lock_acquired( lck ); 2456 } else { 2457 __kmp_itt_lock_cancelled( lck ); 2458 } 2459 #endif /* USE_ITT_BUILD */ 2460 return ( rc ? 
FTN_TRUE : FTN_FALSE ); 2461 2462 /* Can't use serial interval since not block structured */ 2463 2464 #endif // KMP_USE_DYNAMIC_LOCK 2465 } 2466 2467 /* try to acquire the lock */ 2468 int 2469 __kmpc_test_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock ) 2470 { 2471 #if KMP_USE_DYNAMIC_LOCK 2472 int rc; 2473 # if USE_ITT_BUILD 2474 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); 2475 # endif 2476 rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid); 2477 # if USE_ITT_BUILD 2478 if (rc) { 2479 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 2480 } else { 2481 __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock); 2482 } 2483 # endif 2484 return rc; 2485 2486 #else // KMP_USE_DYNAMIC_LOCK 2487 2488 kmp_user_lock_p lck; 2489 int rc; 2490 2491 if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll ) 2492 + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) { 2493 lck = (kmp_user_lock_p)user_lock; 2494 } 2495 #if KMP_USE_FUTEX 2496 else if ( ( __kmp_user_lock_kind == lk_futex ) 2497 && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked ) 2498 <= OMP_NEST_LOCK_T_SIZE ) ) { 2499 lck = (kmp_user_lock_p)user_lock; 2500 } 2501 #endif 2502 else { 2503 lck = __kmp_lookup_user_lock( user_lock, "omp_test_nest_lock" ); 2504 } 2505 2506 #if USE_ITT_BUILD 2507 __kmp_itt_lock_acquiring( lck ); 2508 #endif /* USE_ITT_BUILD */ 2509 2510 rc = TEST_NESTED_LOCK( lck, gtid ); 2511 #if USE_ITT_BUILD 2512 if ( rc ) { 2513 __kmp_itt_lock_acquired( lck ); 2514 } else { 2515 __kmp_itt_lock_cancelled( lck ); 2516 } 2517 #endif /* USE_ITT_BUILD */ 2518 return rc; 2519 2520 /* Can't use serial interval since not block structured */ 2521 2522 #endif // KMP_USE_DYNAMIC_LOCK 2523 } 2524 2525 2526 /*--------------------------------------------------------------------------------------------------------------------*/ 2527 2528 /* 2529 * Interface to fast scalable reduce methods routines 2530 */ 2531 2532 // keep the selected method in a thread local structure for cross-function usage: will be used in __kmpc_end_reduce* functions; 2533 // another solution: to re-determine the method one more time in __kmpc_end_reduce* functions (new prototype required then) 2534 // AT: which solution is better? 2535 #define __KMP_SET_REDUCTION_METHOD(gtid,rmethod) \ 2536 ( ( __kmp_threads[ ( gtid ) ] -> th.th_local.packed_reduction_method ) = ( rmethod ) ) 2537 2538 #define __KMP_GET_REDUCTION_METHOD(gtid) \ 2539 ( __kmp_threads[ ( gtid ) ] -> th.th_local.packed_reduction_method ) 2540 2541 // description of the packed_reduction_method variable: look at the macros in kmp.h 2542 2543 2544 // used in a critical section reduce block 2545 static __forceinline void 2546 __kmp_enter_critical_section_reduce_block( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit ) { 2547 2548 // this lock was visible to a customer and to the threading profile tool as a serial overhead span 2549 // (although it's used for an internal purpose only) 2550 // why was it visible in previous implementation? 2551 // should we keep it visible in new reduce block? 2552 kmp_user_lock_p lck; 2553 2554 #if KMP_USE_DYNAMIC_LOCK 2555 2556 kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit; 2557 // Check if it is initialized. 
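    // A zero lock word means the lock embedded in the critical name has not been initialized
    // yet: the first thread to arrive installs either a direct-lock tag or a pointer to an
    // indirect lock object via compare-and-swap; later threads reuse whatever was installed.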
2558 if (*lk == 0) { 2559 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) { 2560 KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0, KMP_GET_D_TAG(__kmp_user_lock_seq)); 2561 } else { 2562 __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(__kmp_user_lock_seq)); 2563 } 2564 } 2565 // Branch for accessing the actual lock object and set operation. This branching is inevitable since 2566 // this lock initialization does not follow the normal dispatch path (lock table is not used). 2567 if (KMP_EXTRACT_D_TAG(lk) != 0) { 2568 lck = (kmp_user_lock_p)lk; 2569 KMP_DEBUG_ASSERT(lck != NULL); 2570 if (__kmp_env_consistency_check) { 2571 __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq); 2572 } 2573 KMP_D_LOCK_FUNC(lk, set)(lk, global_tid); 2574 } else { 2575 kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk); 2576 lck = ilk->lock; 2577 KMP_DEBUG_ASSERT(lck != NULL); 2578 if (__kmp_env_consistency_check) { 2579 __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq); 2580 } 2581 KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid); 2582 } 2583 2584 #else // KMP_USE_DYNAMIC_LOCK 2585 2586 // We know that the fast reduction code is only emitted by Intel compilers 2587 // with 32 byte critical sections. If there isn't enough space, then we 2588 // have to use a pointer. 2589 if ( __kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE ) { 2590 lck = (kmp_user_lock_p)crit; 2591 } 2592 else { 2593 lck = __kmp_get_critical_section_ptr( crit, loc, global_tid ); 2594 } 2595 KMP_DEBUG_ASSERT( lck != NULL ); 2596 2597 if ( __kmp_env_consistency_check ) 2598 __kmp_push_sync( global_tid, ct_critical, loc, lck ); 2599 2600 __kmp_acquire_user_lock_with_checks( lck, global_tid ); 2601 2602 #endif // KMP_USE_DYNAMIC_LOCK 2603 } 2604 2605 // used in a critical section reduce block 2606 static __forceinline void 2607 __kmp_end_critical_section_reduce_block( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit ) { 2608 2609 kmp_user_lock_p lck; 2610 2611 #if KMP_USE_DYNAMIC_LOCK 2612 2613 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) { 2614 lck = (kmp_user_lock_p)crit; 2615 if (__kmp_env_consistency_check) 2616 __kmp_pop_sync(global_tid, ct_critical, loc); 2617 KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid); 2618 } else { 2619 kmp_indirect_lock_t *ilk = (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit)); 2620 if (__kmp_env_consistency_check) 2621 __kmp_pop_sync(global_tid, ct_critical, loc); 2622 KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid); 2623 } 2624 2625 #else // KMP_USE_DYNAMIC_LOCK 2626 2627 // We know that the fast reduction code is only emitted by Intel compilers with 32 byte critical 2628 // sections. If there isn't enough space, then we have to use a pointer. 2629 if ( __kmp_base_user_lock_size > 32 ) { 2630 lck = *( (kmp_user_lock_p *) crit ); 2631 KMP_ASSERT( lck != NULL ); 2632 } else { 2633 lck = (kmp_user_lock_p) crit; 2634 } 2635 2636 if ( __kmp_env_consistency_check ) 2637 __kmp_pop_sync( global_tid, ct_critical, loc ); 2638 2639 __kmp_release_user_lock_with_checks( lck, global_tid ); 2640 2641 #endif // KMP_USE_DYNAMIC_LOCK 2642 } // __kmp_end_critical_section_reduce_block 2643 2644 2645 /* 2.a.i. Reduce Block without a terminating barrier */ 2646 /*! 
2647 @ingroup SYNCHRONIZATION 2648 @param loc source location information 2649 @param global_tid global thread number 2650 @param num_vars number of items (variables) to be reduced 2651 @param reduce_size size of data in bytes to be reduced 2652 @param reduce_data pointer to data to be reduced 2653 @param reduce_func callback function providing reduction operation on two operands and returning result of reduction in lhs_data 2654 @param lck pointer to the unique lock data structure 2655 @result 1 for the master thread, 0 for all other team threads, 2 for all team threads if atomic reduction needed 2656 2657 The nowait version is used for a reduce clause with the nowait argument. 2658 */ 2659 kmp_int32 2660 __kmpc_reduce_nowait( 2661 ident_t *loc, kmp_int32 global_tid, 2662 kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data), 2663 kmp_critical_name *lck ) { 2664 2665 KMP_COUNT_BLOCK(REDUCE_nowait); 2666 int retval = 0; 2667 PACKED_REDUCTION_METHOD_T packed_reduction_method; 2668 #if OMP_40_ENABLED 2669 kmp_team_t *team; 2670 kmp_info_t *th; 2671 int teams_swapped = 0, task_state; 2672 #endif 2673 KA_TRACE( 10, ( "__kmpc_reduce_nowait() enter: called T#%d\n", global_tid ) ); 2674 2675 // why do we need this initialization here at all? 2676 // Reduction clause can not be used as a stand-alone directive. 2677 2678 // do not call __kmp_serial_initialize(), it will be called by __kmp_parallel_initialize() if needed 2679 // possible detection of false-positive race by the threadchecker ??? 2680 if( ! TCR_4( __kmp_init_parallel ) ) 2681 __kmp_parallel_initialize(); 2682 2683 // check correctness of reduce block nesting 2684 #if KMP_USE_DYNAMIC_LOCK 2685 if ( __kmp_env_consistency_check ) 2686 __kmp_push_sync( global_tid, ct_reduce, loc, NULL, 0 ); 2687 #else 2688 if ( __kmp_env_consistency_check ) 2689 __kmp_push_sync( global_tid, ct_reduce, loc, NULL ); 2690 #endif 2691 2692 #if OMP_40_ENABLED 2693 th = __kmp_thread_from_gtid(global_tid); 2694 if( th->th.th_teams_microtask ) { // AC: check if we are inside the teams construct? 2695 team = th->th.th_team; 2696 if( team->t.t_level == th->th.th_teams_level ) { 2697 // this is reduction at teams construct 2698 KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0 2699 // Let's swap teams temporarily for the reduction barrier 2700 teams_swapped = 1; 2701 th->th.th_info.ds.ds_tid = team->t.t_master_tid; 2702 th->th.th_team = team->t.t_parent; 2703 th->th.th_team_nproc = th->th.th_team->t.t_nproc; 2704 th->th.th_task_team = th->th.th_team->t.t_task_team[0]; 2705 task_state = th->th.th_task_state; 2706 th->th.th_task_state = 0; 2707 } 2708 } 2709 #endif // OMP_40_ENABLED 2710 2711 // packed_reduction_method value will be reused by __kmp_end_reduce* function, the value should be kept in a variable 2712 // the variable should be either a construct-specific or thread-specific property, not a team specific property 2713 // (a thread can reach the next reduce block on the next construct, reduce method may differ on the next construct) 2714 // an ident_t "loc" parameter could be used as a construct-specific property (what if loc == 0?) 
2715 // (if both construct-specific and team-specific variables were shared, then unness extra syncs should be needed) 2716 // a thread-specific variable is better regarding two issues above (next construct and extra syncs) 2717 // a thread-specific "th_local.reduction_method" variable is used currently 2718 // each thread executes 'determine' and 'set' lines (no need to execute by one thread, to avoid unness extra syncs) 2719 2720 packed_reduction_method = __kmp_determine_reduction_method( loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck ); 2721 __KMP_SET_REDUCTION_METHOD( global_tid, packed_reduction_method ); 2722 2723 if( packed_reduction_method == critical_reduce_block ) { 2724 2725 __kmp_enter_critical_section_reduce_block( loc, global_tid, lck ); 2726 retval = 1; 2727 2728 } else if( packed_reduction_method == empty_reduce_block ) { 2729 2730 // usage: if team size == 1, no synchronization is required ( Intel platforms only ) 2731 retval = 1; 2732 2733 } else if( packed_reduction_method == atomic_reduce_block ) { 2734 2735 retval = 2; 2736 2737 // all threads should do this pop here (because __kmpc_end_reduce_nowait() won't be called by the code gen) 2738 // (it's not quite good, because the checking block has been closed by this 'pop', 2739 // but atomic operation has not been executed yet, will be executed slightly later, literally on next instruction) 2740 if ( __kmp_env_consistency_check ) 2741 __kmp_pop_sync( global_tid, ct_reduce, loc ); 2742 2743 } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) { 2744 2745 //AT: performance issue: a real barrier here 2746 //AT: (if master goes slow, other threads are blocked here waiting for the master to come and release them) 2747 //AT: (it's not what a customer might expect specifying NOWAIT clause) 2748 //AT: (specifying NOWAIT won't result in improvement of performance, it'll be confusing to a customer) 2749 //AT: another implementation of *barrier_gather*nowait() (or some other design) might go faster 2750 // and be more in line with sense of NOWAIT 2751 //AT: TO DO: do epcc test and compare times 2752 2753 // this barrier should be invisible to a customer and to the threading profile tool 2754 // (it's neither a terminating barrier nor customer's code, it's used for an internal purpose) 2755 #if USE_ITT_NOTIFY 2756 __kmp_threads[global_tid]->th.th_ident = loc; 2757 #endif 2758 retval = __kmp_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid, FALSE, reduce_size, reduce_data, reduce_func ); 2759 retval = ( retval != 0 ) ? ( 0 ) : ( 1 ); 2760 2761 // all other workers except master should do this pop here 2762 // ( none of other workers will get to __kmpc_end_reduce_nowait() ) 2763 if ( __kmp_env_consistency_check ) { 2764 if( retval == 0 ) { 2765 __kmp_pop_sync( global_tid, ct_reduce, loc ); 2766 } 2767 } 2768 2769 } else { 2770 2771 // should never reach this block 2772 KMP_ASSERT( 0 ); // "unexpected method" 2773 2774 } 2775 #if OMP_40_ENABLED 2776 if( teams_swapped ) { 2777 // Restore thread structure 2778 th->th.th_info.ds.ds_tid = 0; 2779 th->th.th_team = team; 2780 th->th.th_team_nproc = team->t.t_nproc; 2781 th->th.th_task_team = team->t.t_task_team[task_state]; 2782 th->th.th_task_state = task_state; 2783 } 2784 #endif 2785 KA_TRACE( 10, ( "__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n", global_tid, packed_reduction_method, retval ) ); 2786 2787 return retval; 2788 } 2789 2790 /*! 
2791 @ingroup SYNCHRONIZATION 2792 @param loc source location information 2793 @param global_tid global thread id. 2794 @param lck pointer to the unique lock data structure 2795 2796 Finish the execution of a reduce nowait. 2797 */ 2798 void 2799 __kmpc_end_reduce_nowait( ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck ) { 2800 2801 PACKED_REDUCTION_METHOD_T packed_reduction_method; 2802 2803 KA_TRACE( 10, ( "__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid ) ); 2804 2805 packed_reduction_method = __KMP_GET_REDUCTION_METHOD( global_tid ); 2806 2807 if( packed_reduction_method == critical_reduce_block ) { 2808 2809 __kmp_end_critical_section_reduce_block( loc, global_tid, lck ); 2810 2811 } else if( packed_reduction_method == empty_reduce_block ) { 2812 2813 // usage: if team size == 1, no synchronization is required ( on Intel platforms only ) 2814 2815 } else if( packed_reduction_method == atomic_reduce_block ) { 2816 2817 // neither master nor other workers should get here 2818 // (code gen does not generate this call in case 2: atomic reduce block) 2819 // actually it's better to remove this elseif at all; 2820 // after removal this value will checked by the 'else' and will assert 2821 2822 } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) { 2823 2824 // only master gets here 2825 2826 } else { 2827 2828 // should never reach this block 2829 KMP_ASSERT( 0 ); // "unexpected method" 2830 2831 } 2832 2833 if ( __kmp_env_consistency_check ) 2834 __kmp_pop_sync( global_tid, ct_reduce, loc ); 2835 2836 KA_TRACE( 10, ( "__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n", global_tid, packed_reduction_method ) ); 2837 2838 return; 2839 } 2840 2841 /* 2.a.ii. Reduce Block with a terminating barrier */ 2842 2843 /*! 2844 @ingroup SYNCHRONIZATION 2845 @param loc source location information 2846 @param global_tid global thread number 2847 @param num_vars number of items (variables) to be reduced 2848 @param reduce_size size of data in bytes to be reduced 2849 @param reduce_data pointer to data to be reduced 2850 @param reduce_func callback function providing reduction operation on two operands and returning result of reduction in lhs_data 2851 @param lck pointer to the unique lock data structure 2852 @result 1 for the master thread, 0 for all other team threads, 2 for all team threads if atomic reduction needed 2853 2854 A blocking reduce that includes an implicit barrier. 2855 */ 2856 kmp_int32 2857 __kmpc_reduce( 2858 ident_t *loc, kmp_int32 global_tid, 2859 kmp_int32 num_vars, size_t reduce_size, void *reduce_data, 2860 void (*reduce_func)(void *lhs_data, void *rhs_data), 2861 kmp_critical_name *lck ) 2862 { 2863 KMP_COUNT_BLOCK(REDUCE_wait); 2864 int retval = 0; 2865 PACKED_REDUCTION_METHOD_T packed_reduction_method; 2866 2867 KA_TRACE( 10, ( "__kmpc_reduce() enter: called T#%d\n", global_tid ) ); 2868 2869 // why do we need this initialization here at all? 2870 // Reduction clause can not be a stand-alone directive. 2871 2872 // do not call __kmp_serial_initialize(), it will be called by __kmp_parallel_initialize() if needed 2873 // possible detection of false-positive race by the threadchecker ??? 2874 if( ! 
TCR_4( __kmp_init_parallel ) ) 2875 __kmp_parallel_initialize(); 2876 2877 // check correctness of reduce block nesting 2878 #if KMP_USE_DYNAMIC_LOCK 2879 if ( __kmp_env_consistency_check ) 2880 __kmp_push_sync( global_tid, ct_reduce, loc, NULL, 0 ); 2881 #else 2882 if ( __kmp_env_consistency_check ) 2883 __kmp_push_sync( global_tid, ct_reduce, loc, NULL ); 2884 #endif 2885 2886 packed_reduction_method = __kmp_determine_reduction_method( loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck ); 2887 __KMP_SET_REDUCTION_METHOD( global_tid, packed_reduction_method ); 2888 2889 if( packed_reduction_method == critical_reduce_block ) { 2890 2891 __kmp_enter_critical_section_reduce_block( loc, global_tid, lck ); 2892 retval = 1; 2893 2894 } else if( packed_reduction_method == empty_reduce_block ) { 2895 2896 // usage: if team size == 1, no synchronization is required ( Intel platforms only ) 2897 retval = 1; 2898 2899 } else if( packed_reduction_method == atomic_reduce_block ) { 2900 2901 retval = 2; 2902 2903 } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) { 2904 2905 //case tree_reduce_block: 2906 // this barrier should be visible to a customer and to the threading profile tool 2907 // (it's a terminating barrier on constructs if NOWAIT not specified) 2908 #if USE_ITT_NOTIFY 2909 __kmp_threads[global_tid]->th.th_ident = loc; // needed for correct notification of frames 2910 #endif 2911 retval = __kmp_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid, TRUE, reduce_size, reduce_data, reduce_func ); 2912 retval = ( retval != 0 ) ? ( 0 ) : ( 1 ); 2913 2914 // all other workers except master should do this pop here 2915 // ( none of other workers except master will enter __kmpc_end_reduce() ) 2916 if ( __kmp_env_consistency_check ) { 2917 if( retval == 0 ) { // 0: all other workers; 1: master 2918 __kmp_pop_sync( global_tid, ct_reduce, loc ); 2919 } 2920 } 2921 2922 } else { 2923 2924 // should never reach this block 2925 KMP_ASSERT( 0 ); // "unexpected method" 2926 2927 } 2928 2929 KA_TRACE( 10, ( "__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n", global_tid, packed_reduction_method, retval ) ); 2930 2931 return retval; 2932 } 2933 2934 /*! 2935 @ingroup SYNCHRONIZATION 2936 @param loc source location information 2937 @param global_tid global thread id. 2938 @param lck pointer to the unique lock data structure 2939 2940 Finish the execution of a blocking reduce. 2941 The <tt>lck</tt> pointer must be the same as that used in the corresponding start function. 
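Purely as an illustration (not a specification of the code generation scheme; the names
<tt>reduce_fn</tt>, <tt>reduce_partial</tt>, <tt>global_sum</tt> and <tt>crit_name</tt> are hypothetical),
a caller typically dispatches on the value returned by @ref __kmpc_reduce and finishes the
construct through this routine:
@code
typedef struct { double sum; } red_t;

static kmp_critical_name crit_name;   // zero-initialized, shared by the team
static double global_sum;             // the reduction target

static void reduce_fn( void *lhs, void *rhs )
{
    ((red_t *)lhs)->sum += ((red_t *)rhs)->sum;   // combine rhs partial result into lhs
}

static void reduce_partial( ident_t *loc, kmp_int32 gtid, red_t *my_part )
{
    switch( __kmpc_reduce( loc, gtid, 1, sizeof(*my_part), my_part, reduce_fn, &crit_name ) ) {
        case 1:   // this thread combines its partial result and closes the reduce block
            global_sum += my_part->sum;
            __kmpc_end_reduce( loc, gtid, &crit_name );
            break;
        case 2:   // atomic method selected: update global_sum atomically instead
            break;
        default:  // other team threads
            break;
    }
}
@endcode
Whether the <tt>2</tt> and <tt>0</tt> paths also reach this routine is decided by the code
generator; the sketch only shows the path that the return value documentation above guarantees.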
2942 */ 2943 void 2944 __kmpc_end_reduce( ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck ) { 2945 2946 PACKED_REDUCTION_METHOD_T packed_reduction_method; 2947 2948 KA_TRACE( 10, ( "__kmpc_end_reduce() enter: called T#%d\n", global_tid ) ); 2949 2950 packed_reduction_method = __KMP_GET_REDUCTION_METHOD( global_tid ); 2951 2952 // this barrier should be visible to a customer and to the threading profile tool 2953 // (it's a terminating barrier on constructs if NOWAIT not specified) 2954 2955 if( packed_reduction_method == critical_reduce_block ) { 2956 2957 __kmp_end_critical_section_reduce_block( loc, global_tid, lck ); 2958 2959 // TODO: implicit barrier: should be exposed 2960 #if USE_ITT_NOTIFY 2961 __kmp_threads[global_tid]->th.th_ident = loc; 2962 #endif 2963 __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL ); 2964 2965 } else if( packed_reduction_method == empty_reduce_block ) { 2966 2967 // usage: if team size == 1, no synchronization is required ( Intel platforms only ) 2968 2969 // TODO: implicit barrier: should be exposed 2970 #if USE_ITT_NOTIFY 2971 __kmp_threads[global_tid]->th.th_ident = loc; 2972 #endif 2973 __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL ); 2974 2975 } else if( packed_reduction_method == atomic_reduce_block ) { 2976 2977 // TODO: implicit barrier: should be exposed 2978 #if USE_ITT_NOTIFY 2979 __kmp_threads[global_tid]->th.th_ident = loc; 2980 #endif 2981 __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL ); 2982 2983 } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) { 2984 2985 // only master executes here (master releases all other workers) 2986 __kmp_end_split_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid ); 2987 2988 } else { 2989 2990 // should never reach this block 2991 KMP_ASSERT( 0 ); // "unexpected method" 2992 2993 } 2994 2995 if ( __kmp_env_consistency_check ) 2996 __kmp_pop_sync( global_tid, ct_reduce, loc ); 2997 2998 KA_TRACE( 10, ( "__kmpc_end_reduce() exit: called T#%d: method %08x\n", global_tid, packed_reduction_method ) ); 2999 3000 return; 3001 } 3002 3003 #undef __KMP_GET_REDUCTION_METHOD 3004 #undef __KMP_SET_REDUCTION_METHOD 3005 3006 /*-- end of interface to fast scalable reduce routines ---------------------------------------------------------------*/ 3007 3008 kmp_uint64 3009 __kmpc_get_taskid() { 3010 3011 kmp_int32 gtid; 3012 kmp_info_t * thread; 3013 3014 gtid = __kmp_get_gtid(); 3015 if ( gtid < 0 ) { 3016 return 0; 3017 }; // if 3018 thread = __kmp_thread_from_gtid( gtid ); 3019 return thread->th.th_current_task->td_task_id; 3020 3021 } // __kmpc_get_taskid 3022 3023 3024 kmp_uint64 3025 __kmpc_get_parent_taskid() { 3026 3027 kmp_int32 gtid; 3028 kmp_info_t * thread; 3029 kmp_taskdata_t * parent_task; 3030 3031 gtid = __kmp_get_gtid(); 3032 if ( gtid < 0 ) { 3033 return 0; 3034 }; // if 3035 thread = __kmp_thread_from_gtid( gtid ); 3036 parent_task = thread->th.th_current_task->td_parent; 3037 return ( parent_task == NULL ? 0 : parent_task->td_task_id ); 3038 3039 } // __kmpc_get_parent_taskid 3040 3041 #if OMP_45_ENABLED 3042 /*! 3043 @ingroup WORK_SHARING 3044 @param loc source location information. 3045 @param gtid global thread number. 3046 @param num_dims number of associated doacross loops. 3047 @param dims info on loops bounds. 3048 3049 Initialize doacross loop information. 3050 Expect compiler send us inclusive bounds, 3051 e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2. 
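As an illustration only (assuming <tt>struct kmp_dim</tt> exposes the <tt>lo</tt>/<tt>up</tt>/<tt>st</tt>
members used by the implementation, and with <tt>loc</tt> and <tt>gtid</tt> as placeholders), the
example loop above would be described to the runtime as:
@code
struct kmp_dim dim;
dim.lo = 2;   // first iteration value
dim.up = 8;   // last iteration value (inclusive)
dim.st = 2;   // loop increment
__kmpc_doacross_init( loc, gtid, 1, &dim );
// ... per-iteration __kmpc_doacross_wait()/__kmpc_doacross_post() calls ...
__kmpc_doacross_fini( loc, gtid );
@endcode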
3052 */ 3053 void 3054 __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims, struct kmp_dim * dims) 3055 { 3056 int j, idx; 3057 kmp_int64 last, trace_count; 3058 kmp_info_t *th = __kmp_threads[gtid]; 3059 kmp_team_t *team = th->th.th_team; 3060 kmp_uint32 *flags; 3061 kmp_disp_t *pr_buf = th->th.th_dispatch; 3062 dispatch_shared_info_t *sh_buf; 3063 3064 KA_TRACE(20,("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n", 3065 gtid, num_dims, !team->t.t_serialized)); 3066 KMP_DEBUG_ASSERT(dims != NULL); 3067 KMP_DEBUG_ASSERT(num_dims > 0); 3068 3069 if( team->t.t_serialized ) { 3070 KA_TRACE(20,("__kmpc_doacross_init() exit: serialized team\n")); 3071 return; // no dependencies if team is serialized 3072 } 3073 KMP_DEBUG_ASSERT(team->t.t_nproc > 1); 3074 idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for the next loop 3075 sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers]; 3076 3077 // Save bounds info into allocated private buffer 3078 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL); 3079 pr_buf->th_doacross_info = 3080 (kmp_int64*)__kmp_thread_malloc(th, sizeof(kmp_int64)*(4 * num_dims + 1)); 3081 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL); 3082 pr_buf->th_doacross_info[0] = (kmp_int64)num_dims; // first element is number of dimensions 3083 // Save also address of num_done in order to access it later without knowing the buffer index 3084 pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done; 3085 pr_buf->th_doacross_info[2] = dims[0].lo; 3086 pr_buf->th_doacross_info[3] = dims[0].up; 3087 pr_buf->th_doacross_info[4] = dims[0].st; 3088 last = 5; 3089 for( j = 1; j < num_dims; ++j ) { 3090 kmp_int64 range_length; // To keep ranges of all dimensions but the first dims[0] 3091 if( dims[j].st == 1 ) { // most common case 3092 // AC: should we care of ranges bigger than LLONG_MAX? (not for now) 3093 range_length = dims[j].up - dims[j].lo + 1; 3094 } else { 3095 if( dims[j].st > 0 ) { 3096 KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo); 3097 range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1; 3098 } else { // negative increment 3099 KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up); 3100 range_length = (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1; 3101 } 3102 } 3103 pr_buf->th_doacross_info[last++] = range_length; 3104 pr_buf->th_doacross_info[last++] = dims[j].lo; 3105 pr_buf->th_doacross_info[last++] = dims[j].up; 3106 pr_buf->th_doacross_info[last++] = dims[j].st; 3107 } 3108 3109 // Compute total trip count. 3110 // Start with range of dims[0] which we don't need to keep in the buffer. 
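    // With inclusive bounds the iteration count is up - lo + 1 for a unit stride,
    // (up - lo) / st + 1 for a positive stride, and (lo - up) / (-st) + 1 for a negative one.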
3111 if( dims[0].st == 1 ) { // most common case 3112 trace_count = dims[0].up - dims[0].lo + 1; 3113 } else if( dims[0].st > 0 ) { 3114 KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo); 3115 trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1; 3116 } else { // negative increment 3117 KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up); 3118 trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1; 3119 } 3120 for( j = 1; j < num_dims; ++j ) { 3121 trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges 3122 } 3123 KMP_DEBUG_ASSERT(trace_count > 0); 3124 3125 // Check if shared buffer is not occupied by other loop (idx - __kmp_dispatch_num_buffers) 3126 if( idx != sh_buf->doacross_buf_idx ) { 3127 // Shared buffer is occupied, wait for it to be free 3128 __kmp_wait_yield_4( (kmp_uint32*)&sh_buf->doacross_buf_idx, idx, __kmp_eq_4, NULL ); 3129 } 3130 // Check if we are the first thread. After the CAS the first thread gets 0, 3131 // others get 1 if initialization is in progress, allocated pointer otherwise. 3132 flags = (kmp_uint32*)KMP_COMPARE_AND_STORE_RET64( 3133 (kmp_int64*)&sh_buf->doacross_flags,NULL,(kmp_int64)1); 3134 if( flags == NULL ) { 3135 // we are the first thread, allocate the array of flags 3136 kmp_int64 size = trace_count / 8 + 8; // in bytes, use single bit per iteration 3137 sh_buf->doacross_flags = (kmp_uint32*)__kmp_thread_calloc(th, size, 1); 3138 } else if( (kmp_int64)flags == 1 ) { 3139 // initialization is still in progress, need to wait 3140 while( (volatile kmp_int64)sh_buf->doacross_flags == 1 ) { 3141 KMP_YIELD(TRUE); 3142 } 3143 } 3144 KMP_DEBUG_ASSERT((kmp_int64)sh_buf->doacross_flags > 1); // check value of pointer 3145 pr_buf->th_doacross_flags = sh_buf->doacross_flags; // save private copy in order to not 3146 // touch shared buffer on each iteration 3147 KA_TRACE(20,("__kmpc_doacross_init() exit: T#%d\n", gtid)); 3148 } 3149 3150 void 3151 __kmpc_doacross_wait(ident_t *loc, int gtid, long long *vec) 3152 { 3153 kmp_int32 shft, num_dims, i; 3154 kmp_uint32 flag; 3155 kmp_int64 iter_number; // iteration number of "collapsed" loop nest 3156 kmp_info_t *th = __kmp_threads[gtid]; 3157 kmp_team_t *team = th->th.th_team; 3158 kmp_disp_t *pr_buf; 3159 kmp_int64 lo, up, st; 3160 3161 KA_TRACE(20,("__kmpc_doacross_wait() enter: called T#%d\n", gtid)); 3162 if( team->t.t_serialized ) { 3163 KA_TRACE(20,("__kmpc_doacross_wait() exit: serialized team\n")); 3164 return; // no dependencies if team is serialized 3165 } 3166 3167 // calculate sequential iteration number and check out-of-bounds condition 3168 pr_buf = th->th.th_dispatch; 3169 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL); 3170 num_dims = pr_buf->th_doacross_info[0]; 3171 lo = pr_buf->th_doacross_info[2]; 3172 up = pr_buf->th_doacross_info[3]; 3173 st = pr_buf->th_doacross_info[4]; 3174 if( st == 1 ) { // most common case 3175 if( vec[0] < lo || vec[0] > up ) { 3176 KA_TRACE(20,( 3177 "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n", 3178 gtid, vec[0], lo, up)); 3179 return; 3180 } 3181 iter_number = vec[0] - lo; 3182 } else if( st > 0 ) { 3183 if( vec[0] < lo || vec[0] > up ) { 3184 KA_TRACE(20,( 3185 "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n", 3186 gtid, vec[0], lo, up)); 3187 return; 3188 } 3189 iter_number = (kmp_uint64)(vec[0] - lo) / st; 3190 } else { // negative increment 3191 if( vec[0] > lo || vec[0] < up ) { 3192 KA_TRACE(20,( 3193 "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds 
[%lld,%lld]\n", 3194 gtid, vec[0], lo, up)); 3195 return; 3196 } 3197 iter_number = (kmp_uint64)(lo - vec[0]) / (-st); 3198 } 3199 for( i = 1; i < num_dims; ++i ) { 3200 kmp_int64 iter, ln; 3201 kmp_int32 j = i * 4; 3202 ln = pr_buf->th_doacross_info[j + 1]; 3203 lo = pr_buf->th_doacross_info[j + 2]; 3204 up = pr_buf->th_doacross_info[j + 3]; 3205 st = pr_buf->th_doacross_info[j + 4]; 3206 if( st == 1 ) { 3207 if( vec[i] < lo || vec[i] > up ) { 3208 KA_TRACE(20,( 3209 "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n", 3210 gtid, vec[i], lo, up)); 3211 return; 3212 } 3213 iter = vec[i] - lo; 3214 } else if( st > 0 ) { 3215 if( vec[i] < lo || vec[i] > up ) { 3216 KA_TRACE(20,( 3217 "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n", 3218 gtid, vec[i], lo, up)); 3219 return; 3220 } 3221 iter = (kmp_uint64)(vec[i] - lo) / st; 3222 } else { // st < 0 3223 if( vec[i] > lo || vec[i] < up ) { 3224 KA_TRACE(20,( 3225 "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n", 3226 gtid, vec[i], lo, up)); 3227 return; 3228 } 3229 iter = (kmp_uint64)(lo - vec[i]) / (-st); 3230 } 3231 iter_number = iter + ln * iter_number; 3232 } 3233 shft = iter_number % 32; // use 32-bit granularity 3234 iter_number >>= 5; // divided by 32 3235 flag = 1 << shft; 3236 while( (flag & pr_buf->th_doacross_flags[iter_number]) == 0 ) { 3237 KMP_YIELD(TRUE); 3238 } 3239 KA_TRACE(20,("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n", 3240 gtid, (iter_number<<5)+shft)); 3241 } 3242 3243 void 3244 __kmpc_doacross_post(ident_t *loc, int gtid, long long *vec) 3245 { 3246 kmp_int32 shft, num_dims, i; 3247 kmp_uint32 flag; 3248 kmp_int64 iter_number; // iteration number of "collapsed" loop nest 3249 kmp_info_t *th = __kmp_threads[gtid]; 3250 kmp_team_t *team = th->th.th_team; 3251 kmp_disp_t *pr_buf; 3252 kmp_int64 lo, st; 3253 3254 KA_TRACE(20,("__kmpc_doacross_post() enter: called T#%d\n", gtid)); 3255 if( team->t.t_serialized ) { 3256 KA_TRACE(20,("__kmpc_doacross_post() exit: serialized team\n")); 3257 return; // no dependencies if team is serialized 3258 } 3259 3260 // calculate sequential iteration number (same as in "wait" but no out-of-bounds checks) 3261 pr_buf = th->th.th_dispatch; 3262 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL); 3263 num_dims = pr_buf->th_doacross_info[0]; 3264 lo = pr_buf->th_doacross_info[2]; 3265 st = pr_buf->th_doacross_info[4]; 3266 if( st == 1 ) { // most common case 3267 iter_number = vec[0] - lo; 3268 } else if( st > 0 ) { 3269 iter_number = (kmp_uint64)(vec[0] - lo) / st; 3270 } else { // negative increment 3271 iter_number = (kmp_uint64)(lo - vec[0]) / (-st); 3272 } 3273 for( i = 1; i < num_dims; ++i ) { 3274 kmp_int64 iter, ln; 3275 kmp_int32 j = i * 4; 3276 ln = pr_buf->th_doacross_info[j + 1]; 3277 lo = pr_buf->th_doacross_info[j + 2]; 3278 st = pr_buf->th_doacross_info[j + 4]; 3279 if( st == 1 ) { 3280 iter = vec[i] - lo; 3281 } else if( st > 0 ) { 3282 iter = (kmp_uint64)(vec[i] - lo) / st; 3283 } else { // st < 0 3284 iter = (kmp_uint64)(lo - vec[i]) / (-st); 3285 } 3286 iter_number = iter + ln * iter_number; 3287 } 3288 shft = iter_number % 32; // use 32-bit granularity 3289 iter_number >>= 5; // divided by 32 3290 flag = 1 << shft; 3291 if( (flag & pr_buf->th_doacross_flags[iter_number]) == 0 ) 3292 KMP_TEST_THEN_OR32( (kmp_int32*)&pr_buf->th_doacross_flags[iter_number], (kmp_int32)flag ); 3293 KA_TRACE(20,("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", 3294 gtid, 
(iter_number<<5)+shft)); 3295 } 3296 3297 void 3298 __kmpc_doacross_fini(ident_t *loc, int gtid) 3299 { 3300 kmp_int64 num_done; 3301 kmp_info_t *th = __kmp_threads[gtid]; 3302 kmp_team_t *team = th->th.th_team; 3303 kmp_disp_t *pr_buf = th->th.th_dispatch; 3304 3305 KA_TRACE(20,("__kmpc_doacross_fini() enter: called T#%d\n", gtid)); 3306 if( team->t.t_serialized ) { 3307 KA_TRACE(20,("__kmpc_doacross_fini() exit: serialized team %p\n", team)); 3308 return; // nothing to do 3309 } 3310 num_done = KMP_TEST_THEN_INC64((kmp_int64*)pr_buf->th_doacross_info[1]) + 1; 3311 if( num_done == th->th.th_team_nproc ) { 3312 // we are the last thread, need to free shared resources 3313 int idx = pr_buf->th_doacross_buf_idx - 1; 3314 dispatch_shared_info_t *sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers]; 3315 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] == (kmp_int64)&sh_buf->doacross_num_done); 3316 KMP_DEBUG_ASSERT(num_done == (kmp_int64)sh_buf->doacross_num_done); 3317 KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx); 3318 __kmp_thread_free(th, (void*)sh_buf->doacross_flags); 3319 sh_buf->doacross_flags = NULL; 3320 sh_buf->doacross_num_done = 0; 3321 sh_buf->doacross_buf_idx += __kmp_dispatch_num_buffers; // free buffer for future re-use 3322 } 3323 // free private resources (need to keep buffer index forever) 3324 __kmp_thread_free(th, (void*)pr_buf->th_doacross_info); 3325 pr_buf->th_doacross_info = NULL; 3326 KA_TRACE(20,("__kmpc_doacross_fini() exit: T#%d\n", gtid)); 3327 } 3328 #endif 3329 3330 // end of file // 3331 3332