/*
 * kmp_csupport.cpp -- kfront linkage support for OpenMP.
 */

//===----------------------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//

#include "omp.h" /* extern "C" declarations of user-visible routines */
#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_lock.h"
#include "kmp_stats.h"

#if OMPT_SUPPORT
#include "ompt-internal.h"
#include "ompt-specific.h"
#endif

#define MAX_MESSAGE 512

// flags will be used in future, e.g. to implement openmp_strict library
// restrictions

/*!
 * @ingroup STARTUP_SHUTDOWN
 * @param loc   in   source location information
 * @param flags in   for future use (currently ignored)
 *
 * Initialize the runtime library. This call is optional; if it is not made then
 * it will be implicitly called by attempts to use other library functions.
 */
void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
  // By default __kmpc_begin() is no-op.
  char *env;
  if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
      __kmp_str_match_true(env)) {
    __kmp_middle_initialize();
    KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
  } else if (__kmp_ignore_mppbeg() == FALSE) {
    // By default __kmp_ignore_mppbeg() returns TRUE.
    __kmp_internal_begin();
    KC_TRACE(10, ("__kmpc_begin: called\n"));
  }
}

/*!
 * @ingroup STARTUP_SHUTDOWN
 * @param loc source location information
 *
 * Shutdown the runtime library. This is also optional, and even if called will
 * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
 * zero.
 */
void __kmpc_end(ident_t *loc) {
  // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
  // call a no-op. However, this can be overridden with the KMP_IGNORE_MPPEND
  // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
  // returns FALSE and __kmpc_end() will unregister this root (it can cause
  // library shut down).
  if (__kmp_ignore_mppend() == FALSE) {
    KC_TRACE(10, ("__kmpc_end: called\n"));
    KA_TRACE(30, ("__kmpc_end\n"));

    __kmp_internal_end_thread(-1);
  }
}

/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return The global thread index of the active thread.

This function can be called in any context.

If the runtime has only been entered at the outermost level from a
single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
that which would be returned by omp_get_thread_num() in the outermost
active parallel construct. (Or zero if there is no active parallel
construct, since the master thread is necessarily thread zero).

If multiple non-OpenMP threads all enter an OpenMP construct then this
will be a unique thread identifier among all the threads created by
the OpenMP runtime (but the value cannot be defined in terms of
OpenMP thread ids returned by omp_get_thread_num()).
*/
kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
  kmp_int32 gtid = __kmp_entry_gtid();

  KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));

  return gtid;
}

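/* Illustrative usage sketch only (not compiled, not part of the runtime): how
   a non-OpenMP host thread might bracket its use of the library, per the
   notes above. The ident_t initializer and the function name are hypothetical
   placeholders.

     static ident_t my_loc = {0, KMP_IDENT_KMPC, 0, 0, ";unknown;unknown;0;0;;"};

     void host_thread_entry(void) {
       __kmpc_begin(&my_loc, 0); // optional; implied by other entry points
       kmp_int32 gtid = __kmpc_global_thread_num(&my_loc);
       // gtid is unique across all threads known to the runtime, but it is
       // not an omp_get_thread_num() value.
       __kmpc_end(&my_loc); // a no-op unless KMP_IGNORE_MPPEND=0
     }
*/
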
/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return The number of threads under control of the OpenMP<sup>*</sup> runtime

This function can be called in any context.
It returns the total number of threads under the control of the OpenMP runtime.
That is not a number that can be determined by any OpenMP standard calls, since
the library may be called from more than one non-OpenMP thread, and this
reflects the total over all such calls. Similarly the runtime maintains
underlying threads even when they are not active (since the cost of creating
and destroying OS threads is high), so this call counts all such threads even if
they are not waiting for work.
*/
kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
  KC_TRACE(10,
           ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));

  return TCR_4(__kmp_all_nth);
}

/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return The thread number of the calling thread in the innermost active parallel
construct.
*/
kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
  KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
  return __kmp_tid_from_gtid(__kmp_entry_gtid());
}

/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return The number of threads in the innermost active parallel construct.
*/
kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
  KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));

  return __kmp_entry_thread()->th.th_team->t.t_nproc;
}

/*!
 * @ingroup DEPRECATED
 * @param loc location description
 *
 * This function need not be called. It always returns TRUE.
 */
kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
#ifndef KMP_DEBUG

  return TRUE;

#else

  const char *semi2;
  const char *semi3;
  int line_no;

  if (__kmp_par_range == 0) {
    return TRUE;
  }
  semi2 = loc->psource;
  if (semi2 == NULL) {
    return TRUE;
  }
  semi2 = strchr(semi2, ';');
  if (semi2 == NULL) {
    return TRUE;
  }
  semi2 = strchr(semi2 + 1, ';');
  if (semi2 == NULL) {
    return TRUE;
  }
  if (__kmp_par_range_filename[0]) {
    const char *name = semi2 - 1;
    while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
      name--;
    }
    if ((*name == '/') || (*name == ';')) {
      name++;
    }
    if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
      return __kmp_par_range < 0;
    }
  }
  semi3 = strchr(semi2 + 1, ';');
  if (__kmp_par_range_routine[0]) {
    if ((semi3 != NULL) && (semi3 > semi2) &&
        (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
      return __kmp_par_range < 0;
    }
  }
  if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
    if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
      return __kmp_par_range > 0;
    }
    return __kmp_par_range < 0;
  }
  return TRUE;

#endif /* KMP_DEBUG */
}

/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return 1 if this thread is executing inside an active parallel region, zero if
not.
*/
kmp_int32 __kmpc_in_parallel(ident_t *loc) {
  return __kmp_entry_thread()->th.th_root->r.r_active;
}

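/* Illustrative sketch only (placeholder names, simplified arguments): the
   entry point documented below is how a compiler typically honors a
   num_threads clause, by pushing the request just before the fork it applies
   to, e.g. for "#pragma omp parallel num_threads(4)":

     kmp_int32 gtid = __kmpc_global_thread_num(&my_loc);
     __kmpc_push_num_threads(&my_loc, gtid, 4);
     __kmpc_fork_call(&my_loc, 0, (kmpc_micro)outlined_fn);
*/
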
/*!
@ingroup PARALLEL
@param loc source location information
@param global_tid global thread number
@param num_threads number of threads requested for this parallel construct

Set the number of threads to be used by the next fork spawned by this thread.
This call is only required if the parallel construct has a `num_threads` clause.
*/
void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
                             kmp_int32 num_threads) {
  KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
                global_tid, num_threads));

  __kmp_push_num_threads(loc, global_tid, num_threads);
}

void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
  KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));

  /* the num_threads are automatically popped */
}

#if OMP_40_ENABLED

void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
                           kmp_int32 proc_bind) {
  KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
                proc_bind));

  __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
}

#endif /* OMP_40_ENABLED */

/*!
@ingroup PARALLEL
@param loc source location information
@param argc total number of arguments in the ellipsis
@param microtask pointer to callback routine consisting of outlined parallel
construct
@param ... pointers to shared variables that aren't global

Do the actual fork and call the microtask in the relevant number of threads.
*/
void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
  int gtid = __kmp_entry_gtid();

#if (KMP_STATS_ENABLED)
  int inParallel = __kmpc_in_parallel(loc);
  if (inParallel) {
    KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
  } else {
    KMP_COUNT_BLOCK(OMP_PARALLEL);
  }
#endif

  // maybe to save thr_state is enough here
  {
    va_list ap;
    va_start(ap, microtask);

#if OMPT_SUPPORT
    ompt_frame_t *ompt_frame;
    if (ompt_enabled) {
      kmp_info_t *master_th = __kmp_threads[gtid];
      kmp_team_t *parent_team = master_th->th.th_team;
      ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
      if (lwt)
        ompt_frame = &(lwt->ompt_task_info.frame);
      else {
        int tid = __kmp_tid_from_gtid(gtid);
        ompt_frame = &(
            parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
      }
      ompt_frame->reenter_runtime_frame = __builtin_frame_address(1);
    }
#endif

#if INCLUDE_SSC_MARKS
    SSC_MARK_FORKING();
#endif
    __kmp_fork_call(loc, gtid, fork_context_intel, argc,
#if OMPT_SUPPORT
                    VOLATILE_CAST(void *) microtask, // "unwrapped" task
#endif
                    VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
                    VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
/* TODO: revert workaround for Intel(R) 64 tracker #96 */
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
                    &ap
#else
                    ap
#endif
                    );
#if INCLUDE_SSC_MARKS
    SSC_MARK_JOINING();
#endif
    __kmp_join_call(loc, gtid
#if OMPT_SUPPORT
                    ,
                    fork_context_intel
#endif
                    );

    va_end(ap);
  }
}

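/* Illustrative sketch only (names are placeholders; real compiler output
   differs in detail): for

     int x = 0;
     #pragma omp parallel shared(x)
     { body(&x); }

   a compiler conceptually emits an outlined microtask with the kmpc_micro
   signature and forwards the address of each non-global shared variable
   through the varargs of __kmpc_fork_call:

     static void outlined_fn(kmp_int32 *gtid, kmp_int32 *btid, int *x) {
       body(x); // parallel region body, executed by every thread in the team
     }

     __kmpc_fork_call(&my_loc, 1, (kmpc_micro)outlined_fn, &x);
*/
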
#if OMP_40_ENABLED
/*!
@ingroup PARALLEL
@param loc source location information
@param global_tid global thread number
@param num_teams number of teams requested for the teams construct
@param num_threads number of threads per team requested for the teams construct

Set the number of teams to be used by the teams construct.
This call is only required if the teams construct has a `num_teams` clause
or a `thread_limit` clause (or both).
*/
void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
                           kmp_int32 num_teams, kmp_int32 num_threads) {
  KA_TRACE(20,
           ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
            global_tid, num_teams, num_threads));

  __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
}

/*!
@ingroup PARALLEL
@param loc source location information
@param argc total number of arguments in the ellipsis
@param microtask pointer to callback routine consisting of outlined teams
construct
@param ... pointers to shared variables that aren't global

Do the actual fork and call the microtask in the relevant number of threads.
*/
void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
                       ...) {
  int gtid = __kmp_entry_gtid();
  kmp_info_t *this_thr = __kmp_threads[gtid];
  va_list ap;
  va_start(ap, microtask);

  KMP_COUNT_BLOCK(OMP_TEAMS);

  // remember teams entry point and nesting level
  this_thr->th.th_teams_microtask = microtask;
  this_thr->th.th_teams_level =
      this_thr->th.th_team->t.t_level; // AC: can be >0 on host

#if OMPT_SUPPORT
  kmp_team_t *parent_team = this_thr->th.th_team;
  int tid = __kmp_tid_from_gtid(gtid);
  if (ompt_enabled) {
    parent_team->t.t_implicit_task_taskdata[tid]
        .ompt_task_info.frame.reenter_runtime_frame =
        __builtin_frame_address(1);
  }
#endif

  // check if __kmpc_push_num_teams was called; set default number of teams
  // otherwise
  if (this_thr->th.th_teams_size.nteams == 0) {
    __kmp_push_num_teams(loc, gtid, 0, 0);
  }
  KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);

  __kmp_fork_call(loc, gtid, fork_context_intel, argc,
#if OMPT_SUPPORT
                  VOLATILE_CAST(void *) microtask, // "unwrapped" task
#endif
                  VOLATILE_CAST(microtask_t)
                      __kmp_teams_master, // "wrapped" task
                  VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
                  &ap
#else
                  ap
#endif
                  );
  __kmp_join_call(loc, gtid
#if OMPT_SUPPORT
                  ,
                  fork_context_intel
#endif
                  );

  this_thr->th.th_teams_microtask = NULL;
  this_thr->th.th_teams_level = 0;
  *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
  va_end(ap);
}
#endif /* OMP_40_ENABLED */

// I don't think this function should ever have been exported.
// The __kmpc_ prefix was misapplied. I'm fairly certain that no generated
// openmp code ever called it, but it's been exported from the RTL for so
// long that I'm afraid to remove the definition.
int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }

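/* Illustrative sketch only (placeholder names): the serialized entry points
   documented below pair with __kmpc_fork_call to implement a conditional
   parallel region such as "#pragma omp parallel if (cond)". One possible
   lowering:

     if (cond) {
       __kmpc_fork_call(&my_loc, 1, (kmpc_micro)outlined_fn, &x);
     } else {
       kmp_int32 gtid = __kmpc_global_thread_num(&my_loc);
       kmp_int32 zero = 0;
       __kmpc_serialized_parallel(&my_loc, gtid);
       outlined_fn(&gtid, &zero, &x); // body runs on the encountering thread
       __kmpc_end_serialized_parallel(&my_loc, gtid);
     }
*/
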
/*!
@ingroup PARALLEL
@param loc source location information
@param global_tid global thread number

Enter a serialized parallel construct. This interface is used to handle a
conditional parallel region, like this,
@code
#pragma omp parallel if (condition)
@endcode
when the condition is false.
*/
void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  // The implementation is now in kmp_runtime.cpp so that it can share static
  // functions with kmp_fork_call since the tasks to be done are similar in
  // each case.
  __kmp_serialized_parallel(loc, global_tid);
}

/*!
@ingroup PARALLEL
@param loc source location information
@param global_tid global thread number

Leave a serialized parallel construct.
*/
void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  kmp_internal_control_t *top;
  kmp_info_t *this_thr;
  kmp_team_t *serial_team;

  KC_TRACE(10,
           ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));

  /* skip all this code for autopar serialized loops since it results in
     unacceptable overhead */
  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
    return;

  // Not autopar code
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  this_thr = __kmp_threads[global_tid];
  serial_team = this_thr->th.th_serial_team;

#if OMP_45_ENABLED
  kmp_task_team_t *task_team = this_thr->th.th_task_team;

  // we need to wait for the proxy tasks before finishing the thread
  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
    __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
#endif

  KMP_MB();
  KMP_DEBUG_ASSERT(serial_team);
  KMP_ASSERT(serial_team->t.t_serialized);
  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
  KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);

  /* If necessary, pop the internal control stack values and replace the team
   * values */
  top = serial_team->t.t_control_stack_top;
  if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
    copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
    serial_team->t.t_control_stack_top = top->next;
    __kmp_free(top);
  }

  // if( serial_team -> t.t_serialized > 1 )
  serial_team->t.t_level--;

  /* pop dispatch buffers stack */
  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
  {
    dispatch_private_info_t *disp_buffer =
        serial_team->t.t_dispatch->th_disp_buffer;
    serial_team->t.t_dispatch->th_disp_buffer =
        serial_team->t.t_dispatch->th_disp_buffer->next;
    __kmp_free(disp_buffer);
  }

  --serial_team->t.t_serialized;
  if (serial_team->t.t_serialized == 0) {

    /* return to the parallel section */

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
    if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
      __kmp_clear_x87_fpu_status_word();
      __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
      __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
    }
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

    this_thr->th.th_team = serial_team->t.t_parent;
    this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;

    /* restore values cached in the thread */
    this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /* JPH */
    this_thr->th.th_team_master =
        serial_team->t.t_parent->t.t_threads[0]; /* JPH */
    this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;

    /* TODO the below shouldn't need to be adjusted for serialized teams */
    this_thr->th.th_dispatch =
        &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];

    __kmp_pop_current_task_from_thread(this_thr);

    KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
    this_thr->th.th_current_task->td_flags.executing = 1;

    if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Copy the task team from the new child / old parent team to the thread.
      this_thr->th.th_task_team =
          this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
      KA_TRACE(20,
               ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
                "team %p\n",
                global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
    }
  } else {
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
                    "depth of serial team %p to %d\n",
                    global_tid, serial_team, serial_team->t.t_serialized));
    }
  }

  if (__kmp_env_consistency_check)
    __kmp_pop_parallel(global_tid, NULL);
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information.

Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
depending on the memory ordering convention obeyed by the compiler
even that may not be necessary).
*/
void __kmpc_flush(ident_t *loc) {
  KC_TRACE(10, ("__kmpc_flush: called\n"));

  /* need explicit __mf() here since use volatile instead in library */
  KMP_MB(); /* Flush all pending memory write invalidates. */

#if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
#if KMP_MIC
// fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
// We shouldn't need it, though, since the ABI rules require that
// * If the compiler generates NGO stores it also generates the fence
// * If users hand-code NGO stores they should insert the fence
// therefore no incomplete unordered stores should be visible.
#else
  // C74404
  // This is to address non-temporal store instructions (sfence needed).
  // The clflush instruction is also addressed here (mfence needed).
  // Probably the non-temporal load movntdqa instruction should also be
  // addressed.
  // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
  if (!__kmp_cpuinfo.initialized) {
    __kmp_query_cpuid(&__kmp_cpuinfo);
  }; // if
  if (!__kmp_cpuinfo.sse2) {
    // CPU cannot execute SSE2 instructions.
  } else {
#if KMP_COMPILER_ICC
    _mm_mfence();
#elif KMP_COMPILER_MSVC
    MemoryBarrier();
#else
    __sync_synchronize();
#endif // KMP_COMPILER_ICC
  }; // if
#endif // KMP_MIC
#elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64)
// Nothing to see here move along
#elif KMP_ARCH_PPC64
// Nothing needed here (we have a real MB above).
#if KMP_OS_CNK
  // The flushing thread needs to yield here; this prevents a
  // busy-waiting thread from saturating the pipeline. flush is
  // often used in loops like this:
  //   while (!flag) {
  //     #pragma omp flush(flag)
  //   }
  // and adding the yield here is good for at least a 10x speedup
  // when running >2 threads per core (on the NAS LU benchmark).
  __kmp_yield(TRUE);
#endif
#else
#error Unknown or unsupported architecture
#endif
}

/* -------------------------------------------------------------------------- */
/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.

Execute a barrier.
*/
void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
  KMP_COUNT_BLOCK(OMP_BARRIER);
  KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
    }; // if

    __kmp_check_barrier(global_tid, ct_barrier, loc);
  }

#if OMPT_SUPPORT && OMPT_TRACE
  ompt_frame_t *ompt_frame;
  if (ompt_enabled) {
    ompt_frame = __ompt_get_task_frame_internal(0);
    if (ompt_frame->reenter_runtime_frame == NULL)
      ompt_frame->reenter_runtime_frame = __builtin_frame_address(1);
  }
#endif
  __kmp_threads[global_tid]->th.th_ident = loc;
  // TODO: explicit barrier_wait_id:
  // this function is called when 'barrier' directive is present or
  // implicit barrier at the end of a worksharing construct.
  // 1) better to add a per-thread barrier counter to a thread data structure
  // 2) set to 0 when a new team is created
  // 4) no sync is required

  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_TRACE
  if (ompt_enabled) {
    ompt_frame->reenter_runtime_frame = NULL;
  }
#endif
}

/* The BARRIER for a MASTER section is always explicit */
/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number.
@return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
*/
kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
  int status = 0;

  KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  if (KMP_MASTER_GTID(global_tid)) {
    KMP_COUNT_BLOCK(OMP_MASTER);
    KMP_PUSH_PARTITIONED_TIMER(OMP_master);
    status = 1;
  }

#if OMPT_SUPPORT && OMPT_TRACE
  if (status) {
    if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_master_begin)) {
      kmp_info_t *this_thr = __kmp_threads[global_tid];
      kmp_team_t *team = this_thr->th.th_team;

      int tid = __kmp_tid_from_gtid(global_tid);
      ompt_callbacks.ompt_callback(ompt_event_master_begin)(
          team->t.ompt_team_info.parallel_id,
          team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
    }
  }
#endif

  if (__kmp_env_consistency_check) {
#if KMP_USE_DYNAMIC_LOCK
    if (status)
      __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
    else
      __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
#else
    if (status)
      __kmp_push_sync(global_tid, ct_master, loc, NULL);
    else
      __kmp_check_sync(global_tid, ct_master, loc, NULL);
#endif
  }

  return status;
}

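/* Illustrative sketch only (placeholder names): a "#pragma omp master" block
   is typically lowered as a guarded call pair; the end call (documented
   below) is made only by the thread that actually executed the block:

     if (__kmpc_master(&my_loc, gtid)) {
       body();
       __kmpc_end_master(&my_loc, gtid);
     }
*/
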
/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number.

Mark the end of a <tt>master</tt> region. This should only be called by the
thread that executes the <tt>master</tt> region.
*/
void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
  KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));

  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
  KMP_POP_PARTITIONED_TIMER();

#if OMPT_SUPPORT && OMPT_TRACE
  kmp_info_t *this_thr = __kmp_threads[global_tid];
  kmp_team_t *team = this_thr->th.th_team;
  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_master_end)) {
    int tid = __kmp_tid_from_gtid(global_tid);
    ompt_callbacks.ompt_callback(ompt_event_master_end)(
        team->t.ompt_team_info.parallel_id,
        team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
  }
#endif

  if (__kmp_env_consistency_check) {
    if (global_tid < 0)
      KMP_WARNING(ThreadIdentInvalid);

    if (KMP_MASTER_GTID(global_tid))
      __kmp_pop_sync(global_tid, ct_master, loc);
  }
}

/*!
@ingroup WORK_SHARING
@param loc source location information.
@param gtid global thread number.

Start execution of an <tt>ordered</tt> construct.
*/
void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
  int cid = 0;
  kmp_info_t *th;
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

#if USE_ITT_BUILD
  __kmp_itt_ordered_prep(gtid);
// TODO: ordered_wait_id
#endif /* USE_ITT_BUILD */

  th = __kmp_threads[gtid];

#if OMPT_SUPPORT && OMPT_TRACE
  if (ompt_enabled) {
    /* OMPT state update */
    th->th.ompt_thread_info.wait_id = (uint64_t)loc;
    th->th.ompt_thread_info.state = ompt_state_wait_ordered;

    /* OMPT event callback */
    if (ompt_callbacks.ompt_callback(ompt_event_wait_ordered)) {
      ompt_callbacks.ompt_callback(ompt_event_wait_ordered)(
          th->th.ompt_thread_info.wait_id);
    }
  }
#endif

  if (th->th.th_dispatch->th_deo_fcn != 0)
    (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
  else
    __kmp_parallel_deo(&gtid, &cid, loc);

#if OMPT_SUPPORT && OMPT_TRACE
  if (ompt_enabled) {
    /* OMPT state update */
    th->th.ompt_thread_info.state = ompt_state_work_parallel;
    th->th.ompt_thread_info.wait_id = 0;

    /* OMPT event callback */
    if (ompt_callbacks.ompt_callback(ompt_event_acquired_ordered)) {
      ompt_callbacks.ompt_callback(ompt_event_acquired_ordered)(
          th->th.ompt_thread_info.wait_id);
    }
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_ordered_start(gtid);
#endif /* USE_ITT_BUILD */
}

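/* Illustrative sketch only (placeholder names): inside a loop scheduled with
   an ordered clause, a "#pragma omp ordered" block is bracketed by the entry
   point above and the one documented below:

     __kmpc_ordered(&my_loc, gtid);
     body();
     __kmpc_end_ordered(&my_loc, gtid);
*/
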
/*!
@ingroup WORK_SHARING
@param loc source location information.
@param gtid global thread number.

End execution of an <tt>ordered</tt> construct.
*/
void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
  int cid = 0;
  kmp_info_t *th;

  KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));

#if USE_ITT_BUILD
  __kmp_itt_ordered_end(gtid);
// TODO: ordered_wait_id
#endif /* USE_ITT_BUILD */

  th = __kmp_threads[gtid];

  if (th->th.th_dispatch->th_dxo_fcn != 0)
    (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
  else
    __kmp_parallel_dxo(&gtid, &cid, loc);

#if OMPT_SUPPORT && OMPT_BLAME
  if (ompt_enabled &&
      ompt_callbacks.ompt_callback(ompt_event_release_ordered)) {
    ompt_callbacks.ompt_callback(ompt_event_release_ordered)(
        th->th.ompt_thread_info.wait_id);
  }
#endif
}

#if KMP_USE_DYNAMIC_LOCK

static __forceinline void
__kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
                          kmp_int32 gtid, kmp_indirect_locktag_t tag) {
  // Pointer to the allocated indirect lock is written to crit, while indexing
  // is ignored.
  void *idx;
  kmp_indirect_lock_t **lck;
  lck = (kmp_indirect_lock_t **)crit;
  kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
  KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
  KMP_SET_I_LOCK_LOCATION(ilk, loc);
  KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
  KA_TRACE(20,
           ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
#if USE_ITT_BUILD
  __kmp_itt_critical_creating(ilk->lock, loc);
#endif
  int status = KMP_COMPARE_AND_STORE_PTR(lck, 0, ilk);
  if (status == 0) {
#if USE_ITT_BUILD
    __kmp_itt_critical_destroyed(ilk->lock);
#endif
    // We don't really need to destroy the unclaimed lock here since it will be
    // cleaned up at program exit.
    // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
  }
  KMP_DEBUG_ASSERT(*lck != NULL);
}

// Fast-path acquire tas lock
#define KMP_ACQUIRE_TAS_LOCK(lock, gtid) \
  { \
    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \
    if (l->lk.poll != KMP_LOCK_FREE(tas) || \
        !KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas), \
                                     KMP_LOCK_BUSY(gtid + 1, tas))) { \
      kmp_uint32 spins; \
      KMP_FSYNC_PREPARE(l); \
      KMP_INIT_YIELD(spins); \
      if (TCR_4(__kmp_nth) > \
          (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \
        KMP_YIELD(TRUE); \
      } else { \
        KMP_YIELD_SPIN(spins); \
      } \
      kmp_backoff_t backoff = __kmp_spin_backoff_params; \
      while (l->lk.poll != KMP_LOCK_FREE(tas) || \
             !KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas), \
                                          KMP_LOCK_BUSY(gtid + 1, tas))) { \
        __kmp_spin_backoff(&backoff); \
        if (TCR_4(__kmp_nth) > \
            (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \
          KMP_YIELD(TRUE); \
        } else { \
          KMP_YIELD_SPIN(spins); \
        } \
      } \
    } \
    KMP_FSYNC_ACQUIRED(l); \
  }

// Fast-path test tas lock
#define KMP_TEST_TAS_LOCK(lock, gtid, rc) \
  { \
    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \
    rc = l->lk.poll == KMP_LOCK_FREE(tas) && \
         KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas), \
                                     KMP_LOCK_BUSY(gtid + 1, tas)); \
  }

// Fast-path release tas lock
#define KMP_RELEASE_TAS_LOCK(lock, gtid) \
  { \
    TCW_4(((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); \
    KMP_MB(); \
  }

#if KMP_USE_FUTEX

#include <sys/syscall.h>
#include <unistd.h>
#ifndef FUTEX_WAIT
#define FUTEX_WAIT 0
#endif
#ifndef FUTEX_WAKE
#define FUTEX_WAKE 1
#endif

// Fast-path acquire futex lock
#define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid) \
  { \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
    kmp_int32 gtid_code = (gtid + 1) << 1; \
    KMP_MB(); \
    KMP_FSYNC_PREPARE(ftx); \
    kmp_int32 poll_val; \
    while ((poll_val = KMP_COMPARE_AND_STORE_RET32( \
                &(ftx->lk.poll), KMP_LOCK_FREE(futex), \
                KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) { \
      kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1; \
      if (!cond) { \
        if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val, \
                                         poll_val | \
                                             KMP_LOCK_BUSY(1, futex))) { \
          continue; \
        } \
        poll_val |= KMP_LOCK_BUSY(1, futex); \
      } \
      kmp_int32 rc; \
      if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val, \
                        NULL, NULL, 0)) != 0) { \
        continue; \
      } \
      gtid_code |= 1; \
    } \
    KMP_FSYNC_ACQUIRED(ftx); \
  }

// Fast-path test futex lock
#define KMP_TEST_FUTEX_LOCK(lock, gtid, rc) \
  { \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
    if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex), \
                                    KMP_LOCK_BUSY(gtid + 1 << 1, futex))) { \
      KMP_FSYNC_ACQUIRED(ftx); \
      rc = TRUE; \
    } else { \
      rc = FALSE; \
    } \
  }

// Fast-path release futex lock
#define KMP_RELEASE_FUTEX_LOCK(lock, gtid) \
  { \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
    KMP_MB(); \
    KMP_FSYNC_RELEASING(ftx); \
    kmp_int32 poll_val = \
        KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex)); \
    if (KMP_LOCK_STRIP(poll_val) & 1) { \
      syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE, \
              KMP_LOCK_BUSY(1, futex), NULL, NULL, 0); \
    } \
    KMP_MB(); \
    KMP_YIELD(TCR_4(__kmp_nth) > \
              (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)); \
  }

#endif // KMP_USE_FUTEX

#else // KMP_USE_DYNAMIC_LOCK

static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
                                                      ident_t const *loc,
                                                      kmp_int32 gtid) {
  kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;

  // Because of the double-check, the following load doesn't need to be volatile
  kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);

  if (lck == NULL) {
    void *idx;

    // Allocate & initialize the lock.
    // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
    lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
    __kmp_init_user_lock_with_checks(lck);
    __kmp_set_user_lock_location(lck, loc);
#if USE_ITT_BUILD
    __kmp_itt_critical_creating(lck);
// __kmp_itt_critical_creating() should be called *before* the first usage
// of underlying lock. It is the only place where we can guarantee it. There
// are chances the lock will be destroyed with no usage, but it is not a
// problem, because this is not a real event seen by user but rather setting
// name for object (lock). See more details in kmp_itt.h.
#endif /* USE_ITT_BUILD */

    // Use a cmpxchg instruction to slam the start of the critical section with
    // the lock pointer. If another thread beat us to it, deallocate the lock,
    // and use the lock that the other thread allocated.
    int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);

    if (status == 0) {
      // Deallocate the lock and reload the value.
#if USE_ITT_BUILD
      __kmp_itt_critical_destroyed(lck);
// Let ITT know the lock is destroyed and the same memory location may be reused
// for another purpose.
#endif /* USE_ITT_BUILD */
      __kmp_destroy_user_lock_with_checks(lck);
      __kmp_user_lock_free(&idx, gtid, lck);
      lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
      KMP_DEBUG_ASSERT(lck != NULL);
    }
  }
  return lck;
}

#endif // KMP_USE_DYNAMIC_LOCK

/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number.
@param crit identity of the critical section. This could be a pointer to a lock
associated with the critical section, or some other suitably unique value.

Enter code protected by a `critical` construct.
This function blocks until the executing thread can enter the critical section.
*/
void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
                     kmp_critical_name *crit) {
#if KMP_USE_DYNAMIC_LOCK
  __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
#else
  KMP_COUNT_BLOCK(OMP_CRITICAL);
  KMP_TIME_PARTITIONED_BLOCK(
      OMP_critical_wait); /* Time spent waiting to enter the critical section */
  kmp_user_lock_p lck;

  KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));

  // TODO: add THR_OVHD_STATE

  KMP_CHECK_USER_LOCK_INIT();

  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#endif
  else { // ticket, queuing or drdpa
    lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
  }

  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_critical, loc, lck);

  // since the critical directive binds to all threads, not just the current
  // team, we have to check this even if we are in a serialized team.
  // also, even if we are the uber thread, we still have to acquire the lock,
  // as we have to contend with sibling threads.

#if USE_ITT_BUILD
  __kmp_itt_critical_acquiring(lck);
#endif /* USE_ITT_BUILD */
  // Value of 'crit' should be good for using as a critical_id of the critical
  // section directive.
  __kmp_acquire_user_lock_with_checks(lck, global_tid);

#if USE_ITT_BUILD
  __kmp_itt_critical_acquired(lck);
#endif /* USE_ITT_BUILD */

  KMP_START_EXPLICIT_TIMER(OMP_critical);
  KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
#endif // KMP_USE_DYNAMIC_LOCK
}

#if KMP_USE_DYNAMIC_LOCK

// Converts the given hint to an internal lock implementation
static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
#if KMP_USE_TSX
#define KMP_TSX_LOCK(seq) lockseq_##seq
#else
#define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
#endif

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
#define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
#else
#define KMP_CPUINFO_RTM 0
#endif

  // Hints that do not require further logic
  if (hint & kmp_lock_hint_hle)
    return KMP_TSX_LOCK(hle);
  if (hint & kmp_lock_hint_rtm)
    return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
  if (hint & kmp_lock_hint_adaptive)
    return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;

  // Rule out conflicting hints first by returning the default lock
  if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
    return __kmp_user_lock_seq;
  if ((hint & omp_lock_hint_speculative) &&
      (hint & omp_lock_hint_nonspeculative))
    return __kmp_user_lock_seq;

  // Do not even consider speculation when it appears to be contended
  if (hint & omp_lock_hint_contended)
    return lockseq_queuing;

  // Uncontended lock without speculation
  if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
    return lockseq_tas;

  // HLE lock for speculation
  if (hint & omp_lock_hint_speculative)
    return KMP_TSX_LOCK(hle);

  return __kmp_user_lock_seq;
}

/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number.
@param crit identity of the critical section. This could be a pointer to a lock
associated with the critical section, or some other suitably unique value.
@param hint the lock hint.

Enter code protected by a `critical` construct with a hint. The hint value is
used to suggest a lock implementation. This function blocks until the executing
thread can enter the critical section unless the hint suggests use of
speculative execution and the hardware supports it.
*/
void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
                               kmp_critical_name *crit, uintptr_t hint) {
  KMP_COUNT_BLOCK(OMP_CRITICAL);
  kmp_user_lock_p lck;

  KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));

  kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
  // Check if it is initialized.
  if (*lk == 0) {
    kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
    if (KMP_IS_D_LOCK(lckseq)) {
      KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
                                  KMP_GET_D_TAG(lckseq));
    } else {
      __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
    }
  }
  // Branch for accessing the actual lock object and set operation. This
  // branching is inevitable since this lock initialization does not follow the
  // normal dispatch path (lock table is not used).
  if (KMP_EXTRACT_D_TAG(lk) != 0) {
    lck = (kmp_user_lock_p)lk;
    if (__kmp_env_consistency_check) {
      __kmp_push_sync(global_tid, ct_critical, loc, lck,
                      __kmp_map_hint_to_lock(hint));
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_acquiring(lck);
#endif
#if KMP_USE_INLINED_TAS
    if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
      KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
    } else
#elif KMP_USE_INLINED_FUTEX
    if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
      KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
    } else
#endif
    {
      KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
    }
  } else {
    kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
    lck = ilk->lock;
    if (__kmp_env_consistency_check) {
      __kmp_push_sync(global_tid, ct_critical, loc, lck,
                      __kmp_map_hint_to_lock(hint));
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_acquiring(lck);
#endif
    KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
  }

#if USE_ITT_BUILD
  __kmp_itt_critical_acquired(lck);
#endif /* USE_ITT_BUILD */

  KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
  KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
} // __kmpc_critical_with_hint

#endif // KMP_USE_DYNAMIC_LOCK

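/* Illustrative sketch only (placeholder names): a critical construct is
   typically lowered with a statically allocated, zero-initialized
   kmp_critical_name that the runtime uses to cache the underlying lock, as
   described for 'crit' above:

     static kmp_critical_name crit_cache = {0}; // one per critical name

     __kmpc_critical(&my_loc, gtid, &crit_cache);
     body();
     __kmpc_end_critical(&my_loc, gtid, &crit_cache);
*/
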
/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number.
@param crit identity of the critical section. This could be a pointer to a lock
associated with the critical section, or some other suitably unique value.

Leave a critical section, releasing any lock that was held during its execution.
*/
void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
                         kmp_critical_name *crit) {
  kmp_user_lock_p lck;

  KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));

#if KMP_USE_DYNAMIC_LOCK
  if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
    lck = (kmp_user_lock_p)crit;
    KMP_ASSERT(lck != NULL);
    if (__kmp_env_consistency_check) {
      __kmp_pop_sync(global_tid, ct_critical, loc);
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_releasing(lck);
#endif
#if KMP_USE_INLINED_TAS
    if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
      KMP_RELEASE_TAS_LOCK(lck, global_tid);
    } else
#elif KMP_USE_INLINED_FUTEX
    if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
      KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
    } else
#endif
    {
      KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
    }
  } else {
    kmp_indirect_lock_t *ilk =
        (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
    KMP_ASSERT(ilk != NULL);
    lck = ilk->lock;
    if (__kmp_env_consistency_check) {
      __kmp_pop_sync(global_tid, ct_critical, loc);
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_releasing(lck);
#endif
    KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
  }

#else // KMP_USE_DYNAMIC_LOCK

  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#endif
  else { // ticket, queuing or drdpa
    lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
  }

  KMP_ASSERT(lck != NULL);

  if (__kmp_env_consistency_check)
    __kmp_pop_sync(global_tid, ct_critical, loc);

#if USE_ITT_BUILD
  __kmp_itt_critical_releasing(lck);
#endif /* USE_ITT_BUILD */
  // Value of 'crit' should be good for using as a critical_id of the critical
  // section directive.
  __kmp_release_user_lock_with_checks(lck, global_tid);

#if OMPT_SUPPORT && OMPT_BLAME
  if (ompt_enabled &&
      ompt_callbacks.ompt_callback(ompt_event_release_critical)) {
    ompt_callbacks.ompt_callback(ompt_event_release_critical)((uint64_t)lck);
  }
#endif

#endif // KMP_USE_DYNAMIC_LOCK
  KMP_POP_PARTITIONED_TIMER();
  KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.
@return one if the thread should execute the master block, zero otherwise

Start execution of a combined barrier and master. The barrier is executed inside
this function.
*/
kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
  int status;

  KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  if (__kmp_env_consistency_check)
    __kmp_check_barrier(global_tid, ct_barrier, loc);

#if USE_ITT_NOTIFY
  __kmp_threads[global_tid]->th.th_ident = loc;
#endif
  status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);

  return (status != 0) ? 0 : 1;
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.

Complete the execution of a combined barrier and master. This function should
only be called at the completion of the <tt>master</tt> code. Other threads will
still be waiting at the barrier and this call releases them.
*/
void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
  KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));

  __kmp_end_split_barrier(bs_plain_barrier, global_tid);
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.
@return one if the thread should execute the master block, zero otherwise

Start execution of a combined barrier and master(nowait) construct.
The barrier is executed inside this function.
There is no equivalent "end" function, since the
*/
kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
  kmp_int32 ret;

  KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
    }
    __kmp_check_barrier(global_tid, ct_barrier, loc);
  }

#if USE_ITT_NOTIFY
  __kmp_threads[global_tid]->th.th_ident = loc;
#endif
  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);

  ret = __kmpc_master(loc, global_tid);

  if (__kmp_env_consistency_check) {
    /* there's no __kmpc_end_master called; so the (stats) */
    /* actions of __kmpc_end_master are done here */

    if (global_tid < 0) {
      KMP_WARNING(ThreadIdentInvalid);
    }
    if (ret) {
      /* only one thread should do the pop since only */
      /* one did the push (see __kmpc_master()) */

      __kmp_pop_sync(global_tid, ct_master, loc);
    }
  }

  return (ret);
}

/* The BARRIER for a SINGLE process section is always explicit */
/*!
@ingroup WORK_SHARING
@param loc source location information
@param global_tid global thread number
@return One if this thread should execute the single construct, zero otherwise.

Test whether to execute a <tt>single</tt> construct.
There are no implicit barriers in the two "single" calls, rather the compiler
should introduce an explicit barrier if it is required.
*/

kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
  kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);

  if (rc) {
    // We are going to execute the single statement, so we should count it.
    KMP_COUNT_BLOCK(OMP_SINGLE);
    KMP_PUSH_PARTITIONED_TIMER(OMP_single);
  }

#if OMPT_SUPPORT && OMPT_TRACE
  kmp_info_t *this_thr = __kmp_threads[global_tid];
  kmp_team_t *team = this_thr->th.th_team;
  int tid = __kmp_tid_from_gtid(global_tid);

  if (ompt_enabled) {
    if (rc) {
      if (ompt_callbacks.ompt_callback(ompt_event_single_in_block_begin)) {
        ompt_callbacks.ompt_callback(ompt_event_single_in_block_begin)(
            team->t.ompt_team_info.parallel_id,
            team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id,
            team->t.ompt_team_info.microtask);
      }
    } else {
      if (ompt_callbacks.ompt_callback(ompt_event_single_others_begin)) {
        ompt_callbacks.ompt_callback(ompt_event_single_others_begin)(
            team->t.ompt_team_info.parallel_id,
            team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
      }
      this_thr->th.ompt_thread_info.state = ompt_state_wait_single;
    }
  }
#endif

  return rc;
}

/*!
@ingroup WORK_SHARING
@param loc source location information
@param global_tid global thread number

Mark the end of a <tt>single</tt> construct. This function should
only be called by the thread that executed the block of code protected
by the `single` construct.
*/
void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
  __kmp_exit_single(global_tid);
  KMP_POP_PARTITIONED_TIMER();

#if OMPT_SUPPORT && OMPT_TRACE
  kmp_info_t *this_thr = __kmp_threads[global_tid];
  kmp_team_t *team = this_thr->th.th_team;
  int tid = __kmp_tid_from_gtid(global_tid);

  if (ompt_enabled &&
      ompt_callbacks.ompt_callback(ompt_event_single_in_block_end)) {
    ompt_callbacks.ompt_callback(ompt_event_single_in_block_end)(
        team->t.ompt_team_info.parallel_id,
        team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
  }
#endif
}

/*!
@ingroup WORK_SHARING
@param loc Source location
@param global_tid Global thread id

Mark the end of a statically scheduled loop.
*/
void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
  KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));

#if OMPT_SUPPORT && OMPT_TRACE
  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_end)) {
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
    ompt_callbacks.ompt_callback(ompt_event_loop_end)(team_info->parallel_id,
                                                      task_info->task_id);
  }
#endif

  if (__kmp_env_consistency_check)
    __kmp_pop_workshare(global_tid, ct_pdo, loc);
}

// User routines which take C-style arguments (call by value)
// different from the Fortran equivalent routines

void ompc_set_num_threads(int arg) {
  // !!!!! TODO: check the per-task binding
  __kmp_set_num_threads(arg, __kmp_entry_gtid());
}

void ompc_set_dynamic(int flag) {
  kmp_info_t *thread;

  /* For the thread-private implementation of the internal controls */
  thread = __kmp_entry_thread();

  __kmp_save_internal_controls(thread);

  set__dynamic(thread, flag ? TRUE : FALSE);
}

void ompc_set_nested(int flag) {
  kmp_info_t *thread;

  /* For the thread-private internal controls implementation */
  thread = __kmp_entry_thread();

  __kmp_save_internal_controls(thread);

  set__nested(thread, flag ? TRUE : FALSE);
}

void ompc_set_max_active_levels(int max_active_levels) {
  /* TO DO */
  /* we want per-task implementation of this internal control */

  /* For the per-thread internal controls implementation */
  __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
}

void ompc_set_schedule(omp_sched_t kind, int modifier) {
  // !!!!! TODO: check the per-task binding
  __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
}

int ompc_get_ancestor_thread_num(int level) {
  return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
}

int ompc_get_team_size(int level) {
  return __kmp_get_team_size(__kmp_entry_gtid(), level);
}

void kmpc_set_stacksize(int arg) {
  // __kmp_aux_set_stacksize initializes the library if needed
  __kmp_aux_set_stacksize(arg);
}

void kmpc_set_stacksize_s(size_t arg) {
  // __kmp_aux_set_stacksize initializes the library if needed
  __kmp_aux_set_stacksize(arg);
}

void kmpc_set_blocktime(int arg) {
  int gtid, tid;
  kmp_info_t *thread;

  gtid = __kmp_entry_gtid();
  tid = __kmp_tid_from_gtid(gtid);
  thread = __kmp_thread_from_gtid(gtid);

  __kmp_aux_set_blocktime(arg, thread, tid);
}

void kmpc_set_library(int arg) {
  // __kmp_user_set_library initializes the library if needed
  __kmp_user_set_library((enum library_type)arg);
}

void kmpc_set_defaults(char const *str) {
  // __kmp_aux_set_defaults initializes the library if needed
  __kmp_aux_set_defaults(str, KMP_STRLEN(str));
}

void kmpc_set_disp_num_buffers(int arg) {
  // ignore after initialization because some teams have already
  // allocated dispatch buffers
  if (__kmp_init_serial == 0 && arg > 0)
    __kmp_dispatch_num_buffers = arg;
}

int kmpc_set_affinity_mask_proc(int proc, void **mask) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
  return -1;
#else
  if (!TCR_4(__kmp_init_middle)) {
    __kmp_middle_initialize();
  }
  return __kmp_aux_set_affinity_mask_proc(proc, mask);
#endif
}

int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
  return -1;
#else
  if (!TCR_4(__kmp_init_middle)) {
    __kmp_middle_initialize();
  }
  return __kmp_aux_unset_affinity_mask_proc(proc, mask);
#endif
}

int kmpc_get_affinity_mask_proc(int proc, void **mask) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
  return -1;
#else
  if (!TCR_4(__kmp_init_middle)) {
    __kmp_middle_initialize();
  }
  return __kmp_aux_get_affinity_mask_proc(proc, mask);
#endif
}

/* -------------------------------------------------------------------------- */
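
/* Illustrative sketch only (placeholder names, not actual compiler output):
   the copyprivate broadcast documented below is driven by compiler-generated
   code of roughly this shape, where didit is 1 only on the thread that
   executed the single block and buf is a per-thread buffer (or pointer list)
   holding the data to broadcast:

     static void copy_fn(void *dst, void *src) {
       // copy each copyprivate variable from the source thread's buffer
       // into this thread's buffer
     }

     __kmpc_copyprivate(&my_loc, gtid, sizeof(buf), &buf, copy_fn, didit);
*/
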
/*!
@ingroup THREADPRIVATE
@param loc source location information
@param gtid global thread number
@param cpy_size size of the cpy_data buffer
@param cpy_data pointer to data to be copied
@param cpy_func helper function to call for copying data
@param didit flag variable: 1=single thread; 0=not single thread

__kmpc_copyprivate implements the interface for the private data broadcast
needed for the copyprivate clause associated with a single region in an
OpenMP<sup>*</sup> program (both C and Fortran).
All threads participating in the parallel region call this routine.
One of the threads (called the single thread) should have the <tt>didit</tt>
variable set to 1 and all other threads should have that variable set to 0.
All threads pass a pointer to a data buffer (cpy_data) that they have built.

The OpenMP specification forbids the use of nowait on the single region when a
copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
barrier internally to avoid race conditions, so the code generation for the
single region should avoid generating a barrier after the call to @ref
__kmpc_copyprivate.

The <tt>gtid</tt> parameter is the global thread id for the current thread.
The <tt>loc</tt> parameter is a pointer to source location information.

Internal implementation: The single thread will first copy its descriptor
address (cpy_data) to a team-private location, then the other threads will each
call the function pointed to by the parameter cpy_func, which carries out the
copy by copying the data using the cpy_data buffer.

The cpy_func routine used for the copy and the contents of the data area defined
by cpy_data and cpy_size may be built in any fashion that will allow the copy
to be done. For instance, the cpy_data buffer can hold the actual data to be
copied or it may hold a list of pointers to the data. The cpy_func routine must
interpret the cpy_data buffer appropriately.

The interface to cpy_func is as follows:
@code
void cpy_func( void *destination, void *source )
@endcode
where void *destination is the cpy_data pointer for the thread being copied to
and void *source is the cpy_data pointer for the thread being copied from.
*/
void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
                        void *cpy_data, void (*cpy_func)(void *, void *),
                        kmp_int32 didit) {
  void **data_ptr;

  KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));

  KMP_MB();

  data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid);
    }
  }

  // ToDo: Optimize the following two barriers into some kind of split barrier

  if (didit)
    *data_ptr = cpy_data;

/* This barrier is not a barrier region boundary */
#if USE_ITT_NOTIFY
  __kmp_threads[gtid]->th.th_ident = loc;
#endif
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);

  if (!didit)
    (*cpy_func)(cpy_data, *data_ptr);

// Consider next barrier a user-visible barrier for barrier region boundaries
// Nesting checks are already handled by the single construct checks

#if USE_ITT_NOTIFY
  __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
1728 // tasks can overwrite the location) 1729 #endif 1730 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); 1731 } 1732 1733 /* -------------------------------------------------------------------------- */ 1734 1735 #define INIT_LOCK __kmp_init_user_lock_with_checks 1736 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks 1737 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks 1738 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed 1739 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks 1740 #define ACQUIRE_NESTED_LOCK_TIMED \ 1741 __kmp_acquire_nested_user_lock_with_checks_timed 1742 #define RELEASE_LOCK __kmp_release_user_lock_with_checks 1743 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks 1744 #define TEST_LOCK __kmp_test_user_lock_with_checks 1745 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks 1746 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks 1747 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks 1748 1749 // TODO: Make check abort messages use location info & pass it into 1750 // with_checks routines 1751 1752 #if KMP_USE_DYNAMIC_LOCK 1753 1754 // internal lock initializer 1755 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock, 1756 kmp_dyna_lockseq_t seq) { 1757 if (KMP_IS_D_LOCK(seq)) { 1758 KMP_INIT_D_LOCK(lock, seq); 1759 #if USE_ITT_BUILD 1760 __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL); 1761 #endif 1762 } else { 1763 KMP_INIT_I_LOCK(lock, seq); 1764 #if USE_ITT_BUILD 1765 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock); 1766 __kmp_itt_lock_creating(ilk->lock, loc); 1767 #endif 1768 } 1769 } 1770 1771 // internal nest lock initializer 1772 static __forceinline void 1773 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock, 1774 kmp_dyna_lockseq_t seq) { 1775 #if KMP_USE_TSX 1776 // Don't have nested lock implementation for speculative locks 1777 if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive) 1778 seq = __kmp_user_lock_seq; 1779 #endif 1780 switch (seq) { 1781 case lockseq_tas: 1782 seq = lockseq_nested_tas; 1783 break; 1784 #if KMP_USE_FUTEX 1785 case lockseq_futex: 1786 seq = lockseq_nested_futex; 1787 break; 1788 #endif 1789 case lockseq_ticket: 1790 seq = lockseq_nested_ticket; 1791 break; 1792 case lockseq_queuing: 1793 seq = lockseq_nested_queuing; 1794 break; 1795 case lockseq_drdpa: 1796 seq = lockseq_nested_drdpa; 1797 break; 1798 default: 1799 seq = lockseq_nested_queuing; 1800 } 1801 KMP_INIT_I_LOCK(lock, seq); 1802 #if USE_ITT_BUILD 1803 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock); 1804 __kmp_itt_lock_creating(ilk->lock, loc); 1805 #endif 1806 } 1807 1808 /* initialize the lock with a hint */ 1809 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock, 1810 uintptr_t hint) { 1811 KMP_DEBUG_ASSERT(__kmp_init_serial); 1812 if (__kmp_env_consistency_check && user_lock == NULL) { 1813 KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint"); 1814 } 1815 1816 __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint)); 1817 } 1818 1819 /* initialize the lock with a hint */ 1820 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid, 1821 void **user_lock, uintptr_t hint) { 1822 KMP_DEBUG_ASSERT(__kmp_init_serial); 1823 if (__kmp_env_consistency_check && user_lock == NULL) { 1824 KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint"); 1825 } 1826 1827 __kmp_init_nest_lock_with_hint(loc, user_lock, 
__kmp_map_hint_to_lock(hint)); 1828 } 1829 1830 #endif // KMP_USE_DYNAMIC_LOCK 1831 1832 /* initialize the lock */ 1833 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 1834 #if KMP_USE_DYNAMIC_LOCK 1835 1836 KMP_DEBUG_ASSERT(__kmp_init_serial); 1837 if (__kmp_env_consistency_check && user_lock == NULL) { 1838 KMP_FATAL(LockIsUninitialized, "omp_init_lock"); 1839 } 1840 __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq); 1841 1842 #else // KMP_USE_DYNAMIC_LOCK 1843 1844 static char const *const func = "omp_init_lock"; 1845 kmp_user_lock_p lck; 1846 KMP_DEBUG_ASSERT(__kmp_init_serial); 1847 1848 if (__kmp_env_consistency_check) { 1849 if (user_lock == NULL) { 1850 KMP_FATAL(LockIsUninitialized, func); 1851 } 1852 } 1853 1854 KMP_CHECK_USER_LOCK_INIT(); 1855 1856 if ((__kmp_user_lock_kind == lk_tas) && 1857 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 1858 lck = (kmp_user_lock_p)user_lock; 1859 } 1860 #if KMP_USE_FUTEX 1861 else if ((__kmp_user_lock_kind == lk_futex) && 1862 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 1863 lck = (kmp_user_lock_p)user_lock; 1864 } 1865 #endif 1866 else { 1867 lck = __kmp_user_lock_allocate(user_lock, gtid, 0); 1868 } 1869 INIT_LOCK(lck); 1870 __kmp_set_user_lock_location(lck, loc); 1871 1872 #if OMPT_SUPPORT && OMPT_TRACE 1873 if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_init_lock)) { 1874 ompt_callbacks.ompt_callback(ompt_event_init_lock)((uint64_t)lck); 1875 } 1876 #endif 1877 1878 #if USE_ITT_BUILD 1879 __kmp_itt_lock_creating(lck); 1880 #endif /* USE_ITT_BUILD */ 1881 1882 #endif // KMP_USE_DYNAMIC_LOCK 1883 } // __kmpc_init_lock 1884 1885 /* initialize the lock */ 1886 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 1887 #if KMP_USE_DYNAMIC_LOCK 1888 1889 KMP_DEBUG_ASSERT(__kmp_init_serial); 1890 if (__kmp_env_consistency_check && user_lock == NULL) { 1891 KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock"); 1892 } 1893 __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq); 1894 1895 #else // KMP_USE_DYNAMIC_LOCK 1896 1897 static char const *const func = "omp_init_nest_lock"; 1898 kmp_user_lock_p lck; 1899 KMP_DEBUG_ASSERT(__kmp_init_serial); 1900 1901 if (__kmp_env_consistency_check) { 1902 if (user_lock == NULL) { 1903 KMP_FATAL(LockIsUninitialized, func); 1904 } 1905 } 1906 1907 KMP_CHECK_USER_LOCK_INIT(); 1908 1909 if ((__kmp_user_lock_kind == lk_tas) && 1910 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 1911 OMP_NEST_LOCK_T_SIZE)) { 1912 lck = (kmp_user_lock_p)user_lock; 1913 } 1914 #if KMP_USE_FUTEX 1915 else if ((__kmp_user_lock_kind == lk_futex) && 1916 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 1917 OMP_NEST_LOCK_T_SIZE)) { 1918 lck = (kmp_user_lock_p)user_lock; 1919 } 1920 #endif 1921 else { 1922 lck = __kmp_user_lock_allocate(user_lock, gtid, 0); 1923 } 1924 1925 INIT_NESTED_LOCK(lck); 1926 __kmp_set_user_lock_location(lck, loc); 1927 1928 #if OMPT_SUPPORT && OMPT_TRACE 1929 if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_init_nest_lock)) { 1930 ompt_callbacks.ompt_callback(ompt_event_init_nest_lock)((uint64_t)lck); 1931 } 1932 #endif 1933 1934 #if USE_ITT_BUILD 1935 __kmp_itt_lock_creating(lck); 1936 #endif /* USE_ITT_BUILD */ 1937 1938 #endif // KMP_USE_DYNAMIC_LOCK 1939 } // __kmpc_init_nest_lock 1940 1941 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 1942 #if KMP_USE_DYNAMIC_LOCK 1943 1944 #if USE_ITT_BUILD 1945 kmp_user_lock_p lck; 1946 if 
(KMP_EXTRACT_D_TAG(user_lock) == 0) { 1947 lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock; 1948 } else { 1949 lck = (kmp_user_lock_p)user_lock; 1950 } 1951 __kmp_itt_lock_destroyed(lck); 1952 #endif 1953 KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock); 1954 #else 1955 kmp_user_lock_p lck; 1956 1957 if ((__kmp_user_lock_kind == lk_tas) && 1958 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 1959 lck = (kmp_user_lock_p)user_lock; 1960 } 1961 #if KMP_USE_FUTEX 1962 else if ((__kmp_user_lock_kind == lk_futex) && 1963 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 1964 lck = (kmp_user_lock_p)user_lock; 1965 } 1966 #endif 1967 else { 1968 lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock"); 1969 } 1970 1971 #if OMPT_SUPPORT && OMPT_TRACE 1972 if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_destroy_lock)) { 1973 ompt_callbacks.ompt_callback(ompt_event_destroy_lock)((uint64_t)lck); 1974 } 1975 #endif 1976 1977 #if USE_ITT_BUILD 1978 __kmp_itt_lock_destroyed(lck); 1979 #endif /* USE_ITT_BUILD */ 1980 DESTROY_LOCK(lck); 1981 1982 if ((__kmp_user_lock_kind == lk_tas) && 1983 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 1984 ; 1985 } 1986 #if KMP_USE_FUTEX 1987 else if ((__kmp_user_lock_kind == lk_futex) && 1988 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 1989 ; 1990 } 1991 #endif 1992 else { 1993 __kmp_user_lock_free(user_lock, gtid, lck); 1994 } 1995 #endif // KMP_USE_DYNAMIC_LOCK 1996 } // __kmpc_destroy_lock 1997 1998 /* destroy the lock */ 1999 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2000 #if KMP_USE_DYNAMIC_LOCK 2001 2002 #if USE_ITT_BUILD 2003 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock); 2004 __kmp_itt_lock_destroyed(ilk->lock); 2005 #endif 2006 KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock); 2007 2008 #else // KMP_USE_DYNAMIC_LOCK 2009 2010 kmp_user_lock_p lck; 2011 2012 if ((__kmp_user_lock_kind == lk_tas) && 2013 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2014 OMP_NEST_LOCK_T_SIZE)) { 2015 lck = (kmp_user_lock_p)user_lock; 2016 } 2017 #if KMP_USE_FUTEX 2018 else if ((__kmp_user_lock_kind == lk_futex) && 2019 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2020 OMP_NEST_LOCK_T_SIZE)) { 2021 lck = (kmp_user_lock_p)user_lock; 2022 } 2023 #endif 2024 else { 2025 lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock"); 2026 } 2027 2028 #if OMPT_SUPPORT && OMPT_TRACE 2029 if (ompt_enabled && 2030 ompt_callbacks.ompt_callback(ompt_event_destroy_nest_lock)) { 2031 ompt_callbacks.ompt_callback(ompt_event_destroy_nest_lock)((uint64_t)lck); 2032 } 2033 #endif 2034 2035 #if USE_ITT_BUILD 2036 __kmp_itt_lock_destroyed(lck); 2037 #endif /* USE_ITT_BUILD */ 2038 2039 DESTROY_NESTED_LOCK(lck); 2040 2041 if ((__kmp_user_lock_kind == lk_tas) && 2042 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2043 OMP_NEST_LOCK_T_SIZE)) { 2044 ; 2045 } 2046 #if KMP_USE_FUTEX 2047 else if ((__kmp_user_lock_kind == lk_futex) && 2048 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2049 OMP_NEST_LOCK_T_SIZE)) { 2050 ; 2051 } 2052 #endif 2053 else { 2054 __kmp_user_lock_free(user_lock, gtid, lck); 2055 } 2056 #endif // KMP_USE_DYNAMIC_LOCK 2057 } // __kmpc_destroy_nest_lock 2058 2059 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2060 KMP_COUNT_BLOCK(OMP_set_lock); 2061 #if KMP_USE_DYNAMIC_LOCK 2062 int tag = KMP_EXTRACT_D_TAG(user_lock); 2063 #if USE_ITT_BUILD 2064 
__kmp_itt_lock_acquiring( 2065 (kmp_user_lock_p) 2066 user_lock); // itt function will get to the right lock object. 2067 #endif 2068 #if KMP_USE_INLINED_TAS 2069 if (tag == locktag_tas && !__kmp_env_consistency_check) { 2070 KMP_ACQUIRE_TAS_LOCK(user_lock, gtid); 2071 } else 2072 #elif KMP_USE_INLINED_FUTEX 2073 if (tag == locktag_futex && !__kmp_env_consistency_check) { 2074 KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid); 2075 } else 2076 #endif 2077 { 2078 __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid); 2079 } 2080 #if USE_ITT_BUILD 2081 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 2082 #endif 2083 2084 #else // KMP_USE_DYNAMIC_LOCK 2085 2086 kmp_user_lock_p lck; 2087 2088 if ((__kmp_user_lock_kind == lk_tas) && 2089 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2090 lck = (kmp_user_lock_p)user_lock; 2091 } 2092 #if KMP_USE_FUTEX 2093 else if ((__kmp_user_lock_kind == lk_futex) && 2094 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2095 lck = (kmp_user_lock_p)user_lock; 2096 } 2097 #endif 2098 else { 2099 lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock"); 2100 } 2101 2102 #if USE_ITT_BUILD 2103 __kmp_itt_lock_acquiring(lck); 2104 #endif /* USE_ITT_BUILD */ 2105 2106 ACQUIRE_LOCK(lck, gtid); 2107 2108 #if USE_ITT_BUILD 2109 __kmp_itt_lock_acquired(lck); 2110 #endif /* USE_ITT_BUILD */ 2111 2112 #if OMPT_SUPPORT && OMPT_TRACE 2113 if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_acquired_lock)) { 2114 ompt_callbacks.ompt_callback(ompt_event_acquired_lock)((uint64_t)lck); 2115 } 2116 #endif 2117 2118 #endif // KMP_USE_DYNAMIC_LOCK 2119 } 2120 2121 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2122 #if KMP_USE_DYNAMIC_LOCK 2123 2124 #if USE_ITT_BUILD 2125 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); 2126 #endif 2127 KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid); 2128 #if USE_ITT_BUILD 2129 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 2130 #endif 2131 2132 #if OMPT_SUPPORT && OMPT_TRACE 2133 if (ompt_enabled) { 2134 // missing support here: need to know whether acquired first or not 2135 } 2136 #endif 2137 2138 #else // KMP_USE_DYNAMIC_LOCK 2139 int acquire_status; 2140 kmp_user_lock_p lck; 2141 2142 if ((__kmp_user_lock_kind == lk_tas) && 2143 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2144 OMP_NEST_LOCK_T_SIZE)) { 2145 lck = (kmp_user_lock_p)user_lock; 2146 } 2147 #if KMP_USE_FUTEX 2148 else if ((__kmp_user_lock_kind == lk_futex) && 2149 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2150 OMP_NEST_LOCK_T_SIZE)) { 2151 lck = (kmp_user_lock_p)user_lock; 2152 } 2153 #endif 2154 else { 2155 lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock"); 2156 } 2157 2158 #if USE_ITT_BUILD 2159 __kmp_itt_lock_acquiring(lck); 2160 #endif /* USE_ITT_BUILD */ 2161 2162 ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status); 2163 2164 #if USE_ITT_BUILD 2165 __kmp_itt_lock_acquired(lck); 2166 #endif /* USE_ITT_BUILD */ 2167 2168 #if OMPT_SUPPORT && OMPT_TRACE 2169 if (ompt_enabled) { 2170 if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) { 2171 if (ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_first)) 2172 ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_first)( 2173 (uint64_t)lck); 2174 } else { 2175 if (ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_next)) 2176 ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_next)( 2177 (uint64_t)lck); 2178 } 2179 } 2180 #endif 2181 2182 #endif // KMP_USE_DYNAMIC_LOCK 2183 } 2184 2185 
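/* Illustrative note (not part of the original sources): a hedged sketch of
   the call sequence a code generator might emit around the lock entry points
   in this section. The names loc, gtid and lck_storage are placeholders:
   loc/gtid are the usual source location and global thread number, and
   lck_storage stands for the user's omp_lock_t object.

   @code
   void *lck_storage = NULL;                      // storage for the lock
   __kmpc_init_lock(loc, gtid, &lck_storage);     // once, before first use
   __kmpc_set_lock(loc, gtid, &lck_storage);      // acquire
   // ... code that must run while holding the lock ...
   __kmpc_unset_lock(loc, gtid, &lck_storage);    // release
   __kmpc_destroy_lock(loc, gtid, &lck_storage);  // once, after last use
   @endcode

   Whether the lock state lives inline in that storage or in a separately
   allocated object found via the lock table depends on the lock kind and the
   size checks in the routines in this section. */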
void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2186 #if KMP_USE_DYNAMIC_LOCK 2187 2188 int tag = KMP_EXTRACT_D_TAG(user_lock); 2189 #if USE_ITT_BUILD 2190 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2191 #endif 2192 #if KMP_USE_INLINED_TAS 2193 if (tag == locktag_tas && !__kmp_env_consistency_check) { 2194 KMP_RELEASE_TAS_LOCK(user_lock, gtid); 2195 } else 2196 #elif KMP_USE_INLINED_FUTEX 2197 if (tag == locktag_futex && !__kmp_env_consistency_check) { 2198 KMP_RELEASE_FUTEX_LOCK(user_lock, gtid); 2199 } else 2200 #endif 2201 { 2202 __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid); 2203 } 2204 2205 #else // KMP_USE_DYNAMIC_LOCK 2206 2207 kmp_user_lock_p lck; 2208 2209 /* Can't use serial interval since not block structured */ 2210 /* release the lock */ 2211 2212 if ((__kmp_user_lock_kind == lk_tas) && 2213 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2214 #if KMP_OS_LINUX && \ 2215 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 2216 // "fast" path implemented to fix customer performance issue 2217 #if USE_ITT_BUILD 2218 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2219 #endif /* USE_ITT_BUILD */ 2220 TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0); 2221 KMP_MB(); 2222 return; 2223 #else 2224 lck = (kmp_user_lock_p)user_lock; 2225 #endif 2226 } 2227 #if KMP_USE_FUTEX 2228 else if ((__kmp_user_lock_kind == lk_futex) && 2229 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2230 lck = (kmp_user_lock_p)user_lock; 2231 } 2232 #endif 2233 else { 2234 lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock"); 2235 } 2236 2237 #if USE_ITT_BUILD 2238 __kmp_itt_lock_releasing(lck); 2239 #endif /* USE_ITT_BUILD */ 2240 2241 RELEASE_LOCK(lck, gtid); 2242 2243 #if OMPT_SUPPORT && OMPT_BLAME 2244 if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_release_lock)) { 2245 ompt_callbacks.ompt_callback(ompt_event_release_lock)((uint64_t)lck); 2246 } 2247 #endif 2248 2249 #endif // KMP_USE_DYNAMIC_LOCK 2250 } 2251 2252 /* release the lock */ 2253 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2254 #if KMP_USE_DYNAMIC_LOCK 2255 2256 #if USE_ITT_BUILD 2257 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2258 #endif 2259 KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid); 2260 2261 #else // KMP_USE_DYNAMIC_LOCK 2262 2263 kmp_user_lock_p lck; 2264 2265 /* Can't use serial interval since not block structured */ 2266 2267 if ((__kmp_user_lock_kind == lk_tas) && 2268 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2269 OMP_NEST_LOCK_T_SIZE)) { 2270 #if KMP_OS_LINUX && \ 2271 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 2272 // "fast" path implemented to fix customer performance issue 2273 kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock; 2274 #if USE_ITT_BUILD 2275 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2276 #endif /* USE_ITT_BUILD */ 2277 if (--(tl->lk.depth_locked) == 0) { 2278 TCW_4(tl->lk.poll, 0); 2279 } 2280 KMP_MB(); 2281 return; 2282 #else 2283 lck = (kmp_user_lock_p)user_lock; 2284 #endif 2285 } 2286 #if KMP_USE_FUTEX 2287 else if ((__kmp_user_lock_kind == lk_futex) && 2288 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2289 OMP_NEST_LOCK_T_SIZE)) { 2290 lck = (kmp_user_lock_p)user_lock; 2291 } 2292 #endif 2293 else { 2294 lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock"); 2295 } 2296 2297 #if USE_ITT_BUILD 2298 __kmp_itt_lock_releasing(lck); 2299 #endif /* 
USE_ITT_BUILD */ 2300 2301 int release_status; 2302 release_status = RELEASE_NESTED_LOCK(lck, gtid); 2303 #if OMPT_SUPPORT && OMPT_BLAME 2304 if (ompt_enabled) { 2305 if (release_status == KMP_LOCK_RELEASED) { 2306 if (ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_last)) { 2307 ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_last)( 2308 (uint64_t)lck); 2309 } 2310 } else if (ompt_callbacks.ompt_callback( 2311 ompt_event_release_nest_lock_prev)) { 2312 ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_prev)( 2313 (uint64_t)lck); 2314 } 2315 } 2316 #endif 2317 2318 #endif // KMP_USE_DYNAMIC_LOCK 2319 } 2320 2321 /* try to acquire the lock */ 2322 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2323 KMP_COUNT_BLOCK(OMP_test_lock); 2324 2325 #if KMP_USE_DYNAMIC_LOCK 2326 int rc; 2327 int tag = KMP_EXTRACT_D_TAG(user_lock); 2328 #if USE_ITT_BUILD 2329 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); 2330 #endif 2331 #if KMP_USE_INLINED_TAS 2332 if (tag == locktag_tas && !__kmp_env_consistency_check) { 2333 KMP_TEST_TAS_LOCK(user_lock, gtid, rc); 2334 } else 2335 #elif KMP_USE_INLINED_FUTEX 2336 if (tag == locktag_futex && !__kmp_env_consistency_check) { 2337 KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc); 2338 } else 2339 #endif 2340 { 2341 rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid); 2342 } 2343 if (rc) { 2344 #if USE_ITT_BUILD 2345 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 2346 #endif 2347 return FTN_TRUE; 2348 } else { 2349 #if USE_ITT_BUILD 2350 __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock); 2351 #endif 2352 return FTN_FALSE; 2353 } 2354 2355 #else // KMP_USE_DYNAMIC_LOCK 2356 2357 kmp_user_lock_p lck; 2358 int rc; 2359 2360 if ((__kmp_user_lock_kind == lk_tas) && 2361 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2362 lck = (kmp_user_lock_p)user_lock; 2363 } 2364 #if KMP_USE_FUTEX 2365 else if ((__kmp_user_lock_kind == lk_futex) && 2366 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2367 lck = (kmp_user_lock_p)user_lock; 2368 } 2369 #endif 2370 else { 2371 lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock"); 2372 } 2373 2374 #if USE_ITT_BUILD 2375 __kmp_itt_lock_acquiring(lck); 2376 #endif /* USE_ITT_BUILD */ 2377 2378 rc = TEST_LOCK(lck, gtid); 2379 #if USE_ITT_BUILD 2380 if (rc) { 2381 __kmp_itt_lock_acquired(lck); 2382 } else { 2383 __kmp_itt_lock_cancelled(lck); 2384 } 2385 #endif /* USE_ITT_BUILD */ 2386 return (rc ? 
FTN_TRUE : FTN_FALSE); 2387 2388 /* Can't use serial interval since not block structured */ 2389 2390 #endif // KMP_USE_DYNAMIC_LOCK 2391 } 2392 2393 /* try to acquire the lock */ 2394 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2395 #if KMP_USE_DYNAMIC_LOCK 2396 int rc; 2397 #if USE_ITT_BUILD 2398 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); 2399 #endif 2400 rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid); 2401 #if USE_ITT_BUILD 2402 if (rc) { 2403 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 2404 } else { 2405 __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock); 2406 } 2407 #endif 2408 return rc; 2409 2410 #else // KMP_USE_DYNAMIC_LOCK 2411 2412 kmp_user_lock_p lck; 2413 int rc; 2414 2415 if ((__kmp_user_lock_kind == lk_tas) && 2416 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2417 OMP_NEST_LOCK_T_SIZE)) { 2418 lck = (kmp_user_lock_p)user_lock; 2419 } 2420 #if KMP_USE_FUTEX 2421 else if ((__kmp_user_lock_kind == lk_futex) && 2422 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2423 OMP_NEST_LOCK_T_SIZE)) { 2424 lck = (kmp_user_lock_p)user_lock; 2425 } 2426 #endif 2427 else { 2428 lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock"); 2429 } 2430 2431 #if USE_ITT_BUILD 2432 __kmp_itt_lock_acquiring(lck); 2433 #endif /* USE_ITT_BUILD */ 2434 2435 rc = TEST_NESTED_LOCK(lck, gtid); 2436 #if USE_ITT_BUILD 2437 if (rc) { 2438 __kmp_itt_lock_acquired(lck); 2439 } else { 2440 __kmp_itt_lock_cancelled(lck); 2441 } 2442 #endif /* USE_ITT_BUILD */ 2443 return rc; 2444 2445 /* Can't use serial interval since not block structured */ 2446 2447 #endif // KMP_USE_DYNAMIC_LOCK 2448 } 2449 2450 // Interface to fast scalable reduce methods routines 2451 2452 // keep the selected method in a thread local structure for cross-function 2453 // usage: will be used in __kmpc_end_reduce* functions; 2454 // another solution: to re-determine the method one more time in 2455 // __kmpc_end_reduce* functions (new prototype required then) 2456 // AT: which solution is better? 2457 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod) \ 2458 ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod)) 2459 2460 #define __KMP_GET_REDUCTION_METHOD(gtid) \ 2461 (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) 2462 2463 // description of the packed_reduction_method variable: look at the macros in 2464 // kmp.h 2465 2466 // used in a critical section reduce block 2467 static __forceinline void 2468 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid, 2469 kmp_critical_name *crit) { 2470 2471 // this lock was visible to a customer and to the threading profile tool as a 2472 // serial overhead span (although it's used for an internal purpose only) 2473 // why was it visible in previous implementation? 2474 // should we keep it visible in new reduce block? 2475 kmp_user_lock_p lck; 2476 2477 #if KMP_USE_DYNAMIC_LOCK 2478 2479 kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit; 2480 // Check if it is initialized. 2481 if (*lk == 0) { 2482 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) { 2483 KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0, 2484 KMP_GET_D_TAG(__kmp_user_lock_seq)); 2485 } else { 2486 __kmp_init_indirect_csptr(crit, loc, global_tid, 2487 KMP_GET_I_TAG(__kmp_user_lock_seq)); 2488 } 2489 } 2490 // Branch for accessing the actual lock object and set operation. 
This 2491 // branching is inevitable since this lock initialization does not follow the 2492 // normal dispatch path (lock table is not used). 2493 if (KMP_EXTRACT_D_TAG(lk) != 0) { 2494 lck = (kmp_user_lock_p)lk; 2495 KMP_DEBUG_ASSERT(lck != NULL); 2496 if (__kmp_env_consistency_check) { 2497 __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq); 2498 } 2499 KMP_D_LOCK_FUNC(lk, set)(lk, global_tid); 2500 } else { 2501 kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk); 2502 lck = ilk->lock; 2503 KMP_DEBUG_ASSERT(lck != NULL); 2504 if (__kmp_env_consistency_check) { 2505 __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq); 2506 } 2507 KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid); 2508 } 2509 2510 #else // KMP_USE_DYNAMIC_LOCK 2511 2512 // We know that the fast reduction code is only emitted by Intel compilers 2513 // with 32 byte critical sections. If there isn't enough space, then we 2514 // have to use a pointer. 2515 if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) { 2516 lck = (kmp_user_lock_p)crit; 2517 } else { 2518 lck = __kmp_get_critical_section_ptr(crit, loc, global_tid); 2519 } 2520 KMP_DEBUG_ASSERT(lck != NULL); 2521 2522 if (__kmp_env_consistency_check) 2523 __kmp_push_sync(global_tid, ct_critical, loc, lck); 2524 2525 __kmp_acquire_user_lock_with_checks(lck, global_tid); 2526 2527 #endif // KMP_USE_DYNAMIC_LOCK 2528 } 2529 2530 // used in a critical section reduce block 2531 static __forceinline void 2532 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid, 2533 kmp_critical_name *crit) { 2534 2535 kmp_user_lock_p lck; 2536 2537 #if KMP_USE_DYNAMIC_LOCK 2538 2539 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) { 2540 lck = (kmp_user_lock_p)crit; 2541 if (__kmp_env_consistency_check) 2542 __kmp_pop_sync(global_tid, ct_critical, loc); 2543 KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid); 2544 } else { 2545 kmp_indirect_lock_t *ilk = 2546 (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit)); 2547 if (__kmp_env_consistency_check) 2548 __kmp_pop_sync(global_tid, ct_critical, loc); 2549 KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid); 2550 } 2551 2552 #else // KMP_USE_DYNAMIC_LOCK 2553 2554 // We know that the fast reduction code is only emitted by Intel compilers 2555 // with 32 byte critical sections. If there isn't enough space, then we have 2556 // to use a pointer. 2557 if (__kmp_base_user_lock_size > 32) { 2558 lck = *((kmp_user_lock_p *)crit); 2559 KMP_ASSERT(lck != NULL); 2560 } else { 2561 lck = (kmp_user_lock_p)crit; 2562 } 2563 2564 if (__kmp_env_consistency_check) 2565 __kmp_pop_sync(global_tid, ct_critical, loc); 2566 2567 __kmp_release_user_lock_with_checks(lck, global_tid); 2568 2569 #endif // KMP_USE_DYNAMIC_LOCK 2570 } // __kmp_end_critical_section_reduce_block 2571 2572 /* 2.a.i. Reduce Block without a terminating barrier */ 2573 /*! 
2574 @ingroup SYNCHRONIZATION 2575 @param loc source location information 2576 @param global_tid global thread number 2577 @param num_vars number of items (variables) to be reduced 2578 @param reduce_size size of data in bytes to be reduced 2579 @param reduce_data pointer to data to be reduced 2580 @param reduce_func callback function providing reduction operation on two 2581 operands and returning result of reduction in lhs_data 2582 @param lck pointer to the unique lock data structure 2583 @result 1 for the master thread, 0 for all other team threads, 2 for all team 2584 threads if atomic reduction needed 2585 2586 The nowait version is used for a reduce clause with the nowait argument. 2587 */ 2588 kmp_int32 2589 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, 2590 size_t reduce_size, void *reduce_data, 2591 void (*reduce_func)(void *lhs_data, void *rhs_data), 2592 kmp_critical_name *lck) { 2593 2594 KMP_COUNT_BLOCK(REDUCE_nowait); 2595 int retval = 0; 2596 PACKED_REDUCTION_METHOD_T packed_reduction_method; 2597 #if OMP_40_ENABLED 2598 kmp_team_t *team; 2599 kmp_info_t *th; 2600 int teams_swapped = 0, task_state; 2601 #endif 2602 KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid)); 2603 2604 // why do we need this initialization here at all? 2605 // Reduction clause can not be used as a stand-alone directive. 2606 2607 // do not call __kmp_serial_initialize(), it will be called by 2608 // __kmp_parallel_initialize() if needed 2609 // possible detection of false-positive race by the threadchecker ??? 2610 if (!TCR_4(__kmp_init_parallel)) 2611 __kmp_parallel_initialize(); 2612 2613 // check correctness of reduce block nesting 2614 #if KMP_USE_DYNAMIC_LOCK 2615 if (__kmp_env_consistency_check) 2616 __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0); 2617 #else 2618 if (__kmp_env_consistency_check) 2619 __kmp_push_sync(global_tid, ct_reduce, loc, NULL); 2620 #endif 2621 2622 #if OMP_40_ENABLED 2623 th = __kmp_thread_from_gtid(global_tid); 2624 if (th->th.th_teams_microtask) { // AC: check if we are inside the teams 2625 // construct? 2626 team = th->th.th_team; 2627 if (team->t.t_level == th->th.th_teams_level) { 2628 // this is reduction at teams construct 2629 KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0 2630 // Let's swap teams temporarily for the reduction barrier 2631 teams_swapped = 1; 2632 th->th.th_info.ds.ds_tid = team->t.t_master_tid; 2633 th->th.th_team = team->t.t_parent; 2634 th->th.th_team_nproc = th->th.th_team->t.t_nproc; 2635 th->th.th_task_team = th->th.th_team->t.t_task_team[0]; 2636 task_state = th->th.th_task_state; 2637 th->th.th_task_state = 0; 2638 } 2639 } 2640 #endif // OMP_40_ENABLED 2641 2642 // packed_reduction_method value will be reused by __kmp_end_reduce* function, 2643 // the value should be kept in a variable 2644 // the variable should be either a construct-specific or thread-specific 2645 // property, not a team specific property 2646 // (a thread can reach the next reduce block on the next construct, reduce 2647 // method may differ on the next construct) 2648 // an ident_t "loc" parameter could be used as a construct-specific property 2649 // (what if loc == 0?) 
2650 // (if both construct-specific and team-specific variables were shared,
2651 // then unnecessary extra syncs would be needed)
2652 // a thread-specific variable is better regarding two issues above (next
2653 // construct and extra syncs)
2654 // a thread-specific "th_local.reduction_method" variable is used currently
2655 // each thread executes 'determine' and 'set' lines (no need to execute by one
2656 // thread, to avoid unnecessary extra syncs)
2657
2658 packed_reduction_method = __kmp_determine_reduction_method(
2659 loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
2660 __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
2661
2662 if (packed_reduction_method == critical_reduce_block) {
2663
2664 __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
2665 retval = 1;
2666
2667 } else if (packed_reduction_method == empty_reduce_block) {
2668
2669 // usage: if team size == 1, no synchronization is required ( Intel
2670 // platforms only )
2671 retval = 1;
2672
2673 } else if (packed_reduction_method == atomic_reduce_block) {
2674
2675 retval = 2;
2676
2677 // all threads should do this pop here (because __kmpc_end_reduce_nowait()
2678 // won't be called by the code gen)
2679 // (this is not ideal, because the checking block has been closed by
2680 // this 'pop',
2681 // but the atomic operation has not been executed yet; it will be executed
2682 // slightly later, literally on the next instruction)
2683 if (__kmp_env_consistency_check)
2684 __kmp_pop_sync(global_tid, ct_reduce, loc);
2685
2686 } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
2687 tree_reduce_block)) {
2688
2689 // AT: performance issue: a real barrier here
2690 // AT: (if master goes slow, other threads are blocked here waiting for the
2691 // master to come and release them)
2692 // AT: (it's not what a customer might expect specifying NOWAIT clause)
2693 // AT: (specifying NOWAIT won't result in improvement of performance, it'll
2694 // be confusing to a customer)
2695 // AT: another implementation of *barrier_gather*nowait() (or some other design)
2696 // might go faster and be more in line with the sense of NOWAIT
2697 // AT: TO DO: do epcc test and compare times
2698
2699 // this barrier should be invisible to a customer and to the threading profile
2700 // tool (it's neither a terminating barrier nor customer's code, it's
2701 // used for an internal purpose)
2702 #if USE_ITT_NOTIFY
2703 __kmp_threads[global_tid]->th.th_ident = loc;
2704 #endif
2705 retval =
2706 __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
2707 global_tid, FALSE, reduce_size, reduce_data, reduce_func);
2708 retval = (retval != 0) ?
(0) : (1);
2709
2710 // all other workers except master should do this pop here
2711 // (none of the other workers will get to __kmpc_end_reduce_nowait())
2712 if (__kmp_env_consistency_check) {
2713 if (retval == 0) {
2714 __kmp_pop_sync(global_tid, ct_reduce, loc);
2715 }
2716 }
2717
2718 } else {
2719
2720 // should never reach this block
2721 KMP_ASSERT(0); // "unexpected method"
2722 }
2723 #if OMP_40_ENABLED
2724 if (teams_swapped) {
2725 // Restore thread structure
2726 th->th.th_info.ds.ds_tid = 0;
2727 th->th.th_team = team;
2728 th->th.th_team_nproc = team->t.t_nproc;
2729 th->th.th_task_team = team->t.t_task_team[task_state];
2730 th->th.th_task_state = task_state;
2731 }
2732 #endif
2733 KA_TRACE(
2734 10,
2735 ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
2736 global_tid, packed_reduction_method, retval));
2737
2738 return retval;
2739 }
2740
2741 /*!
2742 @ingroup SYNCHRONIZATION
2743 @param loc source location information
2744 @param global_tid global thread id.
2745 @param lck pointer to the unique lock data structure
2746
2747 Finish the execution of a reduce nowait.
2748 */
2749 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
2750 kmp_critical_name *lck) {
2751
2752 PACKED_REDUCTION_METHOD_T packed_reduction_method;
2753
2754 KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
2755
2756 packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
2757
2758 if (packed_reduction_method == critical_reduce_block) {
2759
2760 __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
2761
2762 } else if (packed_reduction_method == empty_reduce_block) {
2763
2764 // usage: if team size == 1, no synchronization is required ( on Intel
2765 // platforms only )
2766
2767 } else if (packed_reduction_method == atomic_reduce_block) {
2768
2769 // neither master nor other workers should get here
2770 // (code gen does not generate this call in case 2: atomic reduce block)
2771 // actually it would be better to remove this else-if altogether;
2772 // after removal this value will be checked by the 'else' and will assert
2773
2774 } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
2775 tree_reduce_block)) {
2776
2777 // only master gets here
2778
2779 } else {
2780
2781 // should never reach this block
2782 KMP_ASSERT(0); // "unexpected method"
2783 }
2784
2785 if (__kmp_env_consistency_check)
2786 __kmp_pop_sync(global_tid, ct_reduce, loc);
2787
2788 KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
2789 global_tid, packed_reduction_method));
2790
2791 return;
2792 }
2793
2794 /* 2.a.ii. Reduce Block with a terminating barrier */
2795
2796 /*!
2797 @ingroup SYNCHRONIZATION
2798 @param loc source location information
2799 @param global_tid global thread number
2800 @param num_vars number of items (variables) to be reduced
2801 @param reduce_size size of data in bytes to be reduced
2802 @param reduce_data pointer to data to be reduced
2803 @param reduce_func callback function providing reduction operation on two
2804 operands and returning result of reduction in lhs_data
2805 @param lck pointer to the unique lock data structure
2806 @result 1 for the master thread, 0 for all other team threads, 2 for all team
2807 threads if atomic reduction needed
2808
2809 A blocking reduce that includes an implicit barrier.
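As an illustration only (not taken from any particular compiler), code
generation around this call might use the documented return values roughly as
follows, where <tt>crit</tt> stands for the <tt>kmp_critical_name</tt> object
passed as <tt>lck</tt> and the elided bodies stand for compiler-generated
reduction code:

@code
switch (__kmpc_reduce(loc, gtid, num_vars, reduce_size, reduce_data,
                      reduce_func, &crit)) {
case 1: // master thread (or every thread when the critical-section
        // method was chosen): combine the partial results, then finish
  // ... reduction code ...
  __kmpc_end_reduce(loc, gtid, &crit);
  break;
case 2: // atomic method: every thread updates the original variables
        // with atomic operations, then finishes
  // ... atomic reduction code ...
  __kmpc_end_reduce(loc, gtid, &crit);
  break;
default: // 0: remaining worker threads when a tree reduction was chosen
  break;
}
@endcode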
2810 */ 2811 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, 2812 size_t reduce_size, void *reduce_data, 2813 void (*reduce_func)(void *lhs_data, void *rhs_data), 2814 kmp_critical_name *lck) { 2815 KMP_COUNT_BLOCK(REDUCE_wait); 2816 int retval = 0; 2817 PACKED_REDUCTION_METHOD_T packed_reduction_method; 2818 2819 KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid)); 2820 2821 // why do we need this initialization here at all? 2822 // Reduction clause can not be a stand-alone directive. 2823 2824 // do not call __kmp_serial_initialize(), it will be called by 2825 // __kmp_parallel_initialize() if needed 2826 // possible detection of false-positive race by the threadchecker ??? 2827 if (!TCR_4(__kmp_init_parallel)) 2828 __kmp_parallel_initialize(); 2829 2830 // check correctness of reduce block nesting 2831 #if KMP_USE_DYNAMIC_LOCK 2832 if (__kmp_env_consistency_check) 2833 __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0); 2834 #else 2835 if (__kmp_env_consistency_check) 2836 __kmp_push_sync(global_tid, ct_reduce, loc, NULL); 2837 #endif 2838 2839 packed_reduction_method = __kmp_determine_reduction_method( 2840 loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck); 2841 __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method); 2842 2843 if (packed_reduction_method == critical_reduce_block) { 2844 2845 __kmp_enter_critical_section_reduce_block(loc, global_tid, lck); 2846 retval = 1; 2847 2848 } else if (packed_reduction_method == empty_reduce_block) { 2849 2850 // usage: if team size == 1, no synchronization is required ( Intel 2851 // platforms only ) 2852 retval = 1; 2853 2854 } else if (packed_reduction_method == atomic_reduce_block) { 2855 2856 retval = 2; 2857 2858 } else if (TEST_REDUCTION_METHOD(packed_reduction_method, 2859 tree_reduce_block)) { 2860 2861 // case tree_reduce_block: 2862 // this barrier should be visible to a customer and to the threading profile 2863 // tool (it's a terminating barrier on constructs if NOWAIT not specified) 2864 #if USE_ITT_NOTIFY 2865 __kmp_threads[global_tid]->th.th_ident = 2866 loc; // needed for correct notification of frames 2867 #endif 2868 retval = 2869 __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method), 2870 global_tid, TRUE, reduce_size, reduce_data, reduce_func); 2871 retval = (retval != 0) ? (0) : (1); 2872 2873 // all other workers except master should do this pop here 2874 // ( none of other workers except master will enter __kmpc_end_reduce() ) 2875 if (__kmp_env_consistency_check) { 2876 if (retval == 0) { // 0: all other workers; 1: master 2877 __kmp_pop_sync(global_tid, ct_reduce, loc); 2878 } 2879 } 2880 2881 } else { 2882 2883 // should never reach this block 2884 KMP_ASSERT(0); // "unexpected method" 2885 } 2886 2887 KA_TRACE(10, 2888 ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n", 2889 global_tid, packed_reduction_method, retval)); 2890 2891 return retval; 2892 } 2893 2894 /*! 2895 @ingroup SYNCHRONIZATION 2896 @param loc source location information 2897 @param global_tid global thread id. 2898 @param lck pointer to the unique lock data structure 2899 2900 Finish the execution of a blocking reduce. 2901 The <tt>lck</tt> pointer must be the same as that used in the corresponding 2902 start function. 
2903 */ 2904 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid, 2905 kmp_critical_name *lck) { 2906 2907 PACKED_REDUCTION_METHOD_T packed_reduction_method; 2908 2909 KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid)); 2910 2911 packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid); 2912 2913 // this barrier should be visible to a customer and to the threading profile 2914 // tool (it's a terminating barrier on constructs if NOWAIT not specified) 2915 2916 if (packed_reduction_method == critical_reduce_block) { 2917 2918 __kmp_end_critical_section_reduce_block(loc, global_tid, lck); 2919 2920 // TODO: implicit barrier: should be exposed 2921 #if USE_ITT_NOTIFY 2922 __kmp_threads[global_tid]->th.th_ident = loc; 2923 #endif 2924 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL); 2925 2926 } else if (packed_reduction_method == empty_reduce_block) { 2927 2928 // usage: if team size==1, no synchronization is required (Intel platforms only) 2929 2930 // TODO: implicit barrier: should be exposed 2931 #if USE_ITT_NOTIFY 2932 __kmp_threads[global_tid]->th.th_ident = loc; 2933 #endif 2934 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL); 2935 2936 } else if (packed_reduction_method == atomic_reduce_block) { 2937 2938 // TODO: implicit barrier: should be exposed 2939 #if USE_ITT_NOTIFY 2940 __kmp_threads[global_tid]->th.th_ident = loc; 2941 #endif 2942 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL); 2943 2944 } else if (TEST_REDUCTION_METHOD(packed_reduction_method, 2945 tree_reduce_block)) { 2946 2947 // only master executes here (master releases all other workers) 2948 __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method), 2949 global_tid); 2950 2951 } else { 2952 2953 // should never reach this block 2954 KMP_ASSERT(0); // "unexpected method" 2955 } 2956 2957 if (__kmp_env_consistency_check) 2958 __kmp_pop_sync(global_tid, ct_reduce, loc); 2959 2960 KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n", 2961 global_tid, packed_reduction_method)); 2962 2963 return; 2964 } 2965 2966 #undef __KMP_GET_REDUCTION_METHOD 2967 #undef __KMP_SET_REDUCTION_METHOD 2968 2969 /* end of interface to fast scalable reduce routines */ 2970 2971 kmp_uint64 __kmpc_get_taskid() { 2972 2973 kmp_int32 gtid; 2974 kmp_info_t *thread; 2975 2976 gtid = __kmp_get_gtid(); 2977 if (gtid < 0) { 2978 return 0; 2979 }; // if 2980 thread = __kmp_thread_from_gtid(gtid); 2981 return thread->th.th_current_task->td_task_id; 2982 2983 } // __kmpc_get_taskid 2984 2985 kmp_uint64 __kmpc_get_parent_taskid() { 2986 2987 kmp_int32 gtid; 2988 kmp_info_t *thread; 2989 kmp_taskdata_t *parent_task; 2990 2991 gtid = __kmp_get_gtid(); 2992 if (gtid < 0) { 2993 return 0; 2994 }; // if 2995 thread = __kmp_thread_from_gtid(gtid); 2996 parent_task = thread->th.th_current_task->td_parent; 2997 return (parent_task == NULL ? 0 : parent_task->td_task_id); 2998 2999 } // __kmpc_get_parent_taskid 3000 3001 #if OMP_45_ENABLED 3002 /*! 3003 @ingroup WORK_SHARING 3004 @param loc source location information. 3005 @param gtid global thread number. 3006 @param num_dims number of associated doacross loops. 3007 @param dims info on loops bounds. 3008 3009 Initialize doacross loop information. 3010 Expect compiler send us inclusive bounds, 3011 e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2. 
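As an illustration only (an assumed lowering, not code emitted by this file),
the per-thread call sequence for a one-dimensional doacross loop like the
example above, with a cross-iteration dependence on i-2, might look like:

@code
struct kmp_dim dims[1];
dims[0].lo = 2; dims[0].up = 8; dims[0].st = 2;  // for(i=2;i<9;i+=2)
__kmpc_doacross_init(loc, gtid, 1, dims);
// for each iteration i assigned to this thread:
long long vec[1];
vec[0] = i - 2;                        // e.g. depend(sink: i-2)
__kmpc_doacross_wait(loc, gtid, vec);
// ... ordered part of the loop body ...
vec[0] = i;                            // depend(source)
__kmpc_doacross_post(loc, gtid, vec);
// after the loop:
__kmpc_doacross_fini(loc, gtid);
@endcode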
3012 */ 3013 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims, 3014 struct kmp_dim *dims) { 3015 int j, idx; 3016 kmp_int64 last, trace_count; 3017 kmp_info_t *th = __kmp_threads[gtid]; 3018 kmp_team_t *team = th->th.th_team; 3019 kmp_uint32 *flags; 3020 kmp_disp_t *pr_buf = th->th.th_dispatch; 3021 dispatch_shared_info_t *sh_buf; 3022 3023 KA_TRACE( 3024 20, 3025 ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n", 3026 gtid, num_dims, !team->t.t_serialized)); 3027 KMP_DEBUG_ASSERT(dims != NULL); 3028 KMP_DEBUG_ASSERT(num_dims > 0); 3029 3030 if (team->t.t_serialized) { 3031 KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n")); 3032 return; // no dependencies if team is serialized 3033 } 3034 KMP_DEBUG_ASSERT(team->t.t_nproc > 1); 3035 idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for 3036 // the next loop 3037 sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers]; 3038 3039 // Save bounds info into allocated private buffer 3040 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL); 3041 pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc( 3042 th, sizeof(kmp_int64) * (4 * num_dims + 1)); 3043 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL); 3044 pr_buf->th_doacross_info[0] = 3045 (kmp_int64)num_dims; // first element is number of dimensions 3046 // Save also address of num_done in order to access it later without knowing 3047 // the buffer index 3048 pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done; 3049 pr_buf->th_doacross_info[2] = dims[0].lo; 3050 pr_buf->th_doacross_info[3] = dims[0].up; 3051 pr_buf->th_doacross_info[4] = dims[0].st; 3052 last = 5; 3053 for (j = 1; j < num_dims; ++j) { 3054 kmp_int64 3055 range_length; // To keep ranges of all dimensions but the first dims[0] 3056 if (dims[j].st == 1) { // most common case 3057 // AC: should we care of ranges bigger than LLONG_MAX? (not for now) 3058 range_length = dims[j].up - dims[j].lo + 1; 3059 } else { 3060 if (dims[j].st > 0) { 3061 KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo); 3062 range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1; 3063 } else { // negative increment 3064 KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up); 3065 range_length = 3066 (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1; 3067 } 3068 } 3069 pr_buf->th_doacross_info[last++] = range_length; 3070 pr_buf->th_doacross_info[last++] = dims[j].lo; 3071 pr_buf->th_doacross_info[last++] = dims[j].up; 3072 pr_buf->th_doacross_info[last++] = dims[j].st; 3073 } 3074 3075 // Compute total trip count. 3076 // Start with range of dims[0] which we don't need to keep in the buffer. 
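  // Worked example (values assumed, matching the loop shown in the function
  // header comment): for(i=2;i<9;i+=2) gives lo=2, up=8, st=2, so the first
  // dimension contributes (up - lo) / st + 1 = (8 - 2) / 2 + 1 = 4 iterations;
  // the remaining dimensions then multiply in via the range lengths saved
  // above.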
3077 if (dims[0].st == 1) { // most common case 3078 trace_count = dims[0].up - dims[0].lo + 1; 3079 } else if (dims[0].st > 0) { 3080 KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo); 3081 trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1; 3082 } else { // negative increment 3083 KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up); 3084 trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1; 3085 } 3086 for (j = 1; j < num_dims; ++j) { 3087 trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges 3088 } 3089 KMP_DEBUG_ASSERT(trace_count > 0); 3090 3091 // Check if shared buffer is not occupied by other loop (idx - 3092 // __kmp_dispatch_num_buffers) 3093 if (idx != sh_buf->doacross_buf_idx) { 3094 // Shared buffer is occupied, wait for it to be free 3095 __kmp_wait_yield_4((kmp_uint32 *)&sh_buf->doacross_buf_idx, idx, __kmp_eq_4, 3096 NULL); 3097 } 3098 // Check if we are the first thread. After the CAS the first thread gets 0, 3099 // others get 1 if initialization is in progress, allocated pointer otherwise. 3100 flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64( 3101 (kmp_int64 *)&sh_buf->doacross_flags, NULL, (kmp_int64)1); 3102 if (flags == NULL) { 3103 // we are the first thread, allocate the array of flags 3104 kmp_int64 size = 3105 trace_count / 8 + 8; // in bytes, use single bit per iteration 3106 sh_buf->doacross_flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1); 3107 } else if ((kmp_int64)flags == 1) { 3108 // initialization is still in progress, need to wait 3109 while ((volatile kmp_int64)sh_buf->doacross_flags == 1) { 3110 KMP_YIELD(TRUE); 3111 } 3112 } 3113 KMP_DEBUG_ASSERT((kmp_int64)sh_buf->doacross_flags > 3114 1); // check value of pointer 3115 pr_buf->th_doacross_flags = 3116 sh_buf->doacross_flags; // save private copy in order to not 3117 // touch shared buffer on each iteration 3118 KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid)); 3119 } 3120 3121 void __kmpc_doacross_wait(ident_t *loc, int gtid, long long *vec) { 3122 kmp_int32 shft, num_dims, i; 3123 kmp_uint32 flag; 3124 kmp_int64 iter_number; // iteration number of "collapsed" loop nest 3125 kmp_info_t *th = __kmp_threads[gtid]; 3126 kmp_team_t *team = th->th.th_team; 3127 kmp_disp_t *pr_buf; 3128 kmp_int64 lo, up, st; 3129 3130 KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid)); 3131 if (team->t.t_serialized) { 3132 KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n")); 3133 return; // no dependencies if team is serialized 3134 } 3135 3136 // calculate sequential iteration number and check out-of-bounds condition 3137 pr_buf = th->th.th_dispatch; 3138 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL); 3139 num_dims = pr_buf->th_doacross_info[0]; 3140 lo = pr_buf->th_doacross_info[2]; 3141 up = pr_buf->th_doacross_info[3]; 3142 st = pr_buf->th_doacross_info[4]; 3143 if (st == 1) { // most common case 3144 if (vec[0] < lo || vec[0] > up) { 3145 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " 3146 "bounds [%lld,%lld]\n", 3147 gtid, vec[0], lo, up)); 3148 return; 3149 } 3150 iter_number = vec[0] - lo; 3151 } else if (st > 0) { 3152 if (vec[0] < lo || vec[0] > up) { 3153 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " 3154 "bounds [%lld,%lld]\n", 3155 gtid, vec[0], lo, up)); 3156 return; 3157 } 3158 iter_number = (kmp_uint64)(vec[0] - lo) / st; 3159 } else { // negative increment 3160 if (vec[0] > lo || vec[0] < up) { 3161 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " 
3162 "bounds [%lld,%lld]\n", 3163 gtid, vec[0], lo, up)); 3164 return; 3165 } 3166 iter_number = (kmp_uint64)(lo - vec[0]) / (-st); 3167 } 3168 for (i = 1; i < num_dims; ++i) { 3169 kmp_int64 iter, ln; 3170 kmp_int32 j = i * 4; 3171 ln = pr_buf->th_doacross_info[j + 1]; 3172 lo = pr_buf->th_doacross_info[j + 2]; 3173 up = pr_buf->th_doacross_info[j + 3]; 3174 st = pr_buf->th_doacross_info[j + 4]; 3175 if (st == 1) { 3176 if (vec[i] < lo || vec[i] > up) { 3177 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " 3178 "bounds [%lld,%lld]\n", 3179 gtid, vec[i], lo, up)); 3180 return; 3181 } 3182 iter = vec[i] - lo; 3183 } else if (st > 0) { 3184 if (vec[i] < lo || vec[i] > up) { 3185 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " 3186 "bounds [%lld,%lld]\n", 3187 gtid, vec[i], lo, up)); 3188 return; 3189 } 3190 iter = (kmp_uint64)(vec[i] - lo) / st; 3191 } else { // st < 0 3192 if (vec[i] > lo || vec[i] < up) { 3193 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " 3194 "bounds [%lld,%lld]\n", 3195 gtid, vec[i], lo, up)); 3196 return; 3197 } 3198 iter = (kmp_uint64)(lo - vec[i]) / (-st); 3199 } 3200 iter_number = iter + ln * iter_number; 3201 } 3202 shft = iter_number % 32; // use 32-bit granularity 3203 iter_number >>= 5; // divided by 32 3204 flag = 1 << shft; 3205 while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) { 3206 KMP_YIELD(TRUE); 3207 } 3208 KA_TRACE(20, 3209 ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n", 3210 gtid, (iter_number << 5) + shft)); 3211 } 3212 3213 void __kmpc_doacross_post(ident_t *loc, int gtid, long long *vec) { 3214 kmp_int32 shft, num_dims, i; 3215 kmp_uint32 flag; 3216 kmp_int64 iter_number; // iteration number of "collapsed" loop nest 3217 kmp_info_t *th = __kmp_threads[gtid]; 3218 kmp_team_t *team = th->th.th_team; 3219 kmp_disp_t *pr_buf; 3220 kmp_int64 lo, st; 3221 3222 KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid)); 3223 if (team->t.t_serialized) { 3224 KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n")); 3225 return; // no dependencies if team is serialized 3226 } 3227 3228 // calculate sequential iteration number (same as in "wait" but no 3229 // out-of-bounds checks) 3230 pr_buf = th->th.th_dispatch; 3231 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL); 3232 num_dims = pr_buf->th_doacross_info[0]; 3233 lo = pr_buf->th_doacross_info[2]; 3234 st = pr_buf->th_doacross_info[4]; 3235 if (st == 1) { // most common case 3236 iter_number = vec[0] - lo; 3237 } else if (st > 0) { 3238 iter_number = (kmp_uint64)(vec[0] - lo) / st; 3239 } else { // negative increment 3240 iter_number = (kmp_uint64)(lo - vec[0]) / (-st); 3241 } 3242 for (i = 1; i < num_dims; ++i) { 3243 kmp_int64 iter, ln; 3244 kmp_int32 j = i * 4; 3245 ln = pr_buf->th_doacross_info[j + 1]; 3246 lo = pr_buf->th_doacross_info[j + 2]; 3247 st = pr_buf->th_doacross_info[j + 4]; 3248 if (st == 1) { 3249 iter = vec[i] - lo; 3250 } else if (st > 0) { 3251 iter = (kmp_uint64)(vec[i] - lo) / st; 3252 } else { // st < 0 3253 iter = (kmp_uint64)(lo - vec[i]) / (-st); 3254 } 3255 iter_number = iter + ln * iter_number; 3256 } 3257 shft = iter_number % 32; // use 32-bit granularity 3258 iter_number >>= 5; // divided by 32 3259 flag = 1 << shft; 3260 if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) 3261 KMP_TEST_THEN_OR32((kmp_int32 *)&pr_buf->th_doacross_flags[iter_number], 3262 (kmp_int32)flag); 3263 KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid, 
3264 (iter_number << 5) + shft)); 3265 } 3266 3267 void __kmpc_doacross_fini(ident_t *loc, int gtid) { 3268 kmp_int64 num_done; 3269 kmp_info_t *th = __kmp_threads[gtid]; 3270 kmp_team_t *team = th->th.th_team; 3271 kmp_disp_t *pr_buf = th->th.th_dispatch; 3272 3273 KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid)); 3274 if (team->t.t_serialized) { 3275 KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team)); 3276 return; // nothing to do 3277 } 3278 num_done = KMP_TEST_THEN_INC64((kmp_int64 *)pr_buf->th_doacross_info[1]) + 1; 3279 if (num_done == th->th.th_team_nproc) { 3280 // we are the last thread, need to free shared resources 3281 int idx = pr_buf->th_doacross_buf_idx - 1; 3282 dispatch_shared_info_t *sh_buf = 3283 &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers]; 3284 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] == 3285 (kmp_int64)&sh_buf->doacross_num_done); 3286 KMP_DEBUG_ASSERT(num_done == (kmp_int64)sh_buf->doacross_num_done); 3287 KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx); 3288 __kmp_thread_free(th, (void *)sh_buf->doacross_flags); 3289 sh_buf->doacross_flags = NULL; 3290 sh_buf->doacross_num_done = 0; 3291 sh_buf->doacross_buf_idx += 3292 __kmp_dispatch_num_buffers; // free buffer for future re-use 3293 } 3294 // free private resources (need to keep buffer index forever) 3295 __kmp_thread_free(th, (void *)pr_buf->th_doacross_info); 3296 pr_buf->th_doacross_info = NULL; 3297 KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid)); 3298 } 3299 #endif 3300 3301 // end of file // 3302