/*
 * kmp_csupport.cpp -- kfront linkage support for OpenMP.
 */

//===----------------------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//

#include "omp.h" /* extern "C" declarations of user-visible routines */
#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_lock.h"
#include "kmp_stats.h"

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

#define MAX_MESSAGE 512

// flags will be used in future, e.g. to implement openmp_strict library
// restrictions

/*!
 * @ingroup STARTUP_SHUTDOWN
 * @param loc   in   source location information
 * @param flags in   for future use (currently ignored)
 *
 * Initialize the runtime library. This call is optional; if it is not made then
 * it will be implicitly called by attempts to use other library functions.
 */
void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
  // By default __kmpc_begin() is no-op.
  char *env;
  if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
      __kmp_str_match_true(env)) {
    __kmp_middle_initialize();
    KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
  } else if (__kmp_ignore_mppbeg() == FALSE) {
    // By default __kmp_ignore_mppbeg() returns TRUE.
    __kmp_internal_begin();
    KC_TRACE(10, ("__kmpc_begin: called\n"));
  }
}

/*!
 * @ingroup STARTUP_SHUTDOWN
 * @param loc source location information
 *
 * Shutdown the runtime library. This is also optional, and even if called will
 * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
 * zero.
 */
void __kmpc_end(ident_t *loc) {
  // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
  // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
  // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
  // returns FALSE and __kmpc_end() will unregister this root (it can cause
  // library shut down).
  if (__kmp_ignore_mppend() == FALSE) {
    KC_TRACE(10, ("__kmpc_end: called\n"));
    KA_TRACE(30, ("__kmpc_end\n"));

    __kmp_internal_end_thread(-1);
  }
}

/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return The global thread index of the active thread.

This function can be called in any context.

If the runtime has only been entered at the outermost level from a
single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
that which would be returned by omp_get_thread_num() in the outermost
active parallel construct. (Or zero if there is no active parallel
construct, since the master thread is necessarily thread zero).

If multiple non-OpenMP threads all enter an OpenMP construct then this
will be a unique thread identifier among all the threads created by
the OpenMP runtime (but the value cannot be defined in terms of
OpenMP thread ids returned by omp_get_thread_num()).
*/
kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
  kmp_int32 gtid = __kmp_entry_gtid();

  KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));

  return gtid;
}

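/* Illustrative sketch (not part of the library): how this entry point relates
   to the user-level API described above. worker() is a hypothetical helper;
   the loc argument may be NULL since it is not used here.

   #include <omp.h>

   void worker() {
     int gtid = __kmpc_global_thread_num(NULL); // runtime-wide thread id
     int tid = omp_get_thread_num();            // team-local id; 0 outside
                                                // any parallel region
     (void)gtid;
     (void)tid;
   }
*/
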
/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return The number of threads under control of the OpenMP<sup>*</sup> runtime

This function can be called in any context.
It returns the total number of threads under the control of the OpenMP runtime.
That is not a number that can be determined by any OpenMP standard calls, since
the library may be called from more than one non-OpenMP thread, and this
reflects the total over all such calls. Similarly the runtime maintains
underlying threads even when they are not active (since the cost of creating
and destroying OS threads is high), this call counts all such threads even if
they are not waiting for work.
*/
kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
  KC_TRACE(10,
           ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));

  return TCR_4(__kmp_all_nth);
}

/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return The thread number of the calling thread in the innermost active
parallel construct.
*/
kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
  KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
  return __kmp_tid_from_gtid(__kmp_entry_gtid());
}

/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return The number of threads in the innermost active parallel construct.
*/
kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
  KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));

  return __kmp_entry_thread()->th.th_team->t.t_nproc;
}

/*!
 * @ingroup DEPRECATED
 * @param loc location description
 *
 * This function need not be called. It always returns TRUE.
 */
kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
#ifndef KMP_DEBUG

  return TRUE;

#else

  const char *semi2;
  const char *semi3;
  int line_no;

  if (__kmp_par_range == 0) {
    return TRUE;
  }
  semi2 = loc->psource;
  if (semi2 == NULL) {
    return TRUE;
  }
  semi2 = strchr(semi2, ';');
  if (semi2 == NULL) {
    return TRUE;
  }
  semi2 = strchr(semi2 + 1, ';');
  if (semi2 == NULL) {
    return TRUE;
  }
  if (__kmp_par_range_filename[0]) {
    const char *name = semi2 - 1;
    while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
      name--;
    }
    if ((*name == '/') || (*name == ';')) {
      name++;
    }
    if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
      return __kmp_par_range < 0;
    }
  }
  semi3 = strchr(semi2 + 1, ';');
  if (__kmp_par_range_routine[0]) {
    if ((semi3 != NULL) && (semi3 > semi2) &&
        (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
      return __kmp_par_range < 0;
    }
  }
  if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
    if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
      return __kmp_par_range > 0;
    }
    return __kmp_par_range < 0;
  }
  return TRUE;

#endif /* KMP_DEBUG */
}

/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return 1 if this thread is executing inside an active parallel region, zero if
not.
*/
kmp_int32 __kmpc_in_parallel(ident_t *loc) {
  return __kmp_entry_thread()->th.th_root->r.r_active;
}

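/* A sketch of what the parsing in __kmpc_ok_to_fork above assumes about
   ident_t::psource (the field layout is documented with ident_t in kmp.h;
   the concrete values and the variable name loc_example are made up for
   illustration):

     ;file;routine;first_line;last_line;;

   e.g. a compiler might emit

     static ident_t loc_example = {0, KMP_IDENT_KMPC, 0, 0,
                                   ";work.c;compute;42;48;;"};

   so semi2 ends up delimiting the file field, semi3 the routine field, and
   the sscanf above reads the first line number for the __kmp_par_range
   filter.
*/
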
/*!
@ingroup PARALLEL
@param loc source location information
@param global_tid global thread number
@param num_threads number of threads requested for this parallel construct

Set the number of threads to be used by the next fork spawned by this thread.
This call is only required if the parallel construct has a `num_threads` clause.
*/
void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
                             kmp_int32 num_threads) {
  KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
                global_tid, num_threads));

  __kmp_push_num_threads(loc, global_tid, num_threads);
}

void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
  KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));

  /* the num_threads are automatically popped */
}

#if OMP_40_ENABLED

void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
                           kmp_int32 proc_bind) {
  KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
                proc_bind));

  __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
}

#endif /* OMP_40_ENABLED */

/*!
@ingroup PARALLEL
@param loc source location information
@param argc total number of arguments in the ellipsis
@param microtask pointer to callback routine consisting of outlined parallel
construct
@param ... pointers to shared variables that aren't global

Do the actual fork and call the microtask in the relevant number of threads.
*/
void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
  int gtid = __kmp_entry_gtid();

#if (KMP_STATS_ENABLED)
  int inParallel = __kmpc_in_parallel(loc);
  if (inParallel) {
    KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
  } else {
    KMP_COUNT_BLOCK(OMP_PARALLEL);
  }
#endif

  // maybe to save thr_state is enough here
  {
    va_list ap;
    va_start(ap, microtask);

#if OMPT_SUPPORT
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      kmp_info_t *master_th = __kmp_threads[gtid];
      kmp_team_t *parent_team = master_th->th.th_team;
      ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
      if (lwt)
        ompt_frame = &(lwt->ompt_task_info.frame);
      else {
        int tid = __kmp_tid_from_gtid(gtid);
        ompt_frame = &(
            parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
      }
      ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
      OMPT_STORE_RETURN_ADDRESS(gtid);
    }
#endif

#if INCLUDE_SSC_MARKS
    SSC_MARK_FORKING();
#endif
    __kmp_fork_call(loc, gtid, fork_context_intel, argc,
                    VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
                    VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
/* TODO: revert workaround for Intel(R) 64 tracker #96 */
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
                    &ap
#else
                    ap
#endif
                    );
#if INCLUDE_SSC_MARKS
    SSC_MARK_JOINING();
#endif
    __kmp_join_call(loc, gtid
#if OMPT_SUPPORT
                    ,
                    fork_context_intel
#endif
                    );

    va_end(ap);
  }
}

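/* For reference, a sketch of how a compiler typically lowers

     #pragma omp parallel num_threads(4)
     { a[omp_get_thread_num()] = 1; }

   onto the entry points above. The outlined name, the loc object and the
   exact argument packing are assumptions for illustration; the first two
   microtask parameters are the global and bound thread id pointers implied
   by the kmpc_micro type:

     static void outlined_body(kmp_int32 *gtid, kmp_int32 *btid, int *a) {
       a[*btid] = 1; // *btid is the thread number within the team
     }

     void lowered(ident_t *loc, int *a) {
       kmp_int32 gtid = __kmpc_global_thread_num(loc);
       __kmpc_push_num_threads(loc, gtid, 4); // only with num_threads()
       __kmpc_fork_call(loc, 1, (kmpc_micro)outlined_body, a);
     }
*/
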
#if OMP_40_ENABLED
/*!
@ingroup PARALLEL
@param loc source location information
@param global_tid global thread number
@param num_teams number of teams requested for the teams construct
@param num_threads number of threads per team requested for the teams construct

Set the number of teams to be used by the teams construct.
This call is only required if the teams construct has a `num_teams` clause
or a `thread_limit` clause (or both).
*/
void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
                           kmp_int32 num_teams, kmp_int32 num_threads) {
  KA_TRACE(20,
           ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
            global_tid, num_teams, num_threads));

  __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
}

/*!
@ingroup PARALLEL
@param loc source location information
@param argc total number of arguments in the ellipsis
@param microtask pointer to callback routine consisting of outlined teams
construct
@param ... pointers to shared variables that aren't global

Do the actual fork and call the microtask in the relevant number of threads.
*/
void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
                       ...) {
  int gtid = __kmp_entry_gtid();
  kmp_info_t *this_thr = __kmp_threads[gtid];
  va_list ap;
  va_start(ap, microtask);

  KMP_COUNT_BLOCK(OMP_TEAMS);

  // remember teams entry point and nesting level
  this_thr->th.th_teams_microtask = microtask;
  this_thr->th.th_teams_level =
      this_thr->th.th_team->t.t_level; // AC: can be >0 on host

#if OMPT_SUPPORT
  kmp_team_t *parent_team = this_thr->th.th_team;
  int tid = __kmp_tid_from_gtid(gtid);
  if (ompt_enabled.enabled) {
    parent_team->t.t_implicit_task_taskdata[tid]
        .ompt_task_info.frame.enter_frame = OMPT_GET_FRAME_ADDRESS(1);
  }
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif

  // check if __kmpc_push_num_teams called, set default number of teams
  // otherwise
  if (this_thr->th.th_teams_size.nteams == 0) {
    __kmp_push_num_teams(loc, gtid, 0, 0);
  }
  KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);

  __kmp_fork_call(loc, gtid, fork_context_intel, argc,
                  VOLATILE_CAST(microtask_t)
                      __kmp_teams_master, // "wrapped" task
                  VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
                  &ap
#else
                  ap
#endif
                  );
  __kmp_join_call(loc, gtid
#if OMPT_SUPPORT
                  ,
                  fork_context_intel
#endif
                  );

  this_thr->th.th_teams_microtask = NULL;
  this_thr->th.th_teams_level = 0;
  *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
  va_end(ap);
}
#endif /* OMP_40_ENABLED */

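/* For reference, a sketch of the corresponding lowering of

     #pragma omp teams num_teams(2) thread_limit(8)
     { ... }

   onto the entry points above (the outlined name and loc are hypothetical):

     static void outlined_teams_body(kmp_int32 *gtid, kmp_int32 *btid) {
       // body of the teams region
     }

     void lowered_teams(ident_t *loc) {
       kmp_int32 gtid = __kmpc_global_thread_num(loc);
       __kmpc_push_num_teams(loc, gtid, 2, 8); // only if the clauses appear
       __kmpc_fork_teams(loc, 0, (kmpc_micro)outlined_teams_body);
     }
*/
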
// I don't think this function should ever have been exported.
// The __kmpc_ prefix was misapplied. I'm fairly certain that no generated
// openmp code ever called it, but it's been exported from the RTL for so
// long that I'm afraid to remove the definition.
int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }

/*!
@ingroup PARALLEL
@param loc source location information
@param global_tid global thread number

Enter a serialized parallel construct. This interface is used to handle a
conditional parallel region, like this,
@code
#pragma omp parallel if (condition)
@endcode
when the condition is false.
*/
void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
// The implementation is now in kmp_runtime.cpp so that it can share static
// functions with kmp_fork_call since the tasks to be done are similar in
// each case.
#if OMPT_SUPPORT
  OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif
  __kmp_serialized_parallel(loc, global_tid);
}

/*!
@ingroup PARALLEL
@param loc source location information
@param global_tid global thread number

Leave a serialized parallel construct.
*/
void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  kmp_internal_control_t *top;
  kmp_info_t *this_thr;
  kmp_team_t *serial_team;

  KC_TRACE(10,
           ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));

  /* skip all this code for autopar serialized loops since it results in
     unacceptable overhead */
  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
    return;

  // Not autopar code
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  this_thr = __kmp_threads[global_tid];
  serial_team = this_thr->th.th_serial_team;

#if OMP_45_ENABLED
  kmp_task_team_t *task_team = this_thr->th.th_task_team;

  // we need to wait for the proxy tasks before finishing the thread
  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
    __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
#endif

  KMP_MB();
  KMP_DEBUG_ASSERT(serial_team);
  KMP_ASSERT(serial_team->t.t_serialized);
  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
  KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);

#if OMPT_SUPPORT
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != omp_state_overhead) {
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = NULL;
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
          __kmp_tid_from_gtid(global_tid));
    }

    // reset clear the task id only after unlinking the task
    ompt_data_t *parent_task_data;
    __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);

    if (ompt_enabled.ompt_callback_parallel_end) {
      ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
          &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
          ompt_invoker_program, OMPT_LOAD_RETURN_ADDRESS(global_tid));
    }
    __ompt_lw_taskteam_unlink(this_thr);
    this_thr->th.ompt_thread_info.state = omp_state_overhead;
  }
#endif

  /* If necessary, pop the internal control stack values and replace the team
   * values */
  top = serial_team->t.t_control_stack_top;
  if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
    copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
    serial_team->t.t_control_stack_top = top->next;
    __kmp_free(top);
  }

  // if( serial_team -> t.t_serialized > 1 )
  serial_team->t.t_level--;

  /* pop dispatch buffers stack */
  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
  {
    dispatch_private_info_t *disp_buffer =
        serial_team->t.t_dispatch->th_disp_buffer;
    serial_team->t.t_dispatch->th_disp_buffer =
        serial_team->t.t_dispatch->th_disp_buffer->next;
    __kmp_free(disp_buffer);
  }

  --serial_team->t.t_serialized;
  if (serial_team->t.t_serialized == 0) {

    /* return to the parallel section */

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
    if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
      __kmp_clear_x87_fpu_status_word();
      __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
      __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
    }
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

    this_thr->th.th_team = serial_team->t.t_parent;
    this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;

    /* restore values cached in the thread */
    this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /* JPH */
    this_thr->th.th_team_master =
        serial_team->t.t_parent->t.t_threads[0]; /* JPH */
    this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;

    /* TODO the below shouldn't need to be adjusted for serialized teams */
    this_thr->th.th_dispatch =
        &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];

    __kmp_pop_current_task_from_thread(this_thr);

    KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
    this_thr->th.th_current_task->td_flags.executing = 1;

    if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Copy the task team from the new child / old parent team to the thread.
      this_thr->th.th_task_team =
          this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
      KA_TRACE(20,
               ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
                "team %p\n",
                global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
    }
  } else {
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
                    "depth of serial team %p to %d\n",
                    global_tid, serial_team, serial_team->t.t_serialized));
    }
  }

  if (__kmp_env_consistency_check)
    __kmp_pop_parallel(global_tid, NULL);
#if OMPT_SUPPORT
  if (ompt_enabled.enabled)
    this_thr->th.ompt_thread_info.state =
        ((this_thr->th.th_team_serialized) ? omp_state_work_serial
                                           : omp_state_work_parallel);
#endif
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information.

Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
depending on the memory ordering convention obeyed by the compiler
even that may not be necessary).
*/
void __kmpc_flush(ident_t *loc) {
  KC_TRACE(10, ("__kmpc_flush: called\n"));

  /* need explicit __mf() here since use volatile instead in library */
  KMP_MB(); /* Flush all pending memory write invalidates. */

#if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
#if KMP_MIC
// fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
// We shouldn't need it, though, since the ABI rules require that
// * If the compiler generates NGO stores it also generates the fence
// * If users hand-code NGO stores they should insert the fence
// therefore no incomplete unordered stores should be visible.
#else
  // C74404
  // This is to address non-temporal store instructions (sfence needed).
  // The clflush instruction is addressed either (mfence needed).
  // Probably the non-temporal load movntdqa instruction should also be
  // addressed.
  // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
  if (!__kmp_cpuinfo.initialized) {
    __kmp_query_cpuid(&__kmp_cpuinfo);
  }
  if (!__kmp_cpuinfo.sse2) {
    // CPU cannot execute SSE2 instructions.
  } else {
#if KMP_COMPILER_ICC
    _mm_mfence();
#elif KMP_COMPILER_MSVC
    MemoryBarrier();
#else
    __sync_synchronize();
#endif // KMP_COMPILER_ICC
  }
#endif // KMP_MIC
#elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64)
// Nothing to see here move along
#elif KMP_ARCH_PPC64
// Nothing needed here (we have a real MB above).
#if KMP_OS_CNK
  // The flushing thread needs to yield here; this prevents a
  // busy-waiting thread from saturating the pipeline. flush is
  // often used in loops like this:
  //   while (!flag) {
  //     #pragma omp flush(flag)
  //   }
  // and adding the yield here is good for at least a 10x speedup
  // when running >2 threads per core (on the NAS LU benchmark).
  __kmp_yield(TRUE);
#endif
#else
#error Unknown or unsupported architecture
#endif

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_flush) {
    ompt_callbacks.ompt_callback(ompt_callback_flush)(
        __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
}

/* -------------------------------------------------------------------------- */
/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.

Execute a barrier.
*/
void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
  KMP_COUNT_BLOCK(OMP_BARRIER);
  KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
    }

    __kmp_check_barrier(global_tid, ct_barrier, loc);
  }

#if OMPT_SUPPORT
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame == NULL)
      ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
    OMPT_STORE_RETURN_ADDRESS(global_tid);
  }
#endif
  __kmp_threads[global_tid]->th.th_ident = loc;
  // TODO: explicit barrier_wait_id:
  //   this function is called when 'barrier' directive is present or
  //   implicit barrier at the end of a worksharing construct.
  // 1) better to add a per-thread barrier counter to a thread data structure
  // 2) set to 0 when a new team is created
  // 4) no sync is required

  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = NULL;
  }
#endif
}

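/* For reference, a sketch of how the two directives above are typically
   lowered; loc_* are the compiler-generated ident_t objects (hypothetical
   names here):

     // #pragma omp flush
     __kmpc_flush(&loc_flush);

     // #pragma omp barrier
     kmp_int32 gtid = __kmpc_global_thread_num(&loc_barrier);
     __kmpc_barrier(&loc_barrier, gtid);
*/
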
/* The BARRIER for a MASTER section is always explicit */
/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number.
@return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
*/
kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
  int status = 0;

  KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  if (KMP_MASTER_GTID(global_tid)) {
    KMP_COUNT_BLOCK(OMP_MASTER);
    KMP_PUSH_PARTITIONED_TIMER(OMP_master);
    status = 1;
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (status) {
    if (ompt_enabled.ompt_callback_master) {
      kmp_info_t *this_thr = __kmp_threads[global_tid];
      kmp_team_t *team = this_thr->th.th_team;

      int tid = __kmp_tid_from_gtid(global_tid);
      ompt_callbacks.ompt_callback(ompt_callback_master)(
          ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
          &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
          OMPT_GET_RETURN_ADDRESS(0));
    }
  }
#endif

  if (__kmp_env_consistency_check) {
#if KMP_USE_DYNAMIC_LOCK
    if (status)
      __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
    else
      __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
#else
    if (status)
      __kmp_push_sync(global_tid, ct_master, loc, NULL);
    else
      __kmp_check_sync(global_tid, ct_master, loc, NULL);
#endif
  }

  return status;
}

/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number.

Mark the end of a <tt>master</tt> region. This should only be called by the
thread that executes the <tt>master</tt> region.
*/
void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
  KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));

  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
  KMP_POP_PARTITIONED_TIMER();

#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_info_t *this_thr = __kmp_threads[global_tid];
  kmp_team_t *team = this_thr->th.th_team;
  if (ompt_enabled.ompt_callback_master) {
    int tid = __kmp_tid_from_gtid(global_tid);
    ompt_callbacks.ompt_callback(ompt_callback_master)(
        ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
        &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
        OMPT_GET_RETURN_ADDRESS(0));
  }
#endif

  if (__kmp_env_consistency_check) {
    if (global_tid < 0)
      KMP_WARNING(ThreadIdentInvalid);

    if (KMP_MASTER_GTID(global_tid))
      __kmp_pop_sync(global_tid, ct_master, loc);
  }
}

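/* For reference, the usual lowering of

     #pragma omp master
     { body(); }

   is a guarded call pair with no implied barrier (body() is hypothetical;
   gtid as returned by __kmpc_global_thread_num()):

     if (__kmpc_master(&loc, gtid)) {
       body();
       __kmpc_end_master(&loc, gtid);
     }
*/
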
/*!
@ingroup WORK_SHARING
@param loc source location information.
@param gtid global thread number.

Start execution of an <tt>ordered</tt> construct.
*/
void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
  int cid = 0;
  kmp_info_t *th;
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

#if USE_ITT_BUILD
  __kmp_itt_ordered_prep(gtid);
// TODO: ordered_wait_id
#endif /* USE_ITT_BUILD */

  th = __kmp_threads[gtid];

#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_team_t *team;
  ompt_wait_id_t lck;
  void *codeptr_ra;
  if (ompt_enabled.enabled) {
    OMPT_STORE_RETURN_ADDRESS(gtid);
    team = __kmp_team_from_gtid(gtid);
    lck = (ompt_wait_id_t)&team->t.t_ordered.dt.t_value;
    /* OMPT state update */
    th->th.ompt_thread_info.wait_id = lck;
    th->th.ompt_thread_info.state = omp_state_wait_ordered;

    /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
    if (ompt_enabled.ompt_callback_mutex_acquire) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
          ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin,
          (ompt_wait_id_t)lck, codeptr_ra);
    }
  }
#endif

  if (th->th.th_dispatch->th_deo_fcn != 0)
    (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
  else
    __kmp_parallel_deo(&gtid, &cid, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    /* OMPT state update */
    th->th.ompt_thread_info.state = omp_state_work_parallel;
    th->th.ompt_thread_info.wait_id = 0;

    /* OMPT event callback */
    if (ompt_enabled.ompt_callback_mutex_acquired) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
          ompt_mutex_ordered, (ompt_wait_id_t)lck, codeptr_ra);
    }
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_ordered_start(gtid);
#endif /* USE_ITT_BUILD */
}

/*!
@ingroup WORK_SHARING
@param loc source location information.
@param gtid global thread number.

End execution of an <tt>ordered</tt> construct.
*/
void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
  int cid = 0;
  kmp_info_t *th;

  KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));

#if USE_ITT_BUILD
  __kmp_itt_ordered_end(gtid);
// TODO: ordered_wait_id
#endif /* USE_ITT_BUILD */

  th = __kmp_threads[gtid];

  if (th->th.th_dispatch->th_dxo_fcn != 0)
    (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
  else
    __kmp_parallel_dxo(&gtid, &cid, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
  if (ompt_enabled.ompt_callback_mutex_released) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
        ompt_mutex_ordered,
        (ompt_wait_id_t)&__kmp_team_from_gtid(gtid)->t.t_ordered.dt.t_value,
        OMPT_LOAD_RETURN_ADDRESS(gtid));
  }
#endif
}

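/* For reference, a sketch of the lowering of an ordered region inside a
   worksharing loop; the loop bookkeeping (dispatch/static-init calls) is
   elided and body() is hypothetical:

     for (...) { // iterations handed out by the runtime
       // #pragma omp ordered
       __kmpc_ordered(&loc, gtid);
       body();
       __kmpc_end_ordered(&loc, gtid);
     }
*/
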
#if KMP_USE_DYNAMIC_LOCK

static __forceinline void
__kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
                          kmp_int32 gtid, kmp_indirect_locktag_t tag) {
  // Pointer to the allocated indirect lock is written to crit, while indexing
  // is ignored.
  void *idx;
  kmp_indirect_lock_t **lck;
  lck = (kmp_indirect_lock_t **)crit;
  kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
  KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
  KMP_SET_I_LOCK_LOCATION(ilk, loc);
  KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
  KA_TRACE(20,
           ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
#if USE_ITT_BUILD
  __kmp_itt_critical_creating(ilk->lock, loc);
#endif
  int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
  if (status == 0) {
#if USE_ITT_BUILD
    __kmp_itt_critical_destroyed(ilk->lock);
#endif
    // We don't really need to destroy the unclaimed lock here since it will be
    // cleaned up at program exit.
    // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
  }
  KMP_DEBUG_ASSERT(*lck != NULL);
}

// Fast-path acquire tas lock
#define KMP_ACQUIRE_TAS_LOCK(lock, gtid) \
  { \
    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \
    if (l->lk.poll != KMP_LOCK_FREE(tas) || \
        !KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas), \
                                     KMP_LOCK_BUSY(gtid + 1, tas))) { \
      kmp_uint32 spins; \
      KMP_FSYNC_PREPARE(l); \
      KMP_INIT_YIELD(spins); \
      if (TCR_4(__kmp_nth) > \
          (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \
        KMP_YIELD(TRUE); \
      } else { \
        KMP_YIELD_SPIN(spins); \
      } \
      kmp_backoff_t backoff = __kmp_spin_backoff_params; \
      while (l->lk.poll != KMP_LOCK_FREE(tas) || \
             !KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas), \
                                          KMP_LOCK_BUSY(gtid + 1, tas))) { \
        __kmp_spin_backoff(&backoff); \
        if (TCR_4(__kmp_nth) > \
            (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \
          KMP_YIELD(TRUE); \
        } else { \
          KMP_YIELD_SPIN(spins); \
        } \
      } \
    } \
    KMP_FSYNC_ACQUIRED(l); \
  }

// Fast-path test tas lock
#define KMP_TEST_TAS_LOCK(lock, gtid, rc) \
  { \
    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \
    rc = l->lk.poll == KMP_LOCK_FREE(tas) && \
         KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas), \
                                     KMP_LOCK_BUSY(gtid + 1, tas)); \
  }

// Fast-path release tas lock
#define KMP_RELEASE_TAS_LOCK(lock, gtid) \
  { \
    TCW_4(((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); \
    KMP_MB(); \
  }

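/* A standalone sketch of the same test-and-set protocol in portable C++
   (illustrative only; the runtime uses the KMP_* atomics and the yield /
   backoff heuristics shown above rather than std::atomic):

   #include <atomic>

   struct tas_lock_sketch {
     std::atomic<int> poll{0}; // 0 == free, otherwise owner gtid + 1
   };

   inline void tas_acquire(tas_lock_sketch &l, int gtid) {
     int expected = 0;
     while (!l.poll.compare_exchange_weak(expected, gtid + 1,
                                          std::memory_order_acquire,
                                          std::memory_order_relaxed)) {
       expected = 0; // compare_exchange stored the observed value here
       // a real implementation spins, backs off and yields here
     }
   }

   inline void tas_release(tas_lock_sketch &l) {
     l.poll.store(0, std::memory_order_release);
   }
*/
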
#if KMP_USE_FUTEX

#include <sys/syscall.h>
#include <unistd.h>
#ifndef FUTEX_WAIT
#define FUTEX_WAIT 0
#endif
#ifndef FUTEX_WAKE
#define FUTEX_WAKE 1
#endif

// Fast-path acquire futex lock
#define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid) \
  { \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
    kmp_int32 gtid_code = (gtid + 1) << 1; \
    KMP_MB(); \
    KMP_FSYNC_PREPARE(ftx); \
    kmp_int32 poll_val; \
    while ((poll_val = KMP_COMPARE_AND_STORE_RET32( \
                &(ftx->lk.poll), KMP_LOCK_FREE(futex), \
                KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) { \
      kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1; \
      if (!cond) { \
        if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val, \
                                         poll_val | \
                                             KMP_LOCK_BUSY(1, futex))) { \
          continue; \
        } \
        poll_val |= KMP_LOCK_BUSY(1, futex); \
      } \
      kmp_int32 rc; \
      if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val, \
                        NULL, NULL, 0)) != 0) { \
        continue; \
      } \
      gtid_code |= 1; \
    } \
    KMP_FSYNC_ACQUIRED(ftx); \
  }

// Fast-path test futex lock
#define KMP_TEST_FUTEX_LOCK(lock, gtid, rc) \
  { \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
    if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex), \
                                    KMP_LOCK_BUSY(gtid + 1 << 1, futex))) { \
      KMP_FSYNC_ACQUIRED(ftx); \
      rc = TRUE; \
    } else { \
      rc = FALSE; \
    } \
  }

// Fast-path release futex lock
#define KMP_RELEASE_FUTEX_LOCK(lock, gtid) \
  { \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
    KMP_MB(); \
    KMP_FSYNC_RELEASING(ftx); \
    kmp_int32 poll_val = \
        KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex)); \
    if (KMP_LOCK_STRIP(poll_val) & 1) { \
      syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE, \
              KMP_LOCK_BUSY(1, futex), NULL, NULL, 0); \
    } \
    KMP_MB(); \
    KMP_YIELD(TCR_4(__kmp_nth) > \
              (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)); \
  }

#endif // KMP_USE_FUTEX

#else // KMP_USE_DYNAMIC_LOCK

static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
                                                      ident_t const *loc,
                                                      kmp_int32 gtid) {
  kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;

  // Because of the double-check, the following load doesn't need to be volatile
  kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);

  if (lck == NULL) {
    void *idx;

    // Allocate & initialize the lock.
    // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
    lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
    __kmp_init_user_lock_with_checks(lck);
    __kmp_set_user_lock_location(lck, loc);
#if USE_ITT_BUILD
    __kmp_itt_critical_creating(lck);
// __kmp_itt_critical_creating() should be called *before* the first usage
// of underlying lock. It is the only place where we can guarantee it. There
// are chances the lock will be destroyed with no usage, but it is not a
// problem, because this is not real event seen by user but rather setting
// name for object (lock). See more details in kmp_itt.h.
#endif /* USE_ITT_BUILD */

    // Use a cmpxchg instruction to slam the start of the critical section with
    // the lock pointer. If another thread beat us to it, deallocate the lock,
    // and use the lock that the other thread allocated.
    int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);

    if (status == 0) {
// Deallocate the lock and reload the value.
#if USE_ITT_BUILD
      __kmp_itt_critical_destroyed(lck);
// Let ITT know the lock is destroyed and the same memory location may be
// reused for another purpose.
#endif /* USE_ITT_BUILD */
      __kmp_destroy_user_lock_with_checks(lck);
      __kmp_user_lock_free(&idx, gtid, lck);
      lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
      KMP_DEBUG_ASSERT(lck != NULL);
    }
  }
  return lck;
}

#endif // KMP_USE_DYNAMIC_LOCK

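/* The "first thread to install its pointer wins" idiom used above, written
   as a standalone C++ sketch (illustrative only; the runtime uses
   KMP_COMPARE_AND_STORE_PTR and its own lock allocator):

   #include <atomic>

   struct lock_sketch { int dummy; };

   static lock_sketch *get_or_install(std::atomic<lock_sketch *> &slot) {
     lock_sketch *lck = slot.load(std::memory_order_acquire);
     if (lck == nullptr) {
       lock_sketch *mine = new lock_sketch();
       lock_sketch *expected = nullptr;
       if (!slot.compare_exchange_strong(expected, mine,
                                         std::memory_order_acq_rel)) {
         delete mine;    // another thread won the race
         lck = expected; // use the winner's lock
       } else {
         lck = mine;
       }
     }
     return lck;
   }
*/
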
/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number.
@param crit identity of the critical section. This could be a pointer to a lock
associated with the critical section, or some other suitably unique value.

Enter code protected by a `critical` construct.
This function blocks until the executing thread can enter the critical section.
*/
void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
                     kmp_critical_name *crit) {
#if KMP_USE_DYNAMIC_LOCK
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif // OMPT_SUPPORT
  __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
#else
  KMP_COUNT_BLOCK(OMP_CRITICAL);
  KMP_TIME_PARTITIONED_BLOCK(
      OMP_critical_wait); /* Time spent waiting to enter the critical section */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  omp_state_t prev_state = omp_state_undefined;
  ompt_thread_info_t ti;
#endif
  kmp_user_lock_p lck;

  KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));

  // TODO: add THR_OVHD_STATE

  KMP_CHECK_USER_LOCK_INIT();

  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#endif
  else { // ticket, queuing or drdpa
    lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
  }

  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_critical, loc, lck);

// since the critical directive binds to all threads, not just the current
// team we have to check this even if we are in a serialized team.
// also, even if we are the uber thread, we still have to conduct the lock,
// as we have to contend with sibling threads.

#if USE_ITT_BUILD
  __kmp_itt_critical_acquiring(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(global_tid);
  void *codeptr_ra = NULL;
  if (ompt_enabled.enabled) {
    ti = __kmp_threads[global_tid]->th.ompt_thread_info;
    /* OMPT state update */
    prev_state = ti.state;
    ti.wait_id = (ompt_wait_id_t)lck;
    ti.state = omp_state_wait_critical;

    /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
    if (ompt_enabled.ompt_callback_mutex_acquire) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
          ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
          (ompt_wait_id_t)crit, codeptr_ra);
    }
  }
#endif
  // Value of 'crit' should be good for using as a critical_id of the critical
  // section directive.
  __kmp_acquire_user_lock_with_checks(lck, global_tid);

#if USE_ITT_BUILD
  __kmp_itt_critical_acquired(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    /* OMPT state update */
    ti.state = prev_state;
    ti.wait_id = 0;

    /* OMPT event callback */
    if (ompt_enabled.ompt_callback_mutex_acquired) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
          ompt_mutex_critical, (ompt_wait_id_t)crit, codeptr_ra);
    }
  }
#endif

  KMP_START_EXPLICIT_TIMER(OMP_critical);
  KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
#endif // KMP_USE_DYNAMIC_LOCK
}

#if KMP_USE_DYNAMIC_LOCK

// Converts the given hint to an internal lock implementation
static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
#if KMP_USE_TSX
#define KMP_TSX_LOCK(seq) lockseq_##seq
#else
#define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
#endif

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
#define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
#else
#define KMP_CPUINFO_RTM 0
#endif

  // Hints that do not require further logic
  if (hint & kmp_lock_hint_hle)
    return KMP_TSX_LOCK(hle);
  if (hint & kmp_lock_hint_rtm)
    return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
  if (hint & kmp_lock_hint_adaptive)
    return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;

  // Rule out conflicting hints first by returning the default lock
  if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
    return __kmp_user_lock_seq;
  if ((hint & omp_lock_hint_speculative) &&
      (hint & omp_lock_hint_nonspeculative))
    return __kmp_user_lock_seq;

  // Do not even consider speculation when it appears to be contended
  if (hint & omp_lock_hint_contended)
    return lockseq_queuing;

  // Uncontended lock without speculation
  if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
    return lockseq_tas;

  // HLE lock for speculation
  if (hint & omp_lock_hint_speculative)
    return KMP_TSX_LOCK(hle);

  return __kmp_user_lock_seq;
}

#if OMPT_SUPPORT && OMPT_OPTIONAL
static kmp_mutex_impl_t
__ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
  if (user_lock) {
    switch (KMP_EXTRACT_D_TAG(user_lock)) {
    case 0:
      break;
#if KMP_USE_FUTEX
    case locktag_futex:
      return kmp_mutex_impl_queuing;
#endif
    case locktag_tas:
      return kmp_mutex_impl_spin;
#if KMP_USE_TSX
    case locktag_hle:
      return kmp_mutex_impl_speculative;
#endif
    default:
      return ompt_mutex_impl_unknown;
    }
    ilock = KMP_LOOKUP_I_LOCK(user_lock);
  }
  KMP_ASSERT(ilock);
  switch (ilock->type) {
#if KMP_USE_TSX
  case locktag_adaptive:
  case locktag_rtm:
    return kmp_mutex_impl_speculative;
#endif
  case locktag_nested_tas:
    return kmp_mutex_impl_spin;
#if KMP_USE_FUTEX
  case locktag_nested_futex:
#endif
  case locktag_ticket:
  case locktag_queuing:
  case locktag_drdpa:
  case locktag_nested_ticket:
  case locktag_nested_queuing:
  case locktag_nested_drdpa:
    return kmp_mutex_impl_queuing;
  default:
    return ompt_mutex_impl_unknown;
  }
}

// For locks without dynamic binding
static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
  switch (__kmp_user_lock_kind) {
  case lk_tas:
    return kmp_mutex_impl_spin;
#if KMP_USE_FUTEX
  case lk_futex:
#endif
  case lk_ticket:
  case lk_queuing:
  case lk_drdpa:
    return kmp_mutex_impl_queuing;
#if KMP_USE_TSX
  case lk_hle:
  case lk_rtm:
  case lk_adaptive:
    return kmp_mutex_impl_speculative;
#endif
  default:
    return ompt_mutex_impl_unknown;
  }
}
#endif

/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number.
@param crit identity of the critical section. This could be a pointer to a lock
associated with the critical section, or some other suitably unique value.
@param hint the lock hint.

Enter code protected by a `critical` construct with a hint. The hint value is
used to suggest a lock implementation. This function blocks until the executing
thread can enter the critical section unless the hint suggests use of
speculative execution and the hardware supports it.
*/
void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
                               kmp_critical_name *crit, uintptr_t hint) {
  KMP_COUNT_BLOCK(OMP_CRITICAL);
  kmp_user_lock_p lck;
#if OMPT_SUPPORT && OMPT_OPTIONAL
  omp_state_t prev_state = omp_state_undefined;
  ompt_thread_info_t ti;
  // This is the case, if called from __kmpc_critical:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
#endif

  KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));

  kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
  // Check if it is initialized.
  if (*lk == 0) {
    kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
    if (KMP_IS_D_LOCK(lckseq)) {
      KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
                                  KMP_GET_D_TAG(lckseq));
    } else {
      __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
    }
  }
  // Branch for accessing the actual lock object and set operation. This
  // branching is inevitable since this lock initialization does not follow the
  // normal dispatch path (lock table is not used).
  if (KMP_EXTRACT_D_TAG(lk) != 0) {
    lck = (kmp_user_lock_p)lk;
    if (__kmp_env_consistency_check) {
      __kmp_push_sync(global_tid, ct_critical, loc, lck,
                      __kmp_map_hint_to_lock(hint));
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_acquiring(lck);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ti = __kmp_threads[global_tid]->th.ompt_thread_info;
      /* OMPT state update */
      prev_state = ti.state;
      ti.wait_id = (ompt_wait_id_t)lck;
      ti.state = omp_state_wait_critical;

      /* OMPT event callback */
      if (ompt_enabled.ompt_callback_mutex_acquire) {
        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
            ompt_mutex_critical, (unsigned int)hint,
            __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)crit, codeptr);
      }
    }
#endif
#if KMP_USE_INLINED_TAS
    if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
      KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
    } else
#elif KMP_USE_INLINED_FUTEX
    if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
      KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
    } else
#endif
    {
      KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
    }
  } else {
    kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
    lck = ilk->lock;
    if (__kmp_env_consistency_check) {
      __kmp_push_sync(global_tid, ct_critical, loc, lck,
                      __kmp_map_hint_to_lock(hint));
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_acquiring(lck);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ti = __kmp_threads[global_tid]->th.ompt_thread_info;
      /* OMPT state update */
      prev_state = ti.state;
      ti.wait_id = (ompt_wait_id_t)lck;
      ti.state = omp_state_wait_critical;

      /* OMPT event callback */
      if (ompt_enabled.ompt_callback_mutex_acquire) {
        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
            ompt_mutex_critical, (unsigned int)hint,
            __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)crit, codeptr);
      }
    }
#endif
    KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
  }

#if USE_ITT_BUILD
  __kmp_itt_critical_acquired(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    /* OMPT state update */
    ti.state = prev_state;
    ti.wait_id = 0;

    /* OMPT event callback */
    if (ompt_enabled.ompt_callback_mutex_acquired) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
          ompt_mutex_critical, (ompt_wait_id_t)crit, codeptr);
    }
  }
#endif

  KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
  KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
} // __kmpc_critical_with_hint

#endif // KMP_USE_DYNAMIC_LOCK

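/* For reference, OpenMP 4.5 allows a hint clause on the critical construct;
   a sketch of the corresponding lowering (names and loc are hypothetical):

     static kmp_critical_name lock_for_name = {0};

     // #pragma omp critical (name) hint(omp_lock_hint_speculative)
     __kmpc_critical_with_hint(&loc, gtid, &lock_for_name,
                               omp_lock_hint_speculative);
     // ... protected code ...
     __kmpc_end_critical(&loc, gtid, &lock_for_name);
*/
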
/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number.
@param crit identity of the critical section. This could be a pointer to a lock
associated with the critical section, or some other suitably unique value.

Leave a critical section, releasing any lock that was held during its execution.
*/
void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
                         kmp_critical_name *crit) {
  kmp_user_lock_p lck;

  KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));

#if KMP_USE_DYNAMIC_LOCK
  if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
    lck = (kmp_user_lock_p)crit;
    KMP_ASSERT(lck != NULL);
    if (__kmp_env_consistency_check) {
      __kmp_pop_sync(global_tid, ct_critical, loc);
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_releasing(lck);
#endif
#if KMP_USE_INLINED_TAS
    if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
      KMP_RELEASE_TAS_LOCK(lck, global_tid);
    } else
#elif KMP_USE_INLINED_FUTEX
    if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
      KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
    } else
#endif
    {
      KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
    }
  } else {
    kmp_indirect_lock_t *ilk =
        (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
    KMP_ASSERT(ilk != NULL);
    lck = ilk->lock;
    if (__kmp_env_consistency_check) {
      __kmp_pop_sync(global_tid, ct_critical, loc);
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_releasing(lck);
#endif
    KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
  }

#else // KMP_USE_DYNAMIC_LOCK

  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#endif
  else { // ticket, queuing or drdpa
    lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
  }

  KMP_ASSERT(lck != NULL);

  if (__kmp_env_consistency_check)
    __kmp_pop_sync(global_tid, ct_critical, loc);

#if USE_ITT_BUILD
  __kmp_itt_critical_releasing(lck);
#endif /* USE_ITT_BUILD */
  // Value of 'crit' should be good for using as a critical_id of the critical
  // section directive.
  __kmp_release_user_lock_with_checks(lck, global_tid);

#endif // KMP_USE_DYNAMIC_LOCK

#if OMPT_SUPPORT && OMPT_OPTIONAL
  /* OMPT release event triggers after lock is released; place here to trigger
   * for all #if branches */
  OMPT_STORE_RETURN_ADDRESS(global_tid);
  if (ompt_enabled.ompt_callback_mutex_released) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
        ompt_mutex_critical, (ompt_wait_id_t)crit, OMPT_LOAD_RETURN_ADDRESS(0));
  }
#endif

  KMP_POP_PARTITIONED_TIMER();
  KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
}

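/* For reference, a sketch of the usual lowering of a named critical section;
   the kmp_critical_name object is a compiler-emitted, zero-initialized
   static, and the names here are hypothetical:

     static kmp_critical_name lock_for_name = {0};

     // #pragma omp critical (name)
     __kmpc_critical(&loc, gtid, &lock_for_name);
     // ... protected code ...
     __kmpc_end_critical(&loc, gtid, &lock_for_name);
*/
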
/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.
@return one if the thread should execute the master block, zero otherwise

Start execution of a combined barrier and master. The barrier is executed inside
this function.
*/
kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
  int status;

  KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  if (__kmp_env_consistency_check)
    __kmp_check_barrier(global_tid, ct_barrier, loc);

#if OMPT_SUPPORT
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame == NULL)
      ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
    OMPT_STORE_RETURN_ADDRESS(global_tid);
  }
#endif
#if USE_ITT_NOTIFY
  __kmp_threads[global_tid]->th.th_ident = loc;
#endif
  status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = NULL;
  }
#endif

  return (status != 0) ? 0 : 1;
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.

Complete the execution of a combined barrier and master. This function should
only be called at the completion of the <tt>master</tt> code. Other threads will
still be waiting at the barrier and this call releases them.
*/
void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
  KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));

  __kmp_end_split_barrier(bs_plain_barrier, global_tid);
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.
@return one if the thread should execute the master block, zero otherwise

Start execution of a combined barrier and master(nowait) construct.
The barrier is executed inside this function.
There is no equivalent "end" function, since the
*/
kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
  kmp_int32 ret;

  KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
    }
    __kmp_check_barrier(global_tid, ct_barrier, loc);
  }

#if OMPT_SUPPORT
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame == NULL)
      ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
    OMPT_STORE_RETURN_ADDRESS(global_tid);
  }
#endif
#if USE_ITT_NOTIFY
  __kmp_threads[global_tid]->th.th_ident = loc;
#endif
  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = NULL;
  }
#endif

  ret = __kmpc_master(loc, global_tid);

  if (__kmp_env_consistency_check) {
    /* there's no __kmpc_end_master called; so the (stats) */
    /* actions of __kmpc_end_master are done here */

    if (global_tid < 0) {
      KMP_WARNING(ThreadIdentInvalid);
    }
    if (ret) {
      /* only one thread should do the pop since only */
      /* one did the push (see __kmpc_master()) */

      __kmp_pop_sync(global_tid, ct_master, loc);
    }
  }

  return (ret);
}

/* The BARRIER for a SINGLE process section is always explicit */
/*!
@ingroup WORK_SHARING
@param loc source location information
@param global_tid global thread number
@return One if this thread should execute the single construct, zero otherwise.

Test whether to execute a <tt>single</tt> construct.
There are no implicit barriers in the two "single" calls, rather the compiler
should introduce an explicit barrier if it is required.
*/

kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
  kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);

  if (rc) {
    // We are going to execute the single statement, so we should count it.
    KMP_COUNT_BLOCK(OMP_SINGLE);
    KMP_PUSH_PARTITIONED_TIMER(OMP_single);
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_info_t *this_thr = __kmp_threads[global_tid];
  kmp_team_t *team = this_thr->th.th_team;
  int tid = __kmp_tid_from_gtid(global_tid);

  if (ompt_enabled.enabled) {
    if (rc) {
      if (ompt_enabled.ompt_callback_work) {
        ompt_callbacks.ompt_callback(ompt_callback_work)(
            ompt_work_single_executor, ompt_scope_begin,
            &(team->t.ompt_team_info.parallel_data),
            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
            1, OMPT_GET_RETURN_ADDRESS(0));
      }
    } else {
      if (ompt_enabled.ompt_callback_work) {
        ompt_callbacks.ompt_callback(ompt_callback_work)(
            ompt_work_single_other, ompt_scope_begin,
            &(team->t.ompt_team_info.parallel_data),
            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
            1, OMPT_GET_RETURN_ADDRESS(0));
        ompt_callbacks.ompt_callback(ompt_callback_work)(
            ompt_work_single_other, ompt_scope_end,
            &(team->t.ompt_team_info.parallel_data),
            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
            1, OMPT_GET_RETURN_ADDRESS(0));
      }
    }
  }
#endif

  return rc;
}

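/* For reference, the usual lowering of

     #pragma omp single
     { body(); }

   is a guarded call pair plus an explicit barrier, since (as noted above) the
   two "single" calls themselves imply no barrier (body() is hypothetical):

     if (__kmpc_single(&loc, gtid)) {
       body();
       __kmpc_end_single(&loc, gtid);
     }
     __kmpc_barrier(&loc, gtid); // omitted when nowait is present
*/
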
1729 */ 1730 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) { 1731 __kmp_exit_single(global_tid); 1732 KMP_POP_PARTITIONED_TIMER(); 1733 1734 #if OMPT_SUPPORT && OMPT_OPTIONAL 1735 kmp_info_t *this_thr = __kmp_threads[global_tid]; 1736 kmp_team_t *team = this_thr->th.th_team; 1737 int tid = __kmp_tid_from_gtid(global_tid); 1738 1739 if (ompt_enabled.ompt_callback_work) { 1740 ompt_callbacks.ompt_callback(ompt_callback_work)( 1741 ompt_work_single_executor, ompt_scope_end, 1742 &(team->t.ompt_team_info.parallel_data), 1743 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1, 1744 OMPT_GET_RETURN_ADDRESS(0)); 1745 } 1746 #endif 1747 } 1748 1749 /*! 1750 @ingroup WORK_SHARING 1751 @param loc Source location 1752 @param global_tid Global thread id 1753 1754 Mark the end of a statically scheduled loop. 1755 */ 1756 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) { 1757 KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid)); 1758 1759 #if OMPT_SUPPORT && OMPT_OPTIONAL 1760 if (ompt_enabled.ompt_callback_work) { 1761 ompt_work_type_t ompt_work_type = ompt_work_loop; 1762 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 1763 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 1764 // Determine workshare type 1765 if (loc != NULL) { 1766 if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) { 1767 ompt_work_type = ompt_work_loop; 1768 } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) { 1769 ompt_work_type = ompt_work_sections; 1770 } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) { 1771 ompt_work_type = ompt_work_distribute; 1772 } else { 1773 // use default set above. 1774 // a warning about this case is provided in __kmpc_for_static_init 1775 } 1776 KMP_DEBUG_ASSERT(ompt_work_type); 1777 } 1778 ompt_callbacks.ompt_callback(ompt_callback_work)( 1779 ompt_work_type, ompt_scope_end, &(team_info->parallel_data), 1780 &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0)); 1781 } 1782 #endif 1783 1784 if (__kmp_env_consistency_check) 1785 __kmp_pop_workshare(global_tid, ct_pdo, loc); 1786 } 1787 1788 // User routines which take C-style arguments (call by value) 1789 // different from the Fortran equivalent routines 1790 1791 void ompc_set_num_threads(int arg) { 1792 // !!!!! TODO: check the per-task binding 1793 __kmp_set_num_threads(arg, __kmp_entry_gtid()); 1794 } 1795 1796 void ompc_set_dynamic(int flag) { 1797 kmp_info_t *thread; 1798 1799 /* For the thread-private implementation of the internal controls */ 1800 thread = __kmp_entry_thread(); 1801 1802 __kmp_save_internal_controls(thread); 1803 1804 set__dynamic(thread, flag ? TRUE : FALSE); 1805 } 1806 1807 void ompc_set_nested(int flag) { 1808 kmp_info_t *thread; 1809 1810 /* For the thread-private internal controls implementation */ 1811 thread = __kmp_entry_thread(); 1812 1813 __kmp_save_internal_controls(thread); 1814 1815 set__nested(thread, flag ? TRUE : FALSE); 1816 } 1817 1818 void ompc_set_max_active_levels(int max_active_levels) { 1819 /* TO DO */ 1820 /* we want per-task implementation of this internal control */ 1821 1822 /* For the per-thread internal controls implementation */ 1823 __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels); 1824 } 1825 1826 void ompc_set_schedule(omp_sched_t kind, int modifier) { 1827 // !!!!! 
TODO: check the per-task binding 1828 __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier); 1829 } 1830 1831 int ompc_get_ancestor_thread_num(int level) { 1832 return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level); 1833 } 1834 1835 int ompc_get_team_size(int level) { 1836 return __kmp_get_team_size(__kmp_entry_gtid(), level); 1837 } 1838 1839 void kmpc_set_stacksize(int arg) { 1840 // __kmp_aux_set_stacksize initializes the library if needed 1841 __kmp_aux_set_stacksize(arg); 1842 } 1843 1844 void kmpc_set_stacksize_s(size_t arg) { 1845 // __kmp_aux_set_stacksize initializes the library if needed 1846 __kmp_aux_set_stacksize(arg); 1847 } 1848 1849 void kmpc_set_blocktime(int arg) { 1850 int gtid, tid; 1851 kmp_info_t *thread; 1852 1853 gtid = __kmp_entry_gtid(); 1854 tid = __kmp_tid_from_gtid(gtid); 1855 thread = __kmp_thread_from_gtid(gtid); 1856 1857 __kmp_aux_set_blocktime(arg, thread, tid); 1858 } 1859 1860 void kmpc_set_library(int arg) { 1861 // __kmp_user_set_library initializes the library if needed 1862 __kmp_user_set_library((enum library_type)arg); 1863 } 1864 1865 void kmpc_set_defaults(char const *str) { 1866 // __kmp_aux_set_defaults initializes the library if needed 1867 __kmp_aux_set_defaults(str, KMP_STRLEN(str)); 1868 } 1869 1870 void kmpc_set_disp_num_buffers(int arg) { 1871 // ignore after initialization because some teams have already 1872 // allocated dispatch buffers 1873 if (__kmp_init_serial == 0 && arg > 0) 1874 __kmp_dispatch_num_buffers = arg; 1875 } 1876 1877 int kmpc_set_affinity_mask_proc(int proc, void **mask) { 1878 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED 1879 return -1; 1880 #else 1881 if (!TCR_4(__kmp_init_middle)) { 1882 __kmp_middle_initialize(); 1883 } 1884 return __kmp_aux_set_affinity_mask_proc(proc, mask); 1885 #endif 1886 } 1887 1888 int kmpc_unset_affinity_mask_proc(int proc, void **mask) { 1889 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED 1890 return -1; 1891 #else 1892 if (!TCR_4(__kmp_init_middle)) { 1893 __kmp_middle_initialize(); 1894 } 1895 return __kmp_aux_unset_affinity_mask_proc(proc, mask); 1896 #endif 1897 } 1898 1899 int kmpc_get_affinity_mask_proc(int proc, void **mask) { 1900 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED 1901 return -1; 1902 #else 1903 if (!TCR_4(__kmp_init_middle)) { 1904 __kmp_middle_initialize(); 1905 } 1906 return __kmp_aux_get_affinity_mask_proc(proc, mask); 1907 #endif 1908 } 1909 1910 /* -------------------------------------------------------------------------- */ 1911 /*! 1912 @ingroup THREADPRIVATE 1913 @param loc source location information 1914 @param gtid global thread number 1915 @param cpy_size size of the cpy_data buffer 1916 @param cpy_data pointer to data to be copied 1917 @param cpy_func helper function to call for copying data 1918 @param didit flag variable: 1=single thread; 0=not single thread 1919 1920 __kmpc_copyprivate implements the interface for the private data broadcast 1921 needed for the copyprivate clause associated with a single region in an 1922 OpenMP<sup>*</sup> program (both C and Fortran). 1923 All threads participating in the parallel region call this routine. 1924 One of the threads (called the single thread) should have the <tt>didit</tt> 1925 variable set to 1 and all other threads should have that variable set to 0. 1926 All threads pass a pointer to a data buffer (cpy_data) that they have built. 1927 1928 The OpenMP specification forbids the use of nowait on the single region when a 1929 copyprivate clause is present. 
However, @ref __kmpc_copyprivate implements a 1930 barrier internally to avoid race conditions, so the code generation for the 1931 single region should avoid generating a barrier after the call to @ref 1932 __kmpc_copyprivate. 1933 1934 The <tt>gtid</tt> parameter is the global thread id for the current thread. 1935 The <tt>loc</tt> parameter is a pointer to source location information. 1936 1937 Internal implementation: The single thread will first copy its descriptor 1938 address (cpy_data) to a team-private location, then the other threads will each 1939 call the function pointed to by the parameter cpy_func, which carries out the 1940 copy by copying the data using the cpy_data buffer. 1941 1942 The cpy_func routine used for the copy and the contents of the data area defined 1943 by cpy_data and cpy_size may be built in any fashion that will allow the copy 1944 to be done. For instance, the cpy_data buffer can hold the actual data to be 1945 copied or it may hold a list of pointers to the data. The cpy_func routine must 1946 interpret the cpy_data buffer appropriately. 1947 1948 The interface to cpy_func is as follows: 1949 @code 1950 void cpy_func( void *destination, void *source ) 1951 @endcode 1952 where void *destination is the cpy_data pointer for the thread being copied to 1953 and void *source is the cpy_data pointer for the thread being copied from. 1954 */ 1955 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size, 1956 void *cpy_data, void (*cpy_func)(void *, void *), 1957 kmp_int32 didit) { 1958 void **data_ptr; 1959 1960 KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid)); 1961 1962 KMP_MB(); 1963 1964 data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data; 1965 1966 if (__kmp_env_consistency_check) { 1967 if (loc == 0) { 1968 KMP_WARNING(ConstructIdentInvalid); 1969 } 1970 } 1971 1972 // ToDo: Optimize the following two barriers into some kind of split barrier 1973 1974 if (didit) 1975 *data_ptr = cpy_data; 1976 1977 #if OMPT_SUPPORT 1978 ompt_frame_t *ompt_frame; 1979 if (ompt_enabled.enabled) { 1980 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); 1981 if (ompt_frame->enter_frame == NULL) 1982 ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1); 1983 OMPT_STORE_RETURN_ADDRESS(gtid); 1984 } 1985 #endif 1986 /* This barrier is not a barrier region boundary */ 1987 #if USE_ITT_NOTIFY 1988 __kmp_threads[gtid]->th.th_ident = loc; 1989 #endif 1990 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); 1991 1992 if (!didit) 1993 (*cpy_func)(cpy_data, *data_ptr); 1994 1995 // Consider next barrier a user-visible barrier for barrier region boundaries 1996 // Nesting checks are already handled by the single construct checks 1997 1998 #if OMPT_SUPPORT 1999 if (ompt_enabled.enabled) { 2000 OMPT_STORE_RETURN_ADDRESS(gtid); 2001 } 2002 #endif 2003 #if USE_ITT_NOTIFY 2004 __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g. 
2005 // tasks can overwrite the location) 2006 #endif 2007 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); 2008 #if OMPT_SUPPORT && OMPT_OPTIONAL 2009 if (ompt_enabled.enabled) { 2010 ompt_frame->enter_frame = NULL; 2011 } 2012 #endif 2013 } 2014 2015 /* -------------------------------------------------------------------------- */ 2016 2017 #define INIT_LOCK __kmp_init_user_lock_with_checks 2018 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks 2019 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks 2020 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed 2021 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks 2022 #define ACQUIRE_NESTED_LOCK_TIMED \ 2023 __kmp_acquire_nested_user_lock_with_checks_timed 2024 #define RELEASE_LOCK __kmp_release_user_lock_with_checks 2025 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks 2026 #define TEST_LOCK __kmp_test_user_lock_with_checks 2027 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks 2028 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks 2029 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks 2030 2031 // TODO: Make check abort messages use location info & pass it into 2032 // with_checks routines 2033 2034 #if KMP_USE_DYNAMIC_LOCK 2035 2036 // internal lock initializer 2037 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock, 2038 kmp_dyna_lockseq_t seq) { 2039 if (KMP_IS_D_LOCK(seq)) { 2040 KMP_INIT_D_LOCK(lock, seq); 2041 #if USE_ITT_BUILD 2042 __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL); 2043 #endif 2044 } else { 2045 KMP_INIT_I_LOCK(lock, seq); 2046 #if USE_ITT_BUILD 2047 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock); 2048 __kmp_itt_lock_creating(ilk->lock, loc); 2049 #endif 2050 } 2051 } 2052 2053 // internal nest lock initializer 2054 static __forceinline void 2055 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock, 2056 kmp_dyna_lockseq_t seq) { 2057 #if KMP_USE_TSX 2058 // Don't have nested lock implementation for speculative locks 2059 if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive) 2060 seq = __kmp_user_lock_seq; 2061 #endif 2062 switch (seq) { 2063 case lockseq_tas: 2064 seq = lockseq_nested_tas; 2065 break; 2066 #if KMP_USE_FUTEX 2067 case lockseq_futex: 2068 seq = lockseq_nested_futex; 2069 break; 2070 #endif 2071 case lockseq_ticket: 2072 seq = lockseq_nested_ticket; 2073 break; 2074 case lockseq_queuing: 2075 seq = lockseq_nested_queuing; 2076 break; 2077 case lockseq_drdpa: 2078 seq = lockseq_nested_drdpa; 2079 break; 2080 default: 2081 seq = lockseq_nested_queuing; 2082 } 2083 KMP_INIT_I_LOCK(lock, seq); 2084 #if USE_ITT_BUILD 2085 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock); 2086 __kmp_itt_lock_creating(ilk->lock, loc); 2087 #endif 2088 } 2089 2090 /* initialize the lock with a hint */ 2091 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock, 2092 uintptr_t hint) { 2093 KMP_DEBUG_ASSERT(__kmp_init_serial); 2094 if (__kmp_env_consistency_check && user_lock == NULL) { 2095 KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint"); 2096 } 2097 2098 __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint)); 2099 2100 #if OMPT_SUPPORT && OMPT_OPTIONAL 2101 // This is the case, if called from omp_init_lock_with_hint: 2102 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2103 if (!codeptr) 2104 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2105 if (ompt_enabled.ompt_callback_lock_init) { 2106 
ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2107 ompt_mutex_lock, (omp_lock_hint_t)hint, 2108 __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock, 2109 codeptr); 2110 } 2111 #endif 2112 } 2113 2114 /* initialize the lock with a hint */ 2115 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid, 2116 void **user_lock, uintptr_t hint) { 2117 KMP_DEBUG_ASSERT(__kmp_init_serial); 2118 if (__kmp_env_consistency_check && user_lock == NULL) { 2119 KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint"); 2120 } 2121 2122 __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint)); 2123 2124 #if OMPT_SUPPORT && OMPT_OPTIONAL 2125 // This is the case, if called from omp_init_lock_with_hint: 2126 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2127 if (!codeptr) 2128 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2129 if (ompt_enabled.ompt_callback_lock_init) { 2130 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2131 ompt_mutex_nest_lock, (omp_lock_hint_t)hint, 2132 __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock, 2133 codeptr); 2134 } 2135 #endif 2136 } 2137 2138 #endif // KMP_USE_DYNAMIC_LOCK 2139 2140 /* initialize the lock */ 2141 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2142 #if KMP_USE_DYNAMIC_LOCK 2143 2144 KMP_DEBUG_ASSERT(__kmp_init_serial); 2145 if (__kmp_env_consistency_check && user_lock == NULL) { 2146 KMP_FATAL(LockIsUninitialized, "omp_init_lock"); 2147 } 2148 __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq); 2149 2150 #if OMPT_SUPPORT && OMPT_OPTIONAL 2151 // This is the case, if called from omp_init_lock_with_hint: 2152 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2153 if (!codeptr) 2154 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2155 if (ompt_enabled.ompt_callback_lock_init) { 2156 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2157 ompt_mutex_lock, omp_lock_hint_none, 2158 __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock, 2159 codeptr); 2160 } 2161 #endif 2162 2163 #else // KMP_USE_DYNAMIC_LOCK 2164 2165 static char const *const func = "omp_init_lock"; 2166 kmp_user_lock_p lck; 2167 KMP_DEBUG_ASSERT(__kmp_init_serial); 2168 2169 if (__kmp_env_consistency_check) { 2170 if (user_lock == NULL) { 2171 KMP_FATAL(LockIsUninitialized, func); 2172 } 2173 } 2174 2175 KMP_CHECK_USER_LOCK_INIT(); 2176 2177 if ((__kmp_user_lock_kind == lk_tas) && 2178 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2179 lck = (kmp_user_lock_p)user_lock; 2180 } 2181 #if KMP_USE_FUTEX 2182 else if ((__kmp_user_lock_kind == lk_futex) && 2183 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2184 lck = (kmp_user_lock_p)user_lock; 2185 } 2186 #endif 2187 else { 2188 lck = __kmp_user_lock_allocate(user_lock, gtid, 0); 2189 } 2190 INIT_LOCK(lck); 2191 __kmp_set_user_lock_location(lck, loc); 2192 2193 #if OMPT_SUPPORT && OMPT_OPTIONAL 2194 // This is the case, if called from omp_init_lock_with_hint: 2195 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2196 if (!codeptr) 2197 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2198 if (ompt_enabled.ompt_callback_lock_init) { 2199 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2200 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), 2201 (ompt_wait_id_t)user_lock, codeptr); 2202 } 2203 #endif 2204 2205 #if USE_ITT_BUILD 2206 __kmp_itt_lock_creating(lck); 2207 #endif /* USE_ITT_BUILD */ 2208 2209 #endif // KMP_USE_DYNAMIC_LOCK 2210 } // __kmpc_init_lock 2211 2212 /* initialize the lock */ 2213 void 
__kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2214 #if KMP_USE_DYNAMIC_LOCK 2215 2216 KMP_DEBUG_ASSERT(__kmp_init_serial); 2217 if (__kmp_env_consistency_check && user_lock == NULL) { 2218 KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock"); 2219 } 2220 __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq); 2221 2222 #if OMPT_SUPPORT && OMPT_OPTIONAL 2223 // This is the case, if called from omp_init_lock_with_hint: 2224 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2225 if (!codeptr) 2226 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2227 if (ompt_enabled.ompt_callback_lock_init) { 2228 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2229 ompt_mutex_nest_lock, omp_lock_hint_none, 2230 __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock, 2231 codeptr); 2232 } 2233 #endif 2234 2235 #else // KMP_USE_DYNAMIC_LOCK 2236 2237 static char const *const func = "omp_init_nest_lock"; 2238 kmp_user_lock_p lck; 2239 KMP_DEBUG_ASSERT(__kmp_init_serial); 2240 2241 if (__kmp_env_consistency_check) { 2242 if (user_lock == NULL) { 2243 KMP_FATAL(LockIsUninitialized, func); 2244 } 2245 } 2246 2247 KMP_CHECK_USER_LOCK_INIT(); 2248 2249 if ((__kmp_user_lock_kind == lk_tas) && 2250 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2251 OMP_NEST_LOCK_T_SIZE)) { 2252 lck = (kmp_user_lock_p)user_lock; 2253 } 2254 #if KMP_USE_FUTEX 2255 else if ((__kmp_user_lock_kind == lk_futex) && 2256 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2257 OMP_NEST_LOCK_T_SIZE)) { 2258 lck = (kmp_user_lock_p)user_lock; 2259 } 2260 #endif 2261 else { 2262 lck = __kmp_user_lock_allocate(user_lock, gtid, 0); 2263 } 2264 2265 INIT_NESTED_LOCK(lck); 2266 __kmp_set_user_lock_location(lck, loc); 2267 2268 #if OMPT_SUPPORT && OMPT_OPTIONAL 2269 // This is the case, if called from omp_init_lock_with_hint: 2270 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2271 if (!codeptr) 2272 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2273 if (ompt_enabled.ompt_callback_lock_init) { 2274 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2275 ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), 2276 (ompt_wait_id_t)user_lock, codeptr); 2277 } 2278 #endif 2279 2280 #if USE_ITT_BUILD 2281 __kmp_itt_lock_creating(lck); 2282 #endif /* USE_ITT_BUILD */ 2283 2284 #endif // KMP_USE_DYNAMIC_LOCK 2285 } // __kmpc_init_nest_lock 2286 2287 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2288 #if KMP_USE_DYNAMIC_LOCK 2289 2290 #if USE_ITT_BUILD 2291 kmp_user_lock_p lck; 2292 if (KMP_EXTRACT_D_TAG(user_lock) == 0) { 2293 lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock; 2294 } else { 2295 lck = (kmp_user_lock_p)user_lock; 2296 } 2297 __kmp_itt_lock_destroyed(lck); 2298 #endif 2299 #if OMPT_SUPPORT && OMPT_OPTIONAL 2300 // This is the case, if called from omp_init_lock_with_hint: 2301 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2302 if (!codeptr) 2303 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2304 if (ompt_enabled.ompt_callback_lock_destroy) { 2305 kmp_user_lock_p lck; 2306 if (KMP_EXTRACT_D_TAG(user_lock) == 0) { 2307 lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock; 2308 } else { 2309 lck = (kmp_user_lock_p)user_lock; 2310 } 2311 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( 2312 ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr); 2313 } 2314 #endif 2315 KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock); 2316 #else 2317 kmp_user_lock_p lck; 2318 2319 if 
((__kmp_user_lock_kind == lk_tas) && 2320 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2321 lck = (kmp_user_lock_p)user_lock; 2322 } 2323 #if KMP_USE_FUTEX 2324 else if ((__kmp_user_lock_kind == lk_futex) && 2325 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2326 lck = (kmp_user_lock_p)user_lock; 2327 } 2328 #endif 2329 else { 2330 lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock"); 2331 } 2332 2333 #if OMPT_SUPPORT && OMPT_OPTIONAL 2334 // This is the case, if called from omp_init_lock_with_hint: 2335 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2336 if (!codeptr) 2337 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2338 if (ompt_enabled.ompt_callback_lock_destroy) { 2339 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( 2340 ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr); 2341 } 2342 #endif 2343 2344 #if USE_ITT_BUILD 2345 __kmp_itt_lock_destroyed(lck); 2346 #endif /* USE_ITT_BUILD */ 2347 DESTROY_LOCK(lck); 2348 2349 if ((__kmp_user_lock_kind == lk_tas) && 2350 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2351 ; 2352 } 2353 #if KMP_USE_FUTEX 2354 else if ((__kmp_user_lock_kind == lk_futex) && 2355 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2356 ; 2357 } 2358 #endif 2359 else { 2360 __kmp_user_lock_free(user_lock, gtid, lck); 2361 } 2362 #endif // KMP_USE_DYNAMIC_LOCK 2363 } // __kmpc_destroy_lock 2364 2365 /* destroy the lock */ 2366 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2367 #if KMP_USE_DYNAMIC_LOCK 2368 2369 #if USE_ITT_BUILD 2370 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock); 2371 __kmp_itt_lock_destroyed(ilk->lock); 2372 #endif 2373 #if OMPT_SUPPORT && OMPT_OPTIONAL 2374 // This is the case, if called from omp_init_lock_with_hint: 2375 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2376 if (!codeptr) 2377 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2378 if (ompt_enabled.ompt_callback_lock_destroy) { 2379 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( 2380 ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr); 2381 } 2382 #endif 2383 KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock); 2384 2385 #else // KMP_USE_DYNAMIC_LOCK 2386 2387 kmp_user_lock_p lck; 2388 2389 if ((__kmp_user_lock_kind == lk_tas) && 2390 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2391 OMP_NEST_LOCK_T_SIZE)) { 2392 lck = (kmp_user_lock_p)user_lock; 2393 } 2394 #if KMP_USE_FUTEX 2395 else if ((__kmp_user_lock_kind == lk_futex) && 2396 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2397 OMP_NEST_LOCK_T_SIZE)) { 2398 lck = (kmp_user_lock_p)user_lock; 2399 } 2400 #endif 2401 else { 2402 lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock"); 2403 } 2404 2405 #if OMPT_SUPPORT && OMPT_OPTIONAL 2406 // This is the case, if called from omp_init_lock_with_hint: 2407 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2408 if (!codeptr) 2409 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2410 if (ompt_enabled.ompt_callback_lock_destroy) { 2411 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( 2412 ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr); 2413 } 2414 #endif 2415 2416 #if USE_ITT_BUILD 2417 __kmp_itt_lock_destroyed(lck); 2418 #endif /* USE_ITT_BUILD */ 2419 2420 DESTROY_NESTED_LOCK(lck); 2421 2422 if ((__kmp_user_lock_kind == lk_tas) && 2423 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2424 OMP_NEST_LOCK_T_SIZE)) { 2425 ; 2426 } 2427 #if KMP_USE_FUTEX 2428 else if ((__kmp_user_lock_kind == lk_futex) && 2429 
(sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2430 OMP_NEST_LOCK_T_SIZE)) { 2431 ; 2432 } 2433 #endif 2434 else { 2435 __kmp_user_lock_free(user_lock, gtid, lck); 2436 } 2437 #endif // KMP_USE_DYNAMIC_LOCK 2438 } // __kmpc_destroy_nest_lock 2439 2440 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2441 KMP_COUNT_BLOCK(OMP_set_lock); 2442 #if KMP_USE_DYNAMIC_LOCK 2443 int tag = KMP_EXTRACT_D_TAG(user_lock); 2444 #if USE_ITT_BUILD 2445 __kmp_itt_lock_acquiring( 2446 (kmp_user_lock_p) 2447 user_lock); // itt function will get to the right lock object. 2448 #endif 2449 #if OMPT_SUPPORT && OMPT_OPTIONAL 2450 // This is the case, if called from omp_init_lock_with_hint: 2451 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2452 if (!codeptr) 2453 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2454 if (ompt_enabled.ompt_callback_mutex_acquire) { 2455 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2456 ompt_mutex_lock, omp_lock_hint_none, 2457 __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock, 2458 codeptr); 2459 } 2460 #endif 2461 #if KMP_USE_INLINED_TAS 2462 if (tag == locktag_tas && !__kmp_env_consistency_check) { 2463 KMP_ACQUIRE_TAS_LOCK(user_lock, gtid); 2464 } else 2465 #elif KMP_USE_INLINED_FUTEX 2466 if (tag == locktag_futex && !__kmp_env_consistency_check) { 2467 KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid); 2468 } else 2469 #endif 2470 { 2471 __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid); 2472 } 2473 #if USE_ITT_BUILD 2474 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 2475 #endif 2476 #if OMPT_SUPPORT && OMPT_OPTIONAL 2477 if (ompt_enabled.ompt_callback_mutex_acquired) { 2478 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2479 ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr); 2480 } 2481 #endif 2482 2483 #else // KMP_USE_DYNAMIC_LOCK 2484 2485 kmp_user_lock_p lck; 2486 2487 if ((__kmp_user_lock_kind == lk_tas) && 2488 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2489 lck = (kmp_user_lock_p)user_lock; 2490 } 2491 #if KMP_USE_FUTEX 2492 else if ((__kmp_user_lock_kind == lk_futex) && 2493 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2494 lck = (kmp_user_lock_p)user_lock; 2495 } 2496 #endif 2497 else { 2498 lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock"); 2499 } 2500 2501 #if USE_ITT_BUILD 2502 __kmp_itt_lock_acquiring(lck); 2503 #endif /* USE_ITT_BUILD */ 2504 #if OMPT_SUPPORT && OMPT_OPTIONAL 2505 // This is the case, if called from omp_init_lock_with_hint: 2506 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2507 if (!codeptr) 2508 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2509 if (ompt_enabled.ompt_callback_mutex_acquire) { 2510 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2511 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), 2512 (ompt_wait_id_t)lck, codeptr); 2513 } 2514 #endif 2515 2516 ACQUIRE_LOCK(lck, gtid); 2517 2518 #if USE_ITT_BUILD 2519 __kmp_itt_lock_acquired(lck); 2520 #endif /* USE_ITT_BUILD */ 2521 2522 #if OMPT_SUPPORT && OMPT_OPTIONAL 2523 if (ompt_enabled.ompt_callback_mutex_acquired) { 2524 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2525 ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr); 2526 } 2527 #endif 2528 2529 #endif // KMP_USE_DYNAMIC_LOCK 2530 } 2531 2532 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2533 #if KMP_USE_DYNAMIC_LOCK 2534 2535 #if USE_ITT_BUILD 2536 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); 2537 #endif 2538 #if OMPT_SUPPORT && OMPT_OPTIONAL 2539 // This is 
the case, if called from omp_init_lock_with_hint: 2540 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2541 if (!codeptr) 2542 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2543 if (ompt_enabled.enabled) { 2544 if (ompt_enabled.ompt_callback_mutex_acquire) { 2545 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2546 ompt_mutex_nest_lock, omp_lock_hint_none, 2547 __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock, 2548 codeptr); 2549 } 2550 } 2551 #endif 2552 int acquire_status = 2553 KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid); 2554 #if USE_ITT_BUILD 2555 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 2556 #endif 2557 2558 #if OMPT_SUPPORT && OMPT_OPTIONAL 2559 if (ompt_enabled.enabled) { 2560 if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) { 2561 if (ompt_enabled.ompt_callback_mutex_acquired) { 2562 // lock_first 2563 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2564 ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr); 2565 } 2566 } else { 2567 if (ompt_enabled.ompt_callback_nest_lock) { 2568 // lock_next 2569 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 2570 ompt_scope_begin, (ompt_wait_id_t)user_lock, codeptr); 2571 } 2572 } 2573 } 2574 #endif 2575 2576 #else // KMP_USE_DYNAMIC_LOCK 2577 int acquire_status; 2578 kmp_user_lock_p lck; 2579 2580 if ((__kmp_user_lock_kind == lk_tas) && 2581 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2582 OMP_NEST_LOCK_T_SIZE)) { 2583 lck = (kmp_user_lock_p)user_lock; 2584 } 2585 #if KMP_USE_FUTEX 2586 else if ((__kmp_user_lock_kind == lk_futex) && 2587 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2588 OMP_NEST_LOCK_T_SIZE)) { 2589 lck = (kmp_user_lock_p)user_lock; 2590 } 2591 #endif 2592 else { 2593 lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock"); 2594 } 2595 2596 #if USE_ITT_BUILD 2597 __kmp_itt_lock_acquiring(lck); 2598 #endif /* USE_ITT_BUILD */ 2599 #if OMPT_SUPPORT && OMPT_OPTIONAL 2600 // This is the case, if called from omp_init_lock_with_hint: 2601 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2602 if (!codeptr) 2603 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2604 if (ompt_enabled.enabled) { 2605 if (ompt_enabled.ompt_callback_mutex_acquire) { 2606 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2607 ompt_mutex_nest_lock, omp_lock_hint_none, 2608 __ompt_get_mutex_impl_type(), (ompt_wait_id_t)lck, codeptr); 2609 } 2610 } 2611 #endif 2612 2613 ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status); 2614 2615 #if USE_ITT_BUILD 2616 __kmp_itt_lock_acquired(lck); 2617 #endif /* USE_ITT_BUILD */ 2618 2619 #if OMPT_SUPPORT && OMPT_OPTIONAL 2620 if (ompt_enabled.enabled) { 2621 if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) { 2622 if (ompt_enabled.ompt_callback_mutex_acquired) { 2623 // lock_first 2624 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2625 ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr); 2626 } 2627 } else { 2628 if (ompt_enabled.ompt_callback_nest_lock) { 2629 // lock_next 2630 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 2631 ompt_scope_begin, (ompt_wait_id_t)lck, codeptr); 2632 } 2633 } 2634 } 2635 #endif 2636 2637 #endif // KMP_USE_DYNAMIC_LOCK 2638 } 2639 2640 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2641 #if KMP_USE_DYNAMIC_LOCK 2642 2643 int tag = KMP_EXTRACT_D_TAG(user_lock); 2644 #if USE_ITT_BUILD 2645 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2646 #endif 2647 #if KMP_USE_INLINED_TAS 2648 if (tag == locktag_tas && 
!__kmp_env_consistency_check) { 2649 KMP_RELEASE_TAS_LOCK(user_lock, gtid); 2650 } else 2651 #elif KMP_USE_INLINED_FUTEX 2652 if (tag == locktag_futex && !__kmp_env_consistency_check) { 2653 KMP_RELEASE_FUTEX_LOCK(user_lock, gtid); 2654 } else 2655 #endif 2656 { 2657 __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid); 2658 } 2659 2660 #if OMPT_SUPPORT && OMPT_OPTIONAL 2661 // This is the case, if called from omp_init_lock_with_hint: 2662 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2663 if (!codeptr) 2664 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2665 if (ompt_enabled.ompt_callback_mutex_released) { 2666 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2667 ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr); 2668 } 2669 #endif 2670 2671 #else // KMP_USE_DYNAMIC_LOCK 2672 2673 kmp_user_lock_p lck; 2674 2675 /* Can't use serial interval since not block structured */ 2676 /* release the lock */ 2677 2678 if ((__kmp_user_lock_kind == lk_tas) && 2679 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2680 #if KMP_OS_LINUX && \ 2681 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 2682 // "fast" path implemented to fix customer performance issue 2683 #if USE_ITT_BUILD 2684 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2685 #endif /* USE_ITT_BUILD */ 2686 TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0); 2687 KMP_MB(); 2688 2689 #if OMPT_SUPPORT && OMPT_OPTIONAL 2690 // This is the case, if called from omp_init_lock_with_hint: 2691 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2692 if (!codeptr) 2693 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2694 if (ompt_enabled.ompt_callback_mutex_released) { 2695 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2696 ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr); 2697 } 2698 #endif 2699 2700 return; 2701 #else 2702 lck = (kmp_user_lock_p)user_lock; 2703 #endif 2704 } 2705 #if KMP_USE_FUTEX 2706 else if ((__kmp_user_lock_kind == lk_futex) && 2707 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2708 lck = (kmp_user_lock_p)user_lock; 2709 } 2710 #endif 2711 else { 2712 lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock"); 2713 } 2714 2715 #if USE_ITT_BUILD 2716 __kmp_itt_lock_releasing(lck); 2717 #endif /* USE_ITT_BUILD */ 2718 2719 RELEASE_LOCK(lck, gtid); 2720 2721 #if OMPT_SUPPORT && OMPT_OPTIONAL 2722 // This is the case, if called from omp_init_lock_with_hint: 2723 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2724 if (!codeptr) 2725 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2726 if (ompt_enabled.ompt_callback_mutex_released) { 2727 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2728 ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr); 2729 } 2730 #endif 2731 2732 #endif // KMP_USE_DYNAMIC_LOCK 2733 } 2734 2735 /* release the lock */ 2736 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2737 #if KMP_USE_DYNAMIC_LOCK 2738 2739 #if USE_ITT_BUILD 2740 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2741 #endif 2742 int release_status = 2743 KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid); 2744 2745 #if OMPT_SUPPORT && OMPT_OPTIONAL 2746 // This is the case, if called from omp_init_lock_with_hint: 2747 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2748 if (!codeptr) 2749 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2750 if (ompt_enabled.enabled) { 2751 if (release_status == KMP_LOCK_RELEASED) { 2752 if (ompt_enabled.ompt_callback_mutex_released) { 2753 // release_lock_last 2754 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 
2755 ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr); 2756 } 2757 } else if (ompt_enabled.ompt_callback_nest_lock) { 2758 // release_lock_prev 2759 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 2760 ompt_scope_end, (ompt_wait_id_t)user_lock, codeptr); 2761 } 2762 } 2763 #endif 2764 2765 #else // KMP_USE_DYNAMIC_LOCK 2766 2767 kmp_user_lock_p lck; 2768 2769 /* Can't use serial interval since not block structured */ 2770 2771 if ((__kmp_user_lock_kind == lk_tas) && 2772 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2773 OMP_NEST_LOCK_T_SIZE)) { 2774 #if KMP_OS_LINUX && \ 2775 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 2776 // "fast" path implemented to fix customer performance issue 2777 kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock; 2778 #if USE_ITT_BUILD 2779 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2780 #endif /* USE_ITT_BUILD */ 2781 2782 #if OMPT_SUPPORT && OMPT_OPTIONAL 2783 int release_status = KMP_LOCK_STILL_HELD; 2784 #endif 2785 2786 if (--(tl->lk.depth_locked) == 0) { 2787 TCW_4(tl->lk.poll, 0); 2788 #if OMPT_SUPPORT && OMPT_OPTIONAL 2789 release_status = KMP_LOCK_RELEASED; 2790 #endif 2791 } 2792 KMP_MB(); 2793 2794 #if OMPT_SUPPORT && OMPT_OPTIONAL 2795 // This is the case, if called from omp_init_lock_with_hint: 2796 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2797 if (!codeptr) 2798 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2799 if (ompt_enabled.enabled) { 2800 if (release_status == KMP_LOCK_RELEASED) { 2801 if (ompt_enabled.ompt_callback_mutex_released) { 2802 // release_lock_last 2803 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2804 ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr); 2805 } 2806 } else if (ompt_enabled.ompt_callback_nest_lock) { 2807 // release_lock_previous 2808 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 2809 ompt_mutex_scope_end, (ompt_wait_id_t)lck, codeptr); 2810 } 2811 } 2812 #endif 2813 2814 return; 2815 #else 2816 lck = (kmp_user_lock_p)user_lock; 2817 #endif 2818 } 2819 #if KMP_USE_FUTEX 2820 else if ((__kmp_user_lock_kind == lk_futex) && 2821 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2822 OMP_NEST_LOCK_T_SIZE)) { 2823 lck = (kmp_user_lock_p)user_lock; 2824 } 2825 #endif 2826 else { 2827 lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock"); 2828 } 2829 2830 #if USE_ITT_BUILD 2831 __kmp_itt_lock_releasing(lck); 2832 #endif /* USE_ITT_BUILD */ 2833 2834 int release_status; 2835 release_status = RELEASE_NESTED_LOCK(lck, gtid); 2836 #if OMPT_SUPPORT && OMPT_OPTIONAL 2837 // This is the case, if called from omp_init_lock_with_hint: 2838 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2839 if (!codeptr) 2840 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2841 if (ompt_enabled.enabled) { 2842 if (release_status == KMP_LOCK_RELEASED) { 2843 if (ompt_enabled.ompt_callback_mutex_released) { 2844 // release_lock_last 2845 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2846 ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr); 2847 } 2848 } else if (ompt_enabled.ompt_callback_nest_lock) { 2849 // release_lock_previous 2850 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 2851 ompt_mutex_scope_end, (ompt_wait_id_t)lck, codeptr); 2852 } 2853 } 2854 #endif 2855 2856 #endif // KMP_USE_DYNAMIC_LOCK 2857 } 2858 2859 /* try to acquire the lock */ 2860 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2861 KMP_COUNT_BLOCK(OMP_test_lock); 2862 2863 #if KMP_USE_DYNAMIC_LOCK 2864 int rc; 2865 int tag 
= KMP_EXTRACT_D_TAG(user_lock); 2866 #if USE_ITT_BUILD 2867 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); 2868 #endif 2869 #if OMPT_SUPPORT && OMPT_OPTIONAL 2870 // This is the case, if called from omp_init_lock_with_hint: 2871 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2872 if (!codeptr) 2873 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2874 if (ompt_enabled.ompt_callback_mutex_acquire) { 2875 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2876 ompt_mutex_lock, omp_lock_hint_none, 2877 __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock, 2878 codeptr); 2879 } 2880 #endif 2881 #if KMP_USE_INLINED_TAS 2882 if (tag == locktag_tas && !__kmp_env_consistency_check) { 2883 KMP_TEST_TAS_LOCK(user_lock, gtid, rc); 2884 } else 2885 #elif KMP_USE_INLINED_FUTEX 2886 if (tag == locktag_futex && !__kmp_env_consistency_check) { 2887 KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc); 2888 } else 2889 #endif 2890 { 2891 rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid); 2892 } 2893 if (rc) { 2894 #if USE_ITT_BUILD 2895 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 2896 #endif 2897 #if OMPT_SUPPORT && OMPT_OPTIONAL 2898 if (ompt_enabled.ompt_callback_mutex_acquired) { 2899 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2900 ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr); 2901 } 2902 #endif 2903 return FTN_TRUE; 2904 } else { 2905 #if USE_ITT_BUILD 2906 __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock); 2907 #endif 2908 return FTN_FALSE; 2909 } 2910 2911 #else // KMP_USE_DYNAMIC_LOCK 2912 2913 kmp_user_lock_p lck; 2914 int rc; 2915 2916 if ((__kmp_user_lock_kind == lk_tas) && 2917 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2918 lck = (kmp_user_lock_p)user_lock; 2919 } 2920 #if KMP_USE_FUTEX 2921 else if ((__kmp_user_lock_kind == lk_futex) && 2922 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2923 lck = (kmp_user_lock_p)user_lock; 2924 } 2925 #endif 2926 else { 2927 lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock"); 2928 } 2929 2930 #if USE_ITT_BUILD 2931 __kmp_itt_lock_acquiring(lck); 2932 #endif /* USE_ITT_BUILD */ 2933 #if OMPT_SUPPORT && OMPT_OPTIONAL 2934 // This is the case, if called from omp_init_lock_with_hint: 2935 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2936 if (!codeptr) 2937 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2938 if (ompt_enabled.ompt_callback_mutex_acquire) { 2939 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2940 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), 2941 (ompt_wait_id_t)lck, codeptr); 2942 } 2943 #endif 2944 2945 rc = TEST_LOCK(lck, gtid); 2946 #if USE_ITT_BUILD 2947 if (rc) { 2948 __kmp_itt_lock_acquired(lck); 2949 } else { 2950 __kmp_itt_lock_cancelled(lck); 2951 } 2952 #endif /* USE_ITT_BUILD */ 2953 #if OMPT_SUPPORT && OMPT_OPTIONAL 2954 if (rc && ompt_enabled.ompt_callback_mutex_acquired) { 2955 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2956 ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr); 2957 } 2958 #endif 2959 2960 return (rc ? 
FTN_TRUE : FTN_FALSE);

  /* Can't use serial interval since not block structured */

#endif // KMP_USE_DYNAMIC_LOCK
}

/* try to acquire the lock */
int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#if KMP_USE_DYNAMIC_LOCK
  int rc;
#if USE_ITT_BUILD
  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_nest_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
        codeptr);
  }
#endif
  rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
#if USE_ITT_BUILD
  if (rc) {
    __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
  } else {
    __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
  }
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled && rc) {
    if (rc == 1) {
      if (ompt_enabled.ompt_callback_mutex_acquired) {
        // lock_first
        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
            ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
      }
    } else {
      if (ompt_enabled.ompt_callback_nest_lock) {
        // lock_next
        ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_begin, (ompt_wait_id_t)user_lock, codeptr);
      }
    }
  }
#endif
  return rc;

#else // KMP_USE_DYNAMIC_LOCK

  kmp_user_lock_p lck;
  int rc;

  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
       OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
            OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
  }

#if USE_ITT_BUILD
  __kmp_itt_lock_acquiring(lck);
#endif /* USE_ITT_BUILD */

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.enabled &&
      ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_nest_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(), (ompt_wait_id_t)lck, codeptr);
  }
#endif

  rc = TEST_NESTED_LOCK(lck, gtid);
#if USE_ITT_BUILD
  if (rc) {
    __kmp_itt_lock_acquired(lck);
  } else {
    __kmp_itt_lock_cancelled(lck);
  }
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled && rc) {
    if (rc == 1) {
      if (ompt_enabled.ompt_callback_mutex_acquired) {
        // lock_first
        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
            ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr);
      }
    } else {
      if (ompt_enabled.ompt_callback_nest_lock) {
        // lock_next
ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 3071 ompt_mutex_scope_begin, (ompt_wait_id_t)lck, codeptr); 3072 } 3073 } 3074 } 3075 #endif 3076 return rc; 3077 3078 /* Can't use serial interval since not block structured */ 3079 3080 #endif // KMP_USE_DYNAMIC_LOCK 3081 } 3082 3083 // Interface to fast scalable reduce methods routines 3084 3085 // keep the selected method in a thread local structure for cross-function 3086 // usage: will be used in __kmpc_end_reduce* functions; 3087 // another solution: to re-determine the method one more time in 3088 // __kmpc_end_reduce* functions (new prototype required then) 3089 // AT: which solution is better? 3090 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod) \ 3091 ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod)) 3092 3093 #define __KMP_GET_REDUCTION_METHOD(gtid) \ 3094 (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) 3095 3096 // description of the packed_reduction_method variable: look at the macros in 3097 // kmp.h 3098 3099 // used in a critical section reduce block 3100 static __forceinline void 3101 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid, 3102 kmp_critical_name *crit) { 3103 3104 // this lock was visible to a customer and to the threading profile tool as a 3105 // serial overhead span (although it's used for an internal purpose only) 3106 // why was it visible in previous implementation? 3107 // should we keep it visible in new reduce block? 3108 kmp_user_lock_p lck; 3109 3110 #if KMP_USE_DYNAMIC_LOCK 3111 3112 kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit; 3113 // Check if it is initialized. 3114 if (*lk == 0) { 3115 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) { 3116 KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0, 3117 KMP_GET_D_TAG(__kmp_user_lock_seq)); 3118 } else { 3119 __kmp_init_indirect_csptr(crit, loc, global_tid, 3120 KMP_GET_I_TAG(__kmp_user_lock_seq)); 3121 } 3122 } 3123 // Branch for accessing the actual lock object and set operation. This 3124 // branching is inevitable since this lock initialization does not follow the 3125 // normal dispatch path (lock table is not used). 3126 if (KMP_EXTRACT_D_TAG(lk) != 0) { 3127 lck = (kmp_user_lock_p)lk; 3128 KMP_DEBUG_ASSERT(lck != NULL); 3129 if (__kmp_env_consistency_check) { 3130 __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq); 3131 } 3132 KMP_D_LOCK_FUNC(lk, set)(lk, global_tid); 3133 } else { 3134 kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk); 3135 lck = ilk->lock; 3136 KMP_DEBUG_ASSERT(lck != NULL); 3137 if (__kmp_env_consistency_check) { 3138 __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq); 3139 } 3140 KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid); 3141 } 3142 3143 #else // KMP_USE_DYNAMIC_LOCK 3144 3145 // We know that the fast reduction code is only emitted by Intel compilers 3146 // with 32 byte critical sections. If there isn't enough space, then we 3147 // have to use a pointer. 
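  // In other words (illustrative restatement): when the selected user-lock
  // type fits into the 32-byte kmp_critical_name buffer, the lock is stored in
  // that buffer directly; otherwise the buffer holds a pointer to a separately
  // allocated lock, obtained via __kmp_get_critical_section_ptr().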
3148 if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) { 3149 lck = (kmp_user_lock_p)crit; 3150 } else { 3151 lck = __kmp_get_critical_section_ptr(crit, loc, global_tid); 3152 } 3153 KMP_DEBUG_ASSERT(lck != NULL); 3154 3155 if (__kmp_env_consistency_check) 3156 __kmp_push_sync(global_tid, ct_critical, loc, lck); 3157 3158 __kmp_acquire_user_lock_with_checks(lck, global_tid); 3159 3160 #endif // KMP_USE_DYNAMIC_LOCK 3161 } 3162 3163 // used in a critical section reduce block 3164 static __forceinline void 3165 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid, 3166 kmp_critical_name *crit) { 3167 3168 kmp_user_lock_p lck; 3169 3170 #if KMP_USE_DYNAMIC_LOCK 3171 3172 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) { 3173 lck = (kmp_user_lock_p)crit; 3174 if (__kmp_env_consistency_check) 3175 __kmp_pop_sync(global_tid, ct_critical, loc); 3176 KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid); 3177 } else { 3178 kmp_indirect_lock_t *ilk = 3179 (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit)); 3180 if (__kmp_env_consistency_check) 3181 __kmp_pop_sync(global_tid, ct_critical, loc); 3182 KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid); 3183 } 3184 3185 #else // KMP_USE_DYNAMIC_LOCK 3186 3187 // We know that the fast reduction code is only emitted by Intel compilers 3188 // with 32 byte critical sections. If there isn't enough space, then we have 3189 // to use a pointer. 3190 if (__kmp_base_user_lock_size > 32) { 3191 lck = *((kmp_user_lock_p *)crit); 3192 KMP_ASSERT(lck != NULL); 3193 } else { 3194 lck = (kmp_user_lock_p)crit; 3195 } 3196 3197 if (__kmp_env_consistency_check) 3198 __kmp_pop_sync(global_tid, ct_critical, loc); 3199 3200 __kmp_release_user_lock_with_checks(lck, global_tid); 3201 3202 #endif // KMP_USE_DYNAMIC_LOCK 3203 } // __kmp_end_critical_section_reduce_block 3204 3205 #if OMP_40_ENABLED 3206 static __forceinline int 3207 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p, 3208 int *task_state) { 3209 kmp_team_t *team; 3210 3211 // Check if we are inside the teams construct? 3212 if (th->th.th_teams_microtask) { 3213 *team_p = team = th->th.th_team; 3214 if (team->t.t_level == th->th.th_teams_level) { 3215 // This is reduction at teams construct. 3216 KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0 3217 // Let's swap teams temporarily for the reduction. 3218 th->th.th_info.ds.ds_tid = team->t.t_master_tid; 3219 th->th.th_team = team->t.t_parent; 3220 th->th.th_team_nproc = th->th.th_team->t.t_nproc; 3221 th->th.th_task_team = th->th.th_team->t.t_task_team[0]; 3222 *task_state = th->th.th_task_state; 3223 th->th.th_task_state = 0; 3224 3225 return 1; 3226 } 3227 } 3228 return 0; 3229 } 3230 3231 static __forceinline void 3232 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) { 3233 // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction. 3234 th->th.th_info.ds.ds_tid = 0; 3235 th->th.th_team = team; 3236 th->th.th_team_nproc = team->t.t_nproc; 3237 th->th.th_task_team = team->t.t_task_team[task_state]; 3238 th->th.th_task_state = task_state; 3239 } 3240 #endif 3241 3242 /* 2.a.i. Reduce Block without a terminating barrier */ 3243 /*! 
3244 @ingroup SYNCHRONIZATION 3245 @param loc source location information 3246 @param global_tid global thread number 3247 @param num_vars number of items (variables) to be reduced 3248 @param reduce_size size of data in bytes to be reduced 3249 @param reduce_data pointer to data to be reduced 3250 @param reduce_func callback function providing reduction operation on two 3251 operands and returning result of reduction in lhs_data 3252 @param lck pointer to the unique lock data structure 3253 @result 1 for the master thread, 0 for all other team threads, 2 for all team 3254 threads if atomic reduction needed 3255 3256 The nowait version is used for a reduce clause with the nowait argument. 3257 */ 3258 kmp_int32 3259 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, 3260 size_t reduce_size, void *reduce_data, 3261 void (*reduce_func)(void *lhs_data, void *rhs_data), 3262 kmp_critical_name *lck) { 3263 3264 KMP_COUNT_BLOCK(REDUCE_nowait); 3265 int retval = 0; 3266 PACKED_REDUCTION_METHOD_T packed_reduction_method; 3267 #if OMP_40_ENABLED 3268 kmp_info_t *th; 3269 kmp_team_t *team; 3270 int teams_swapped = 0, task_state; 3271 #endif 3272 KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid)); 3273 3274 // why do we need this initialization here at all? 3275 // Reduction clause can not be used as a stand-alone directive. 3276 3277 // do not call __kmp_serial_initialize(), it will be called by 3278 // __kmp_parallel_initialize() if needed 3279 // possible detection of false-positive race by the threadchecker ??? 3280 if (!TCR_4(__kmp_init_parallel)) 3281 __kmp_parallel_initialize(); 3282 3283 // check correctness of reduce block nesting 3284 #if KMP_USE_DYNAMIC_LOCK 3285 if (__kmp_env_consistency_check) 3286 __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0); 3287 #else 3288 if (__kmp_env_consistency_check) 3289 __kmp_push_sync(global_tid, ct_reduce, loc, NULL); 3290 #endif 3291 3292 #if OMP_40_ENABLED 3293 th = __kmp_thread_from_gtid(global_tid); 3294 teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state); 3295 #endif // OMP_40_ENABLED 3296 3297 // packed_reduction_method value will be reused by __kmp_end_reduce* function, 3298 // the value should be kept in a variable 3299 // the variable should be either a construct-specific or thread-specific 3300 // property, not a team specific property 3301 // (a thread can reach the next reduce block on the next construct, reduce 3302 // method may differ on the next construct) 3303 // an ident_t "loc" parameter could be used as a construct-specific property 3304 // (what if loc == 0?) 
3305 // (if both construct-specific and team-specific variables were shared, 3306 // then unness extra syncs should be needed) 3307 // a thread-specific variable is better regarding two issues above (next 3308 // construct and extra syncs) 3309 // a thread-specific "th_local.reduction_method" variable is used currently 3310 // each thread executes 'determine' and 'set' lines (no need to execute by one 3311 // thread, to avoid unness extra syncs) 3312 3313 packed_reduction_method = __kmp_determine_reduction_method( 3314 loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck); 3315 __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method); 3316 3317 if (packed_reduction_method == critical_reduce_block) { 3318 3319 __kmp_enter_critical_section_reduce_block(loc, global_tid, lck); 3320 retval = 1; 3321 3322 } else if (packed_reduction_method == empty_reduce_block) { 3323 3324 // usage: if team size == 1, no synchronization is required ( Intel 3325 // platforms only ) 3326 retval = 1; 3327 3328 } else if (packed_reduction_method == atomic_reduce_block) { 3329 3330 retval = 2; 3331 3332 // all threads should do this pop here (because __kmpc_end_reduce_nowait() 3333 // won't be called by the code gen) 3334 // (it's not quite good, because the checking block has been closed by 3335 // this 'pop', 3336 // but atomic operation has not been executed yet, will be executed 3337 // slightly later, literally on next instruction) 3338 if (__kmp_env_consistency_check) 3339 __kmp_pop_sync(global_tid, ct_reduce, loc); 3340 3341 } else if (TEST_REDUCTION_METHOD(packed_reduction_method, 3342 tree_reduce_block)) { 3343 3344 // AT: performance issue: a real barrier here 3345 // AT: (if master goes slow, other threads are blocked here waiting for the 3346 // master to come and release them) 3347 // AT: (it's not what a customer might expect specifying NOWAIT clause) 3348 // AT: (specifying NOWAIT won't result in improvement of performance, it'll 3349 // be confusing to a customer) 3350 // AT: another implementation of *barrier_gather*nowait() (or some other design) 3351 // might go faster and be more in line with sense of NOWAIT 3352 // AT: TO DO: do epcc test and compare times 3353 3354 // this barrier should be invisible to a customer and to the threading profile 3355 // tool (it's neither a terminating barrier nor customer's code, it's 3356 // used for an internal purpose) 3357 #if OMPT_SUPPORT 3358 // JP: can this barrier potentially leed to task scheduling? 3359 // JP: as long as there is a barrier in the implementation, OMPT should and 3360 // will provide the barrier events 3361 // so we set-up the necessary frame/return addresses. 3362 ompt_frame_t *ompt_frame; 3363 if (ompt_enabled.enabled) { 3364 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); 3365 if (ompt_frame->enter_frame == NULL) 3366 ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1); 3367 OMPT_STORE_RETURN_ADDRESS(global_tid); 3368 } 3369 #endif 3370 #if USE_ITT_NOTIFY 3371 __kmp_threads[global_tid]->th.th_ident = loc; 3372 #endif 3373 retval = 3374 __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method), 3375 global_tid, FALSE, reduce_size, reduce_data, reduce_func); 3376 retval = (retval != 0) ? 
(0) : (1); 3377 #if OMPT_SUPPORT && OMPT_OPTIONAL 3378 if (ompt_enabled.enabled) { 3379 ompt_frame->enter_frame = NULL; 3380 } 3381 #endif 3382 3383 // all other workers except master should do this pop here 3384 // ( none of other workers will get to __kmpc_end_reduce_nowait() ) 3385 if (__kmp_env_consistency_check) { 3386 if (retval == 0) { 3387 __kmp_pop_sync(global_tid, ct_reduce, loc); 3388 } 3389 } 3390 3391 } else { 3392 3393 // should never reach this block 3394 KMP_ASSERT(0); // "unexpected method" 3395 } 3396 #if OMP_40_ENABLED 3397 if (teams_swapped) { 3398 __kmp_restore_swapped_teams(th, team, task_state); 3399 } 3400 #endif 3401 KA_TRACE( 3402 10, 3403 ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n", 3404 global_tid, packed_reduction_method, retval)); 3405 3406 return retval; 3407 } 3408 3409 /*! 3410 @ingroup SYNCHRONIZATION 3411 @param loc source location information 3412 @param global_tid global thread id. 3413 @param lck pointer to the unique lock data structure 3414 3415 Finish the execution of a reduce nowait. 3416 */ 3417 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid, 3418 kmp_critical_name *lck) { 3419 3420 PACKED_REDUCTION_METHOD_T packed_reduction_method; 3421 3422 KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid)); 3423 3424 packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid); 3425 3426 if (packed_reduction_method == critical_reduce_block) { 3427 3428 __kmp_end_critical_section_reduce_block(loc, global_tid, lck); 3429 3430 } else if (packed_reduction_method == empty_reduce_block) { 3431 3432 // usage: if team size == 1, no synchronization is required ( on Intel 3433 // platforms only ) 3434 3435 } else if (packed_reduction_method == atomic_reduce_block) { 3436 3437 // neither master nor other workers should get here 3438 // (code gen does not generate this call in case 2: atomic reduce block) 3439 // actually it's better to remove this elseif at all; 3440 // after removal this value will checked by the 'else' and will assert 3441 3442 } else if (TEST_REDUCTION_METHOD(packed_reduction_method, 3443 tree_reduce_block)) { 3444 3445 // only master gets here 3446 3447 } else { 3448 3449 // should never reach this block 3450 KMP_ASSERT(0); // "unexpected method" 3451 } 3452 3453 if (__kmp_env_consistency_check) 3454 __kmp_pop_sync(global_tid, ct_reduce, loc); 3455 3456 KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n", 3457 global_tid, packed_reduction_method)); 3458 3459 return; 3460 } 3461 3462 /* 2.a.ii. Reduce Block with a terminating barrier */ 3463 3464 /*! 3465 @ingroup SYNCHRONIZATION 3466 @param loc source location information 3467 @param global_tid global thread number 3468 @param num_vars number of items (variables) to be reduced 3469 @param reduce_size size of data in bytes to be reduced 3470 @param reduce_data pointer to data to be reduced 3471 @param reduce_func callback function providing reduction operation on two 3472 operands and returning result of reduction in lhs_data 3473 @param lck pointer to the unique lock data structure 3474 @result 1 for the master thread, 0 for all other team threads, 2 for all team 3475 threads if atomic reduction needed 3476 3477 A blocking reduce that includes an implicit barrier. 
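
As an illustration only (actual code generation varies between compilers, and
<tt>loc</tt>, <tt>gtid</tt>, <tt>local_sum</tt>, <tt>global_sum</tt>,
<tt>reduce_func</tt> and <tt>crit</tt> below are placeholder names for the
compiler-emitted ident_t, thread id, reduction variables, reduction callback
and statically allocated kmp_critical_name), the return value is typically
used to select the reduction path roughly as follows:

@code
switch (__kmpc_reduce(&loc, gtid, 1, sizeof(local_sum), &local_sum,
                      reduce_func, &crit)) {
case 1: // this thread combines the partial results directly
  global_sum += local_sum;
  __kmpc_end_reduce(&loc, gtid, &crit);
  break;
case 2: // combine using atomic operations
  // ... atomic update of global_sum ...
  __kmpc_end_reduce(&loc, gtid, &crit); // blocking form: run the final barrier
  break;
default: // 0: nothing more for this thread to do here
  break;
}
@endcode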
*/
kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
                        size_t reduce_size, void *reduce_data,
                        void (*reduce_func)(void *lhs_data, void *rhs_data),
                        kmp_critical_name *lck) {
  KMP_COUNT_BLOCK(REDUCE_wait);
  int retval = 0;
  PACKED_REDUCTION_METHOD_T packed_reduction_method;
#if OMP_40_ENABLED
  kmp_info_t *th;
  kmp_team_t *team;
  int teams_swapped = 0, task_state;
#endif

  KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));

  // why do we need this initialization here at all?
  // Reduction clause can not be a stand-alone directive.

  // do not call __kmp_serial_initialize(), it will be called by
  // __kmp_parallel_initialize() if needed
  // possible detection of false-positive race by the threadchecker ???
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

// check correctness of reduce block nesting
#if KMP_USE_DYNAMIC_LOCK
  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
#else
  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
#endif

#if OMP_40_ENABLED
  th = __kmp_thread_from_gtid(global_tid);
  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
#endif // OMP_40_ENABLED

  packed_reduction_method = __kmp_determine_reduction_method(
      loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
  __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);

  if (packed_reduction_method == critical_reduce_block) {

    __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
    retval = 1;

  } else if (packed_reduction_method == empty_reduce_block) {

    // usage: if team size == 1, no synchronization is required (Intel
    // platforms only)
    retval = 1;

  } else if (packed_reduction_method == atomic_reduce_block) {

    retval = 2;

  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                   tree_reduce_block)) {

// case tree_reduce_block:
// this barrier should be visible to a customer and to the threading profile
// tool (it's a terminating barrier on constructs if NOWAIT not specified)
#if OMPT_SUPPORT
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame == NULL)
        ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident =
        loc; // needed for correct notification of frames
#endif
    retval =
        __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
                      global_tid, TRUE, reduce_size, reduce_data, reduce_func);
    retval = (retval != 0) ? (0) : (1);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = NULL;
    }
#endif

    // all workers except the master should do this pop here
    // (no workers other than the master will enter __kmpc_end_reduce())
    if (__kmp_env_consistency_check) {
      if (retval == 0) { // 0: all other workers; 1: master
        __kmp_pop_sync(global_tid, ct_reduce, loc);
      }
    }

  } else {

    // should never reach this block
    KMP_ASSERT(0); // "unexpected method"
  }
#if OMP_40_ENABLED
  if (teams_swapped) {
    __kmp_restore_swapped_teams(th, team, task_state);
  }
#endif

  KA_TRACE(10,
           ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
            global_tid, packed_reduction_method, retval));

  return retval;
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid global thread id.
@param lck pointer to the unique lock data structure

Finish the execution of a blocking reduce.
The <tt>lck</tt> pointer must be the same as that used in the corresponding
start function.
*/
void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
                       kmp_critical_name *lck) {

  PACKED_REDUCTION_METHOD_T packed_reduction_method;
#if OMP_40_ENABLED
  kmp_info_t *th;
  kmp_team_t *team;
  int teams_swapped = 0, task_state;
#endif

  KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));

#if OMP_40_ENABLED
  th = __kmp_thread_from_gtid(global_tid);
  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
#endif // OMP_40_ENABLED

  packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);

  // this barrier should be visible to a customer and to the threading profile
  // tool (it's a terminating barrier on constructs if NOWAIT not specified)

  if (packed_reduction_method == critical_reduce_block) {

    __kmp_end_critical_section_reduce_block(loc, global_tid, lck);

// TODO: implicit barrier: should be exposed
#if OMPT_SUPPORT
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame == NULL)
        ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = NULL;
    }
#endif

  } else if (packed_reduction_method == empty_reduce_block) {

    // usage: if team size == 1, no synchronization is required (on Intel
    // platforms only)

// TODO: implicit barrier: should be exposed
#if OMPT_SUPPORT
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame == NULL)
        ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = NULL;
    }
#endif

  } else if (packed_reduction_method == atomic_reduce_block) {

#if OMPT_SUPPORT
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame == NULL)
        ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
// TODO: implicit barrier: should be exposed
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = NULL;
    }
#endif

  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                   tree_reduce_block)) {

    // only master executes here (master releases all other workers)
    __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
                            global_tid);

  } else {

    // should never reach this block
    KMP_ASSERT(0); // "unexpected method"
  }
#if OMP_40_ENABLED
  if (teams_swapped) {
    __kmp_restore_swapped_teams(th, team, task_state);
  }
#endif

  if (__kmp_env_consistency_check)
    __kmp_pop_sync(global_tid, ct_reduce, loc);

  KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
                global_tid, packed_reduction_method));

  return;
}

#undef __KMP_GET_REDUCTION_METHOD
#undef __KMP_SET_REDUCTION_METHOD

/* end of interface to fast scalable reduce routines */

kmp_uint64 __kmpc_get_taskid() {

  kmp_int32 gtid;
  kmp_info_t *thread;

  gtid = __kmp_get_gtid();
  if (gtid < 0) {
    return 0;
  }
  thread = __kmp_thread_from_gtid(gtid);
  return thread->th.th_current_task->td_task_id;

} // __kmpc_get_taskid

kmp_uint64 __kmpc_get_parent_taskid() {

  kmp_int32 gtid;
  kmp_info_t *thread;
  kmp_taskdata_t *parent_task;

  gtid = __kmp_get_gtid();
  if (gtid < 0) {
    return 0;
  }
  thread = __kmp_thread_from_gtid(gtid);
  parent_task = thread->th.th_current_task->td_parent;
  return (parent_task == NULL ? 0 : parent_task->td_task_id);

} // __kmpc_get_parent_taskid

#if OMP_45_ENABLED
/*!
@ingroup WORK_SHARING
@param loc source location information.
@param gtid global thread number.
@param num_dims number of associated doacross loops.
@param dims info on loops bounds.

Initialize doacross loop information.
The compiler is expected to send us inclusive bounds,
e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
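
As a rough sketch only (actual compiler-generated code and iteration chunking
differ), a loop of that shape with an <tt>ordered(1)</tt> clause and
cross-iteration <tt>depend(sink)</tt>/<tt>depend(source)</tt> dependences
might be lowered along these lines; the local names are hypothetical:

@code
struct kmp_dim dim;
dim.lo = 2; // inclusive lower bound
dim.up = 8; // inclusive upper bound
dim.st = 2; // loop increment
__kmpc_doacross_init(loc, gtid, 1, &dim);
for (i = chunk_lb; i <= chunk_ub; i += 2) { // this thread's iterations
  long long vec = i - 2;
  __kmpc_doacross_wait(loc, gtid, &vec); // ordered depend(sink: i - 2)
  // ... loop body ...
  vec = i;
  __kmpc_doacross_post(loc, gtid, &vec); // ordered depend(source)
}
__kmpc_doacross_fini(loc, gtid);
@endcode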
*/
void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
                          struct kmp_dim *dims) {
  int j, idx;
  kmp_int64 last, trace_count;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_uint32 *flags;
  kmp_disp_t *pr_buf = th->th.th_dispatch;
  dispatch_shared_info_t *sh_buf;

  KA_TRACE(
      20,
      ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
       gtid, num_dims, !team->t.t_serialized));
  KMP_DEBUG_ASSERT(dims != NULL);
  KMP_DEBUG_ASSERT(num_dims > 0);

  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
    return; // no dependencies if team is serialized
  }
  KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
  idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
  // the next loop
  sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];

  // Save bounds info into allocated private buffer
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
  pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
      th, sizeof(kmp_int64) * (4 * num_dims + 1));
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
  pr_buf->th_doacross_info[0] =
      (kmp_int64)num_dims; // first element is number of dimensions
  // Also save the address of num_done in order to access it later without
  // knowing the buffer index
  pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
  pr_buf->th_doacross_info[2] = dims[0].lo;
  pr_buf->th_doacross_info[3] = dims[0].up;
  pr_buf->th_doacross_info[4] = dims[0].st;
  last = 5;
  for (j = 1; j < num_dims; ++j) {
    kmp_int64
        range_length; // To keep ranges of all dimensions but the first dims[0]
    if (dims[j].st == 1) { // most common case
      // AC: should we care about ranges bigger than LLONG_MAX? (not for now)
      range_length = dims[j].up - dims[j].lo + 1;
    } else {
      if (dims[j].st > 0) {
        KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
        range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
      } else { // negative increment
        KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
        range_length =
            (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
      }
    }
    pr_buf->th_doacross_info[last++] = range_length;
    pr_buf->th_doacross_info[last++] = dims[j].lo;
    pr_buf->th_doacross_info[last++] = dims[j].up;
    pr_buf->th_doacross_info[last++] = dims[j].st;
  }

  // Compute total trip count.
  // Start with range of dims[0] which we don't need to keep in the buffer.
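  // Illustrative example (hypothetical values, not taken from the code below):
  // for a two-dimensional nest with dims[0] = {lo=2, up=8, st=2} and
  // dims[1] = {lo=0, up=9, st=1}, the range of dims[0] is (8-2)/2 + 1 = 4 and
  // the kept range of dims[1] is 9-0+1 = 10, so trace_count becomes
  // 4 * 10 = 40, i.e. one flag bit per iteration of the collapsed loop nest.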
  if (dims[0].st == 1) { // most common case
    trace_count = dims[0].up - dims[0].lo + 1;
  } else if (dims[0].st > 0) {
    KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
    trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
  } else { // negative increment
    KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
    trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
  }
  for (j = 1; j < num_dims; ++j) {
    trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
  }
  KMP_DEBUG_ASSERT(trace_count > 0);

  // Check if shared buffer is not occupied by other loop (idx -
  // __kmp_dispatch_num_buffers)
  if (idx != sh_buf->doacross_buf_idx) {
    // Shared buffer is occupied, wait for it to be free
    __kmp_wait_yield_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
                       __kmp_eq_4, NULL);
  }
#if KMP_32_BIT_ARCH
  // Check if we are the first thread. After the CAS the first thread gets 0,
  // others get 1 if initialization is in progress, allocated pointer otherwise.
  // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
      (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
#else
  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
      (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
#endif
  if (flags == NULL) {
    // we are the first thread, allocate the array of flags
    size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
    flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
    KMP_MB();
    sh_buf->doacross_flags = flags;
  } else if (flags == (kmp_uint32 *)1) {
#if KMP_32_BIT_ARCH
    // initialization is still in progress, need to wait
    while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
#else
    while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
#endif
      KMP_YIELD(TRUE);
    KMP_MB();
  } else {
    KMP_MB();
  }
  KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
  pr_buf->th_doacross_flags =
      sh_buf->doacross_flags; // save private copy in order to not
  // touch shared buffer on each iteration
  KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
}

void __kmpc_doacross_wait(ident_t *loc, int gtid, long long *vec) {
  kmp_int32 shft, num_dims, i;
  kmp_uint32 flag;
  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_disp_t *pr_buf;
  kmp_int64 lo, up, st;

  KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
    return; // no dependencies if team is serialized
  }

  // calculate sequential iteration number and check out-of-bounds condition
  pr_buf = th->th.th_dispatch;
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
  num_dims = pr_buf->th_doacross_info[0];
  lo = pr_buf->th_doacross_info[2];
  up = pr_buf->th_doacross_info[3];
  st = pr_buf->th_doacross_info[4];
  if (st == 1) { // most common case
    if (vec[0] < lo || vec[0] > up) {
      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                    "bounds [%lld,%lld]\n",
                    gtid, vec[0], lo, up));
      return;
    }
    iter_number = vec[0] - lo;
  } else if (st > 0) {
    if (vec[0] < lo || vec[0] > up) {
      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                    "bounds [%lld,%lld]\n",
                    gtid, vec[0], lo, up));
      return;
    }
    iter_number = (kmp_uint64)(vec[0] - lo) / st;
  } else { // negative increment
    if (vec[0] > lo || vec[0] < up) {
      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                    "bounds [%lld,%lld]\n",
                    gtid, vec[0], lo, up));
      return;
    }
    iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
  }
  for (i = 1; i < num_dims; ++i) {
    kmp_int64 iter, ln;
    kmp_int32 j = i * 4;
    ln = pr_buf->th_doacross_info[j + 1];
    lo = pr_buf->th_doacross_info[j + 2];
    up = pr_buf->th_doacross_info[j + 3];
    st = pr_buf->th_doacross_info[j + 4];
    if (st == 1) {
      if (vec[i] < lo || vec[i] > up) {
        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                      "bounds [%lld,%lld]\n",
                      gtid, vec[i], lo, up));
        return;
      }
      iter = vec[i] - lo;
    } else if (st > 0) {
      if (vec[i] < lo || vec[i] > up) {
        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                      "bounds [%lld,%lld]\n",
                      gtid, vec[i], lo, up));
        return;
      }
      iter = (kmp_uint64)(vec[i] - lo) / st;
    } else { // st < 0
      if (vec[i] > lo || vec[i] < up) {
        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                      "bounds [%lld,%lld]\n",
                      gtid, vec[i], lo, up));
        return;
      }
      iter = (kmp_uint64)(lo - vec[i]) / (-st);
    }
    iter_number = iter + ln * iter_number;
  }
  shft = iter_number % 32; // use 32-bit granularity
  iter_number >>= 5; // divided by 32
  flag = 1 << shft;
  while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
    KMP_YIELD(TRUE);
  }
  KMP_MB();
  KA_TRACE(20,
           ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
            gtid, (iter_number << 5) + shft));
}

void __kmpc_doacross_post(ident_t *loc, int gtid, long long *vec) {
  kmp_int32 shft, num_dims, i;
  kmp_uint32 flag;
  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_disp_t *pr_buf;
  kmp_int64 lo, st;

  KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
    return; // no dependencies if team is serialized
  }

  // calculate sequential iteration number (same as in "wait" but no
  // out-of-bounds checks)
  pr_buf = th->th.th_dispatch;
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
  num_dims = pr_buf->th_doacross_info[0];
  lo = pr_buf->th_doacross_info[2];
  st = pr_buf->th_doacross_info[4];
  if (st == 1) { // most common case
    iter_number = vec[0] - lo;
  } else if (st > 0) {
    iter_number = (kmp_uint64)(vec[0] - lo) / st;
  } else { // negative increment
    iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
  }
  for (i = 1; i < num_dims; ++i) {
    kmp_int64 iter, ln;
    kmp_int32 j = i * 4;
    ln = pr_buf->th_doacross_info[j + 1];
    lo = pr_buf->th_doacross_info[j + 2];
    st = pr_buf->th_doacross_info[j + 4];
    if (st == 1) {
      iter = vec[i] - lo;
    } else if (st > 0) {
      iter = (kmp_uint64)(vec[i] - lo) / st;
    } else { // st < 0
      iter = (kmp_uint64)(lo - vec[i]) / (-st);
    }
    iter_number = iter + ln * iter_number;
  }
  shft = iter_number % 32; // use 32-bit granularity
  iter_number >>= 5; // divided by 32
  flag = 1 << shft;
  KMP_MB();
  if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
    KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
  KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
                (iter_number << 5) + shft));
}

void __kmpc_doacross_fini(ident_t *loc, int gtid) {
  kmp_int32 num_done;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_disp_t *pr_buf = th->th.th_dispatch;

  KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
    return; // nothing to do
  }
  num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
  if (num_done == th->th.th_team_nproc) {
    // we are the last thread, need to free shared resources
    int idx = pr_buf->th_doacross_buf_idx - 1;
    dispatch_shared_info_t *sh_buf =
        &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
                     (kmp_int64)&sh_buf->doacross_num_done);
    KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
    KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
    __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
    sh_buf->doacross_flags = NULL;
    sh_buf->doacross_num_done = 0;
    sh_buf->doacross_buf_idx +=
        __kmp_dispatch_num_buffers; // free buffer for future re-use
  }
  // free private resources (need to keep buffer index forever)
  __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
  pr_buf->th_doacross_info = NULL;
  KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
}
#endif

// end of file //