/*
 * kmp_csupport.cpp -- kfront linkage support for OpenMP.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#define __KMP_IMP
#include "omp.h" /* extern "C" declarations of user-visible routines */
#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_lock.h"
#include "kmp_stats.h"

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

#define MAX_MESSAGE 512

// flags will be used in future, e.g. to implement openmp_strict library
// restrictions

/*!
 * @ingroup STARTUP_SHUTDOWN
 * @param loc   in   source location information
 * @param flags in   for future use (currently ignored)
 *
 * Initialize the runtime library. This call is optional; if it is not made then
 * it will be implicitly called by attempts to use other library functions.
 */
void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
  // By default __kmpc_begin() is a no-op.
  char *env;
  if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
      __kmp_str_match_true(env)) {
    __kmp_middle_initialize();
    KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
  } else if (__kmp_ignore_mppbeg() == FALSE) {
    // By default __kmp_ignore_mppbeg() returns TRUE.
    __kmp_internal_begin();
    KC_TRACE(10, ("__kmpc_begin: called\n"));
  }
}

/*!
 * @ingroup STARTUP_SHUTDOWN
 * @param loc source location information
 *
 * Shut down the runtime library. This is also optional, and even if called will
 * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
 * zero.
 */
void __kmpc_end(ident_t *loc) {
  // By default, __kmp_ignore_mppend() returns TRUE, which makes the
  // __kmpc_end() call a no-op. However, this can be overridden with the
  // KMP_IGNORE_MPPEND environment variable. If KMP_IGNORE_MPPEND is 0,
  // __kmp_ignore_mppend() returns FALSE and __kmpc_end() will unregister this
  // root (it can cause library shut down).
  if (__kmp_ignore_mppend() == FALSE) {
    KC_TRACE(10, ("__kmpc_end: called\n"));
    KA_TRACE(30, ("__kmpc_end\n"));

    __kmp_internal_end_thread(-1);
  }
#if KMP_OS_WINDOWS && OMPT_SUPPORT
  // Normal exit process on Windows does not allow worker threads of the final
  // parallel region to finish reporting their events, so shutting down the
  // library here fixes the issue at least for the cases where __kmpc_end() is
  // placed properly.
  if (ompt_enabled.enabled)
    __kmp_internal_end_library(__kmp_gtid_get_specific());
#endif
}

/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return The global thread index of the active thread.

This function can be called in any context.

If the runtime has only been entered at the outermost level from a
single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
that which would be returned by omp_get_thread_num() in the outermost
active parallel construct. (Or zero if there is no active parallel
construct, since the master thread is necessarily thread zero).

If multiple non-OpenMP threads all enter an OpenMP construct then this
will be a unique thread identifier among all the threads created by
the OpenMP runtime (but the value cannot be defined in terms of
OpenMP thread ids returned by omp_get_thread_num()).
*/
kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
  kmp_int32 gtid = __kmp_entry_gtid();

  KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));

  return gtid;
}

/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return The number of threads under control of the OpenMP<sup>*</sup> runtime

This function can be called in any context.
It returns the total number of threads under the control of the OpenMP runtime.
That is not a number that can be determined by any OpenMP standard calls, since
the library may be called from more than one non-OpenMP thread, and this
reflects the total over all such calls. Similarly, the runtime maintains
underlying threads even when they are not active (since the cost of creating
and destroying OS threads is high); this call counts all such threads, even if
they are not waiting for work.
*/
kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
  KC_TRACE(10,
           ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));

  return TCR_4(__kmp_all_nth);
}

/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return The thread number of the calling thread in the innermost active parallel
construct.
*/
kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
  KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
  return __kmp_tid_from_gtid(__kmp_entry_gtid());
}

/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return The number of threads in the innermost active parallel construct.
*/
kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
  KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));

  return __kmp_entry_thread()->th.th_team->t.t_nproc;
}

/*!
 * @ingroup DEPRECATED
 * @param loc location description
 *
 * This function need not be called. It always returns TRUE.
 */
kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
#ifndef KMP_DEBUG

  return TRUE;

#else

  const char *semi2;
  const char *semi3;
  int line_no;

  if (__kmp_par_range == 0) {
    return TRUE;
  }
  semi2 = loc->psource;
  if (semi2 == NULL) {
    return TRUE;
  }
  semi2 = strchr(semi2, ';');
  if (semi2 == NULL) {
    return TRUE;
  }
  semi2 = strchr(semi2 + 1, ';');
  if (semi2 == NULL) {
    return TRUE;
  }
  if (__kmp_par_range_filename[0]) {
    const char *name = semi2 - 1;
    while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
      name--;
    }
    if ((*name == '/') || (*name == ';')) {
      name++;
    }
    if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
      return __kmp_par_range < 0;
    }
  }
  semi3 = strchr(semi2 + 1, ';');
  if (__kmp_par_range_routine[0]) {
    if ((semi3 != NULL) && (semi3 > semi2) &&
        (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
      return __kmp_par_range < 0;
    }
  }
  if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
    if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
      return __kmp_par_range > 0;
    }
    return __kmp_par_range < 0;
  }
  return TRUE;

#endif /* KMP_DEBUG */
}

/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return 1 if this thread is executing inside an active parallel region, zero if
not.
*/
kmp_int32 __kmpc_in_parallel(ident_t *loc) {
  return __kmp_entry_thread()->th.th_root->r.r_active;
}

/*!
@ingroup PARALLEL
@param loc source location information
@param global_tid global thread number
@param num_threads number of threads requested for this parallel construct

Set the number of threads to be used by the next fork spawned by this thread.
This call is only required if the parallel construct has a `num_threads` clause.
*/
void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
                             kmp_int32 num_threads) {
  KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
                global_tid, num_threads));

  __kmp_push_num_threads(loc, global_tid, num_threads);
}

void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
  KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));

  /* the num_threads are automatically popped */
}
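/*
  Example (illustrative sketch only, not generated by this file): a compiler
  typically lowers a num_threads clause by pairing __kmpc_push_num_threads with
  the subsequent fork. For a construct such as
      #pragma omp parallel num_threads(4)
      { body(); }
  it may conceptually emit
      kmp_int32 gtid = __kmpc_global_thread_num(&loc);
      __kmpc_push_num_threads(&loc, gtid, 4);
      __kmpc_fork_call(&loc, 0, (kmpc_micro)outlined_body);
  where `outlined_body` is a hypothetical outlined routine. The pushed value
  applies only to the next fork performed by this thread.
*/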

#if OMP_40_ENABLED

void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
                           kmp_int32 proc_bind) {
  KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
                proc_bind));

  __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
}

#endif /* OMP_40_ENABLED */

/*!
@ingroup PARALLEL
@param loc source location information
@param argc total number of arguments in the ellipsis
@param microtask pointer to callback routine consisting of outlined parallel
construct
@param ... pointers to shared variables that aren't global

Do the actual fork and call the microtask in the relevant number of threads.
*/
void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
  int gtid = __kmp_entry_gtid();

#if (KMP_STATS_ENABLED)
  // If we were in a serial region, then stop the serial timer, record
  // the event, and start the parallel region timer
  stats_state_e previous_state = KMP_GET_THREAD_STATE();
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
  } else {
    KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
  }
  int inParallel = __kmpc_in_parallel(loc);
  if (inParallel) {
    KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
  } else {
    KMP_COUNT_BLOCK(OMP_PARALLEL);
  }
#endif

  // maybe to save thr_state is enough here
  {
    va_list ap;
    va_start(ap, microtask);

#if OMPT_SUPPORT
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      kmp_info_t *master_th = __kmp_threads[gtid];
      kmp_team_t *parent_team = master_th->th.th_team;
      ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
      if (lwt)
        ompt_frame = &(lwt->ompt_task_info.frame);
      else {
        int tid = __kmp_tid_from_gtid(gtid);
        ompt_frame = &(
            parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
      }
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      OMPT_STORE_RETURN_ADDRESS(gtid);
    }
#endif

#if INCLUDE_SSC_MARKS
    SSC_MARK_FORKING();
#endif
    __kmp_fork_call(loc, gtid, fork_context_intel, argc,
                    VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
                    VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
/* TODO: revert workaround for Intel(R) 64 tracker #96 */
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
                    &ap
#else
                    ap
#endif
                    );
#if INCLUDE_SSC_MARKS
    SSC_MARK_JOINING();
#endif
    __kmp_join_call(loc, gtid
#if OMPT_SUPPORT
                    ,
                    fork_context_intel
#endif
                    );

    va_end(ap);
  }

#if KMP_STATS_ENABLED
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
  } else {
    KMP_POP_PARTITIONED_TIMER();
  }
#endif // KMP_STATS_ENABLED
}
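/*
  Example (illustrative sketch only): a compiler typically outlines the body of
      #pragma omp parallel shared(a)
      { use(&a); }
  into a microtask with the kmpc_micro signature and forks it through
  __kmpc_fork_call, conceptually:

      void outlined(kmp_int32 *global_tid, kmp_int32 *bound_tid, int *a) {
        use(a);
      }
      ...
      __kmpc_fork_call(&loc, 1, (kmpc_micro)outlined, &a);

  The names `outlined`, `use` and `a` are hypothetical. The first two microtask
  parameters are the global and bound thread ids supplied by the runtime, and
  the trailing arguments are the addresses of the shared variables (argc counts
  them).
*/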

#if OMP_40_ENABLED
/*!
@ingroup PARALLEL
@param loc source location information
@param global_tid global thread number
@param num_teams number of teams requested for the teams construct
@param num_threads number of threads per team requested for the teams construct

Set the number of teams to be used by the teams construct.
This call is only required if the teams construct has a `num_teams` clause
or a `thread_limit` clause (or both).
*/
void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
                           kmp_int32 num_teams, kmp_int32 num_threads) {
  KA_TRACE(20,
           ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
            global_tid, num_teams, num_threads));

  __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
}

/*!
@ingroup PARALLEL
@param loc source location information
@param argc total number of arguments in the ellipsis
@param microtask pointer to callback routine consisting of outlined teams
construct
@param ... pointers to shared variables that aren't global

Do the actual fork and call the microtask in the relevant number of threads.
*/
void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
                       ...) {
  int gtid = __kmp_entry_gtid();
  kmp_info_t *this_thr = __kmp_threads[gtid];
  va_list ap;
  va_start(ap, microtask);

  KMP_COUNT_BLOCK(OMP_TEAMS);

  // remember teams entry point and nesting level
  this_thr->th.th_teams_microtask = microtask;
  this_thr->th.th_teams_level =
      this_thr->th.th_team->t.t_level; // AC: can be >0 on host

#if OMPT_SUPPORT
  kmp_team_t *parent_team = this_thr->th.th_team;
  int tid = __kmp_tid_from_gtid(gtid);
  if (ompt_enabled.enabled) {
    parent_team->t.t_implicit_task_taskdata[tid]
        .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
  }
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif

  // check if __kmpc_push_num_teams was called; set the default number of teams
  // otherwise
  if (this_thr->th.th_teams_size.nteams == 0) {
    __kmp_push_num_teams(loc, gtid, 0, 0);
  }
  KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);

  __kmp_fork_call(loc, gtid, fork_context_intel, argc,
                  VOLATILE_CAST(microtask_t)
                      __kmp_teams_master, // "wrapped" task
                  VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
                  &ap
#else
                  ap
#endif
                  );
  __kmp_join_call(loc, gtid
#if OMPT_SUPPORT
                  ,
                  fork_context_intel
#endif
                  );

  this_thr->th.th_teams_microtask = NULL;
  this_thr->th.th_teams_level = 0;
  *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
  va_end(ap);
}
#endif /* OMP_40_ENABLED */

// I don't think this function should ever have been exported.
// The __kmpc_ prefix was misapplied. I'm fairly certain that no generated
// openmp code ever called it, but it's been exported from the RTL for so
// long that I'm afraid to remove the definition.
int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }

/*!
@ingroup PARALLEL
@param loc source location information
@param global_tid global thread number

Enter a serialized parallel construct. This interface is used to handle a
conditional parallel region, like this,
@code
#pragma omp parallel if (condition)
@endcode
when the condition is false.
*/
void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  // The implementation is now in kmp_runtime.cpp so that it can share static
  // functions with kmp_fork_call since the tasks to be done are similar in
  // each case.
#if OMPT_SUPPORT
  OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif
  __kmp_serialized_parallel(loc, global_tid);
}
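/*
  Example (illustrative sketch only): for a conditional parallel region
      #pragma omp parallel if (cond)
      { body(); }
  a compiler conceptually emits either a real fork or the serialized path:

      kmp_int32 gtid = __kmpc_global_thread_num(&loc);
      if (cond) {
        __kmpc_fork_call(&loc, 0, (kmpc_micro)outlined_body);
      } else {
        __kmpc_serialized_parallel(&loc, gtid);
        kmp_int32 zero = 0;
        outlined_body(&gtid, &zero); // executed by the encountering thread
        __kmpc_end_serialized_parallel(&loc, gtid);
      }

  `outlined_body` is hypothetical; actual compiler-generated code may differ.
*/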

/*!
@ingroup PARALLEL
@param loc source location information
@param global_tid global thread number

Leave a serialized parallel construct.
*/
void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  kmp_internal_control_t *top;
  kmp_info_t *this_thr;
  kmp_team_t *serial_team;

  KC_TRACE(10,
           ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));

  /* skip all this code for autopar serialized loops since it results in
     unacceptable overhead */
  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
    return;

  // Not autopar code
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

#if OMP_50_ENABLED
  __kmp_resume_if_soft_paused();
#endif

  this_thr = __kmp_threads[global_tid];
  serial_team = this_thr->th.th_serial_team;

#if OMP_45_ENABLED
  kmp_task_team_t *task_team = this_thr->th.th_task_team;

  // we need to wait for the proxy tasks before finishing the thread
  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
    __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
#endif

  KMP_MB();
  KMP_DEBUG_ASSERT(serial_team);
  KMP_ASSERT(serial_team->t.t_serialized);
  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
  KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);

#if OMPT_SUPPORT
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
          OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
    }

    // clear the task id only after unlinking the task
    ompt_data_t *parent_task_data;
    __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);

    if (ompt_enabled.ompt_callback_parallel_end) {
      ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
          &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
          ompt_parallel_invoker_program, OMPT_LOAD_RETURN_ADDRESS(global_tid));
    }
    __ompt_lw_taskteam_unlink(this_thr);
    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
  }
#endif

  /* If necessary, pop the internal control stack values and replace the team
   * values */
  top = serial_team->t.t_control_stack_top;
  if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
    copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
    serial_team->t.t_control_stack_top = top->next;
    __kmp_free(top);
  }

  // if( serial_team -> t.t_serialized > 1 )
  serial_team->t.t_level--;

  /* pop dispatch buffers stack */
  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
  {
    dispatch_private_info_t *disp_buffer =
        serial_team->t.t_dispatch->th_disp_buffer;
    serial_team->t.t_dispatch->th_disp_buffer =
        serial_team->t.t_dispatch->th_disp_buffer->next;
    __kmp_free(disp_buffer);
  }
#if OMP_50_ENABLED
  this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
#endif

  --serial_team->t.t_serialized;
  if (serial_team->t.t_serialized == 0) {

    /* return to the parallel section */

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
    if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
      __kmp_clear_x87_fpu_status_word();
      __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
      __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
    }
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

    this_thr->th.th_team = serial_team->t.t_parent;
    this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;

    /* restore values cached in the thread */
    this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /* JPH */
    this_thr->th.th_team_master =
        serial_team->t.t_parent->t.t_threads[0]; /* JPH */
    this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;

    /* TODO the below shouldn't need to be adjusted for serialized teams */
    this_thr->th.th_dispatch =
        &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];

    __kmp_pop_current_task_from_thread(this_thr);

    KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
    this_thr->th.th_current_task->td_flags.executing = 1;

    if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Copy the task team from the new child / old parent team to the thread.
      this_thr->th.th_task_team =
          this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
      KA_TRACE(20,
               ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
                "team %p\n",
                global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
    }
  } else {
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
                    "depth of serial team %p to %d\n",
                    global_tid, serial_team, serial_team->t.t_serialized));
    }
  }

  if (__kmp_env_consistency_check)
    __kmp_pop_parallel(global_tid, NULL);
#if OMPT_SUPPORT
  if (ompt_enabled.enabled)
    this_thr->th.ompt_thread_info.state =
        ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
                                           : ompt_state_work_parallel);
#endif
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information.

Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
depending on the memory ordering convention obeyed by the compiler
even that may not be necessary).
*/
void __kmpc_flush(ident_t *loc) {
  KC_TRACE(10, ("__kmpc_flush: called\n"));

  /* need an explicit __mf() here since the library uses volatile instead */
  KMP_MB(); /* Flush all pending memory write invalidates. */

#if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
#if KMP_MIC
// fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
// We shouldn't need it, though, since the ABI rules require that
// * If the compiler generates NGO stores it also generates the fence
// * If users hand-code NGO stores they should insert the fence
// therefore no incomplete unordered stores should be visible.
#else
  // C74404
  // This is to address non-temporal store instructions (sfence needed).
  // The clflush instruction is also addressed (mfence needed).
  // Probably the non-temporal load movntdqa instruction should also be
  // addressed.
  // mfence is an SSE2 instruction. Do not execute it if the CPU is not SSE2.
  if (!__kmp_cpuinfo.initialized) {
    __kmp_query_cpuid(&__kmp_cpuinfo);
  }
  if (!__kmp_cpuinfo.sse2) {
    // CPU cannot execute SSE2 instructions.
  } else {
#if KMP_COMPILER_ICC
    _mm_mfence();
#elif KMP_COMPILER_MSVC
    MemoryBarrier();
#else
    __sync_synchronize();
#endif // KMP_COMPILER_ICC
  }
#endif // KMP_MIC
#elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64)
// Nothing to see here; move along.
#elif KMP_ARCH_PPC64
// Nothing needed here (we have a real MB above).
#if KMP_OS_CNK
  // The flushing thread needs to yield here; this prevents a
  // busy-waiting thread from saturating the pipeline. flush is
  // often used in loops like this:
  //   while (!flag) {
  //     #pragma omp flush(flag)
  //   }
  // and adding the yield here is good for at least a 10x speedup
  // when running >2 threads per core (on the NAS LU benchmark).
  __kmp_yield(TRUE);
#endif
#else
#error Unknown or unsupported architecture
#endif

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_flush) {
    ompt_callbacks.ompt_callback(ompt_callback_flush)(
        __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
}

/* -------------------------------------------------------------------------- */
/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.

Execute a barrier.
*/
void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
  KMP_COUNT_BLOCK(OMP_BARRIER);
  KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

#if OMP_50_ENABLED
  __kmp_resume_if_soft_paused();
#endif

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
    }

    __kmp_check_barrier(global_tid, ct_barrier, loc);
  }

#if OMPT_SUPPORT
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame.ptr == NULL)
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    OMPT_STORE_RETURN_ADDRESS(global_tid);
  }
#endif
  __kmp_threads[global_tid]->th.th_ident = loc;
  // TODO: explicit barrier_wait_id:
  //   this function is called when the 'barrier' directive is present or at
  //   the implicit barrier at the end of a worksharing construct.
  // 1) better to add a per-thread barrier counter to a thread data structure
  // 2) set to 0 when a new team is created
  // 3) no sync is required

  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = ompt_data_none;
  }
#endif
}

/* The BARRIER for a MASTER section is always explicit */
/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number.
@return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
*/
kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
  int status = 0;

  KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

#if OMP_50_ENABLED
  __kmp_resume_if_soft_paused();
#endif

  if (KMP_MASTER_GTID(global_tid)) {
    KMP_COUNT_BLOCK(OMP_MASTER);
    KMP_PUSH_PARTITIONED_TIMER(OMP_master);
    status = 1;
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (status) {
    if (ompt_enabled.ompt_callback_master) {
      kmp_info_t *this_thr = __kmp_threads[global_tid];
      kmp_team_t *team = this_thr->th.th_team;

      int tid = __kmp_tid_from_gtid(global_tid);
      ompt_callbacks.ompt_callback(ompt_callback_master)(
          ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
          &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
          OMPT_GET_RETURN_ADDRESS(0));
    }
  }
#endif

  if (__kmp_env_consistency_check) {
#if KMP_USE_DYNAMIC_LOCK
    if (status)
      __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
    else
      __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
#else
    if (status)
      __kmp_push_sync(global_tid, ct_master, loc, NULL);
    else
      __kmp_check_sync(global_tid, ct_master, loc, NULL);
#endif
  }

  return status;
}

/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number.

Mark the end of a <tt>master</tt> region. This should only be called by the
thread that executes the <tt>master</tt> region.
*/
void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
  KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));

  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
  KMP_POP_PARTITIONED_TIMER();

#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_info_t *this_thr = __kmp_threads[global_tid];
  kmp_team_t *team = this_thr->th.th_team;
  if (ompt_enabled.ompt_callback_master) {
    int tid = __kmp_tid_from_gtid(global_tid);
    ompt_callbacks.ompt_callback(ompt_callback_master)(
        ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
        &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
        OMPT_GET_RETURN_ADDRESS(0));
  }
#endif

  if (__kmp_env_consistency_check) {
    if (global_tid < 0)
      KMP_WARNING(ThreadIdentInvalid);

    if (KMP_MASTER_GTID(global_tid))
      __kmp_pop_sync(global_tid, ct_master, loc);
  }
}
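/*
  Example (illustrative sketch only): a compiler typically lowers
      #pragma omp master
      { body(); }
  to a guarded call pair, conceptually:

      if (__kmpc_master(&loc, gtid)) {
        body();
        __kmpc_end_master(&loc, gtid);
      }

  There is no implied barrier; the other threads simply skip the block.
*/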

/*!
@ingroup WORK_SHARING
@param loc source location information.
@param gtid global thread number.

Start execution of an <tt>ordered</tt> construct.
*/
void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
  int cid = 0;
  kmp_info_t *th;
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

#if OMP_50_ENABLED
  __kmp_resume_if_soft_paused();
#endif

#if USE_ITT_BUILD
  __kmp_itt_ordered_prep(gtid);
// TODO: ordered_wait_id
#endif /* USE_ITT_BUILD */

  th = __kmp_threads[gtid];

#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_team_t *team;
  ompt_wait_id_t lck;
  void *codeptr_ra;
  if (ompt_enabled.enabled) {
    OMPT_STORE_RETURN_ADDRESS(gtid);
    team = __kmp_team_from_gtid(gtid);
    lck = (ompt_wait_id_t)&team->t.t_ordered.dt.t_value;
    /* OMPT state update */
    th->th.ompt_thread_info.wait_id = lck;
    th->th.ompt_thread_info.state = ompt_state_wait_ordered;

    /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
    if (ompt_enabled.ompt_callback_mutex_acquire) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
          ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin,
          (ompt_wait_id_t)lck, codeptr_ra);
    }
  }
#endif

  if (th->th.th_dispatch->th_deo_fcn != 0)
    (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
  else
    __kmp_parallel_deo(&gtid, &cid, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    /* OMPT state update */
    th->th.ompt_thread_info.state = ompt_state_work_parallel;
    th->th.ompt_thread_info.wait_id = 0;

    /* OMPT event callback */
    if (ompt_enabled.ompt_callback_mutex_acquired) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
          ompt_mutex_ordered, (ompt_wait_id_t)lck, codeptr_ra);
    }
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_ordered_start(gtid);
#endif /* USE_ITT_BUILD */
}

/*!
@ingroup WORK_SHARING
@param loc source location information.
@param gtid global thread number.

End execution of an <tt>ordered</tt> construct.
*/
void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
  int cid = 0;
  kmp_info_t *th;

  KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));

#if USE_ITT_BUILD
  __kmp_itt_ordered_end(gtid);
// TODO: ordered_wait_id
#endif /* USE_ITT_BUILD */

  th = __kmp_threads[gtid];

  if (th->th.th_dispatch->th_dxo_fcn != 0)
    (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
  else
    __kmp_parallel_dxo(&gtid, &cid, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
  if (ompt_enabled.ompt_callback_mutex_released) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
        ompt_mutex_ordered,
        (ompt_wait_id_t)&__kmp_team_from_gtid(gtid)->t.t_ordered.dt.t_value,
        OMPT_LOAD_RETURN_ADDRESS(gtid));
  }
#endif
}
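/*
  Example (illustrative sketch only): inside a loop compiled from
      #pragma omp for ordered schedule(static)
      for (i = lb; i <= ub; i++) {
        work(i);
        #pragma omp ordered
        { emit(i); }
      }
  the ordered block conceptually becomes

      __kmpc_ordered(&loc, gtid);
      emit(i);
      __kmpc_end_ordered(&loc, gtid);

  `work` and `emit` are hypothetical; the runtime releases the guarded region
  to the threads in iteration order.
*/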

#if KMP_USE_DYNAMIC_LOCK

static __forceinline void
__kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
                          kmp_int32 gtid, kmp_indirect_locktag_t tag) {
  // Pointer to the allocated indirect lock is written to crit, while indexing
  // is ignored.
  void *idx;
  kmp_indirect_lock_t **lck;
  lck = (kmp_indirect_lock_t **)crit;
  kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
  KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
  KMP_SET_I_LOCK_LOCATION(ilk, loc);
  KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
  KA_TRACE(20,
           ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
#if USE_ITT_BUILD
  __kmp_itt_critical_creating(ilk->lock, loc);
#endif
  int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
  if (status == 0) {
#if USE_ITT_BUILD
    __kmp_itt_critical_destroyed(ilk->lock);
#endif
    // We don't really need to destroy the unclaimed lock here since it will be
    // cleaned up at program exit.
    // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
  }
  KMP_DEBUG_ASSERT(*lck != NULL);
}

// Fast-path acquire tas lock
#define KMP_ACQUIRE_TAS_LOCK(lock, gtid) \
  { \
    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \
    kmp_int32 tas_free = KMP_LOCK_FREE(tas); \
    kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas); \
    if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free || \
        !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) { \
      kmp_uint32 spins; \
      KMP_FSYNC_PREPARE(l); \
      KMP_INIT_YIELD(spins); \
      if (TCR_4(__kmp_nth) > \
          (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \
        KMP_YIELD(TRUE); \
      } else { \
        KMP_YIELD_SPIN(spins); \
      } \
      kmp_backoff_t backoff = __kmp_spin_backoff_params; \
      while ( \
          KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free || \
          !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) { \
        __kmp_spin_backoff(&backoff); \
        if (TCR_4(__kmp_nth) > \
            (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \
          KMP_YIELD(TRUE); \
        } else { \
          KMP_YIELD_SPIN(spins); \
        } \
      } \
    } \
    KMP_FSYNC_ACQUIRED(l); \
  }

// Fast-path test tas lock
#define KMP_TEST_TAS_LOCK(lock, gtid, rc) \
  { \
    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \
    kmp_int32 tas_free = KMP_LOCK_FREE(tas); \
    kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas); \
    rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free && \
         __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy); \
  }

// Fast-path release tas lock
#define KMP_RELEASE_TAS_LOCK(lock, gtid) \
  { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }

#if KMP_USE_FUTEX

#include <sys/syscall.h>
#include <unistd.h>
#ifndef FUTEX_WAIT
#define FUTEX_WAIT 0
#endif
#ifndef FUTEX_WAKE
#define FUTEX_WAKE 1
#endif

// Fast-path acquire futex lock
#define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid) \
  { \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
    kmp_int32 gtid_code = (gtid + 1) << 1; \
    KMP_MB(); \
    KMP_FSYNC_PREPARE(ftx); \
    kmp_int32 poll_val; \
    while ((poll_val = KMP_COMPARE_AND_STORE_RET32( \
                &(ftx->lk.poll), KMP_LOCK_FREE(futex), \
                KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) { \
      kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1; \
      if (!cond) { \
        if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val, \
                                         poll_val | KMP_LOCK_BUSY(1, futex))) { \
          continue; \
        } \
        poll_val |= KMP_LOCK_BUSY(1, futex); \
      } \
      kmp_int32 rc; \
      if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val, \
                        NULL, NULL, 0)) != 0) { \
        continue; \
      } \
      gtid_code |= 1; \
    } \
    KMP_FSYNC_ACQUIRED(ftx); \
  }

// Fast-path test futex lock
#define KMP_TEST_FUTEX_LOCK(lock, gtid, rc) \
  { \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
    if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex), \
                                    KMP_LOCK_BUSY((gtid + 1) << 1, futex))) { \
      KMP_FSYNC_ACQUIRED(ftx); \
      rc = TRUE; \
    } else { \
      rc = FALSE; \
    } \
  }

// Fast-path release futex lock
#define KMP_RELEASE_FUTEX_LOCK(lock, gtid) \
  { \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
    KMP_MB(); \
    KMP_FSYNC_RELEASING(ftx); \
    kmp_int32 poll_val = \
        KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex)); \
    if (KMP_LOCK_STRIP(poll_val) & 1) { \
      syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE, \
              KMP_LOCK_BUSY(1, futex), NULL, NULL, 0); \
    } \
    KMP_MB(); \
    KMP_YIELD(TCR_4(__kmp_nth) > \
              (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)); \
  }

#endif // KMP_USE_FUTEX

#else // KMP_USE_DYNAMIC_LOCK

static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
                                                      ident_t const *loc,
                                                      kmp_int32 gtid) {
  kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;

  // Because of the double-check, the following load doesn't need to be volatile
  kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);

  if (lck == NULL) {
    void *idx;

    // Allocate & initialize the lock.
    // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
    lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
    __kmp_init_user_lock_with_checks(lck);
    __kmp_set_user_lock_location(lck, loc);
#if USE_ITT_BUILD
    __kmp_itt_critical_creating(lck);
// __kmp_itt_critical_creating() should be called *before* the first usage of
// the underlying lock. It is the only place where we can guarantee it. There
// is a chance the lock will be destroyed with no usage, but that is not a
// problem, because this is not a real event seen by the user but rather the
// setting of a name for an object (the lock). See more details in kmp_itt.h.
#endif /* USE_ITT_BUILD */

    // Use a cmpxchg instruction to slam the start of the critical section with
    // the lock pointer. If another thread beat us to it, deallocate the lock,
    // and use the lock that the other thread allocated.
    int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);

    if (status == 0) {
      // Deallocate the lock and reload the value.
#if USE_ITT_BUILD
      __kmp_itt_critical_destroyed(lck);
// Let ITT know the lock is destroyed and the same memory location may be
// reused for another purpose.
#endif /* USE_ITT_BUILD */
      __kmp_destroy_user_lock_with_checks(lck);
      __kmp_user_lock_free(&idx, gtid, lck);
      lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
      KMP_DEBUG_ASSERT(lck != NULL);
    }
  }
  return lck;
}

#endif // KMP_USE_DYNAMIC_LOCK

/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number.
@param crit identity of the critical section. This could be a pointer to a lock
associated with the critical section, or some other suitably unique value.

Enter code protected by a `critical` construct.
This function blocks until the executing thread can enter the critical section.
*/
void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
                     kmp_critical_name *crit) {
#if KMP_USE_DYNAMIC_LOCK
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif // OMPT_SUPPORT
  __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
#else
  KMP_COUNT_BLOCK(OMP_CRITICAL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  ompt_state_t prev_state = ompt_state_undefined;
  ompt_thread_info_t ti;
#endif
  kmp_user_lock_p lck;

  KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));

  // TODO: add THR_OVHD_STATE

  KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
  KMP_CHECK_USER_LOCK_INIT();

  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#endif
  else { // ticket, queuing or drdpa
    lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
  }

  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_critical, loc, lck);

  // since the critical directive binds to all threads, not just the current
  // team, we have to check this even if we are in a serialized team.
  // also, even if we are the uber thread, we still have to conduct the lock,
  // as we have to contend with sibling threads.

#if USE_ITT_BUILD
  __kmp_itt_critical_acquiring(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(global_tid);
  void *codeptr_ra = NULL;
  if (ompt_enabled.enabled) {
    ti = __kmp_threads[global_tid]->th.ompt_thread_info;
    /* OMPT state update */
    prev_state = ti.state;
    ti.wait_id = (ompt_wait_id_t)lck;
    ti.state = ompt_state_wait_critical;

    /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
    if (ompt_enabled.ompt_callback_mutex_acquire) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
          ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
          (ompt_wait_id_t)crit, codeptr_ra);
    }
  }
#endif
  // Value of 'crit' should be good for using as a critical_id of the critical
  // section directive.
  __kmp_acquire_user_lock_with_checks(lck, global_tid);

#if USE_ITT_BUILD
  __kmp_itt_critical_acquired(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    /* OMPT state update */
    ti.state = prev_state;
    ti.wait_id = 0;

    /* OMPT event callback */
    if (ompt_enabled.ompt_callback_mutex_acquired) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
          ompt_mutex_critical, (ompt_wait_id_t)crit, codeptr_ra);
    }
  }
#endif
  KMP_POP_PARTITIONED_TIMER();

  KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
  KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
#endif // KMP_USE_DYNAMIC_LOCK
}

#if KMP_USE_DYNAMIC_LOCK

// Converts the given hint to an internal lock implementation
static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
#if KMP_USE_TSX
#define KMP_TSX_LOCK(seq) lockseq_##seq
#else
#define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
#endif

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
#define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
#else
#define KMP_CPUINFO_RTM 0
#endif

  // Hints that do not require further logic
  if (hint & kmp_lock_hint_hle)
    return KMP_TSX_LOCK(hle);
  if (hint & kmp_lock_hint_rtm)
    return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
  if (hint & kmp_lock_hint_adaptive)
    return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;

  // Rule out conflicting hints first by returning the default lock
  if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
    return __kmp_user_lock_seq;
  if ((hint & omp_lock_hint_speculative) &&
      (hint & omp_lock_hint_nonspeculative))
    return __kmp_user_lock_seq;

  // Do not even consider speculation when it appears to be contended
  if (hint & omp_lock_hint_contended)
    return lockseq_queuing;

  // Uncontended lock without speculation
  if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
    return lockseq_tas;

  // HLE lock for speculation
  if (hint & omp_lock_hint_speculative)
    return KMP_TSX_LOCK(hle);

  return __kmp_user_lock_seq;
}

#if OMPT_SUPPORT && OMPT_OPTIONAL
#if KMP_USE_DYNAMIC_LOCK
static kmp_mutex_impl_t
__ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
  if (user_lock) {
    switch (KMP_EXTRACT_D_TAG(user_lock)) {
    case 0:
      break;
#if KMP_USE_FUTEX
    case locktag_futex:
      return kmp_mutex_impl_queuing;
#endif
    case locktag_tas:
      return kmp_mutex_impl_spin;
#if KMP_USE_TSX
    case locktag_hle:
      return kmp_mutex_impl_speculative;
#endif
    default:
      return kmp_mutex_impl_none;
    }
    ilock = KMP_LOOKUP_I_LOCK(user_lock);
  }
  KMP_ASSERT(ilock);
  switch (ilock->type) {
#if KMP_USE_TSX
  case locktag_adaptive:
  case locktag_rtm:
    return kmp_mutex_impl_speculative;
#endif
  case locktag_nested_tas:
    return kmp_mutex_impl_spin;
#if KMP_USE_FUTEX
  case locktag_nested_futex:
#endif
  case locktag_ticket:
  case locktag_queuing:
  case locktag_drdpa:
  case locktag_nested_ticket:
  case locktag_nested_queuing:
  case locktag_nested_drdpa:
    return kmp_mutex_impl_queuing;
  default:
    return kmp_mutex_impl_none;
  }
}
#else
// For locks without dynamic binding
static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
  switch (__kmp_user_lock_kind) {
  case lk_tas:
    return kmp_mutex_impl_spin;
#if KMP_USE_FUTEX
  case lk_futex:
#endif
  case lk_ticket:
  case lk_queuing:
  case lk_drdpa:
    return kmp_mutex_impl_queuing;
#if KMP_USE_TSX
  case lk_hle:
  case lk_rtm:
  case lk_adaptive:
    return kmp_mutex_impl_speculative;
#endif
  default:
    return kmp_mutex_impl_none;
  }
}
#endif // KMP_USE_DYNAMIC_LOCK
#endif // OMPT_SUPPORT && OMPT_OPTIONAL

/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number.
@param crit identity of the critical section. This could be a pointer to a lock
associated with the critical section, or some other suitably unique value.
@param hint the lock hint.

Enter code protected by a `critical` construct with a hint. The hint value is
used to suggest a lock implementation. This function blocks until the executing
thread can enter the critical section unless the hint suggests use of
speculative execution and the hardware supports it.
*/
void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
                               kmp_critical_name *crit, uint32_t hint) {
  KMP_COUNT_BLOCK(OMP_CRITICAL);
  kmp_user_lock_p lck;
#if OMPT_SUPPORT && OMPT_OPTIONAL
  ompt_state_t prev_state = ompt_state_undefined;
  ompt_thread_info_t ti;
  // This is the case if called from __kmpc_critical:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
#endif

  KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));

  kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
  // Check if it is initialized.
  KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
  if (*lk == 0) {
    kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
    if (KMP_IS_D_LOCK(lckseq)) {
      KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
                                  KMP_GET_D_TAG(lckseq));
    } else {
      __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
    }
  }
  // Branch for accessing the actual lock object and set operation. This
  // branching is inevitable since this lock initialization does not follow the
  // normal dispatch path (lock table is not used).
  if (KMP_EXTRACT_D_TAG(lk) != 0) {
    lck = (kmp_user_lock_p)lk;
    if (__kmp_env_consistency_check) {
      __kmp_push_sync(global_tid, ct_critical, loc, lck,
                      __kmp_map_hint_to_lock(hint));
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_acquiring(lck);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ti = __kmp_threads[global_tid]->th.ompt_thread_info;
      /* OMPT state update */
      prev_state = ti.state;
      ti.wait_id = (ompt_wait_id_t)lck;
      ti.state = ompt_state_wait_critical;

      /* OMPT event callback */
      if (ompt_enabled.ompt_callback_mutex_acquire) {
        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
            ompt_mutex_critical, (unsigned int)hint,
            __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)crit, codeptr);
      }
    }
#endif
#if KMP_USE_INLINED_TAS
    if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
      KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
    } else
#elif KMP_USE_INLINED_FUTEX
    if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
      KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
    } else
#endif
    {
      KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
    }
  } else {
    kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
    lck = ilk->lock;
    if (__kmp_env_consistency_check) {
      __kmp_push_sync(global_tid, ct_critical, loc, lck,
                      __kmp_map_hint_to_lock(hint));
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_acquiring(lck);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ti = __kmp_threads[global_tid]->th.ompt_thread_info;
      /* OMPT state update */
      prev_state = ti.state;
      ti.wait_id = (ompt_wait_id_t)lck;
      ti.state = ompt_state_wait_critical;

      /* OMPT event callback */
      if (ompt_enabled.ompt_callback_mutex_acquire) {
        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
            ompt_mutex_critical, (unsigned int)hint,
            __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)crit, codeptr);
      }
    }
#endif
    KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
  }
  KMP_POP_PARTITIONED_TIMER();

#if USE_ITT_BUILD
  __kmp_itt_critical_acquired(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    /* OMPT state update */
    ti.state = prev_state;
    ti.wait_id = 0;

    /* OMPT event callback */
    if (ompt_enabled.ompt_callback_mutex_acquired) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
          ompt_mutex_critical, (ompt_wait_id_t)crit, codeptr);
    }
  }
#endif

  KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
  KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
} // __kmpc_critical_with_hint

#endif // KMP_USE_DYNAMIC_LOCK
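/*
  Example (illustrative sketch only): with the OpenMP 4.5 hint clause,
      #pragma omp critical (name) hint(omp_lock_hint_speculative)
      { body(); }
  a compiler conceptually emits

      static kmp_critical_name name_lock; // hypothetical, zero-initialized
      __kmpc_critical_with_hint(&loc, gtid, &name_lock,
                                omp_lock_hint_speculative);
      body();
      __kmpc_end_critical(&loc, gtid, &name_lock);

  __kmp_map_hint_to_lock() above chooses the lock implementation: contended
  hints map to a queuing lock, uncontended non-speculative hints to
  test-and-set, and speculative hints to HLE/RTM when TSX is available.
*/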

/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number.
@param crit identity of the critical section. This could be a pointer to a lock
associated with the critical section, or some other suitably unique value.

Leave a critical section, releasing any lock that was held during its execution.
*/
void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
                         kmp_critical_name *crit) {
  kmp_user_lock_p lck;

  KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));

#if KMP_USE_DYNAMIC_LOCK
  if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
    lck = (kmp_user_lock_p)crit;
    KMP_ASSERT(lck != NULL);
    if (__kmp_env_consistency_check) {
      __kmp_pop_sync(global_tid, ct_critical, loc);
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_releasing(lck);
#endif
#if KMP_USE_INLINED_TAS
    if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
      KMP_RELEASE_TAS_LOCK(lck, global_tid);
    } else
#elif KMP_USE_INLINED_FUTEX
    if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
      KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
    } else
#endif
    {
      KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
    }
  } else {
    kmp_indirect_lock_t *ilk =
        (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
    KMP_ASSERT(ilk != NULL);
    lck = ilk->lock;
    if (__kmp_env_consistency_check) {
      __kmp_pop_sync(global_tid, ct_critical, loc);
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_releasing(lck);
#endif
    KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
  }

#else // KMP_USE_DYNAMIC_LOCK

  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#endif
  else { // ticket, queuing or drdpa
    lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
  }

  KMP_ASSERT(lck != NULL);

  if (__kmp_env_consistency_check)
    __kmp_pop_sync(global_tid, ct_critical, loc);

#if USE_ITT_BUILD
  __kmp_itt_critical_releasing(lck);
#endif /* USE_ITT_BUILD */
  // Value of 'crit' should be good for using as a critical_id of the critical
  // section directive.
  __kmp_release_user_lock_with_checks(lck, global_tid);

#endif // KMP_USE_DYNAMIC_LOCK

#if OMPT_SUPPORT && OMPT_OPTIONAL
  /* OMPT release event triggers after lock is released; place here to trigger
   * for all #if branches */
  OMPT_STORE_RETURN_ADDRESS(global_tid);
  if (ompt_enabled.ompt_callback_mutex_released) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
        ompt_mutex_critical, (ompt_wait_id_t)crit, OMPT_LOAD_RETURN_ADDRESS(0));
  }
#endif

  KMP_POP_PARTITIONED_TIMER();
  KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
}
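/*
  Example (illustrative sketch only): a named critical construct without a hint,
      #pragma omp critical (io)
      { body(); }
  is typically lowered around a compiler-generated, zero-initialized
  kmp_critical_name object shared by every occurrence of that name:

      static kmp_critical_name crit_io; // hypothetical name
      __kmpc_critical(&loc, gtid, &crit_io);
      body();
      __kmpc_end_critical(&loc, gtid, &crit_io);

  The runtime lazily installs the actual lock into crit_io on first use, as
  shown in __kmpc_critical above.
*/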

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.
@return one if the thread should execute the master block, zero otherwise

Start execution of a combined barrier and master. The barrier is executed inside
this function.
*/
kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
  int status;

  KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

#if OMP_50_ENABLED
  __kmp_resume_if_soft_paused();
#endif

  if (__kmp_env_consistency_check)
    __kmp_check_barrier(global_tid, ct_barrier, loc);

#if OMPT_SUPPORT
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame.ptr == NULL)
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    OMPT_STORE_RETURN_ADDRESS(global_tid);
  }
#endif
#if USE_ITT_NOTIFY
  __kmp_threads[global_tid]->th.th_ident = loc;
#endif
  status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = ompt_data_none;
  }
#endif

  return (status != 0) ? 0 : 1;
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.

Complete the execution of a combined barrier and master. This function should
only be called at the completion of the <tt>master</tt> code. Other threads will
still be waiting at the barrier and this call releases them.
*/
void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
  KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));

  __kmp_end_split_barrier(bs_plain_barrier, global_tid);
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.
@return one if the thread should execute the master block, zero otherwise

Start execution of a combined barrier and master(nowait) construct.
The barrier is executed inside this function.
There is no equivalent "end" function, since the other threads are not held at
the end of the master code (the nowait case).
*/
kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
  kmp_int32 ret;

  KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

#if OMP_50_ENABLED
  __kmp_resume_if_soft_paused();
#endif

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
    }
    __kmp_check_barrier(global_tid, ct_barrier, loc);
  }

#if OMPT_SUPPORT
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame.ptr == NULL)
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    OMPT_STORE_RETURN_ADDRESS(global_tid);
  }
#endif
#if USE_ITT_NOTIFY
  __kmp_threads[global_tid]->th.th_ident = loc;
#endif
  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = ompt_data_none;
  }
#endif

  ret = __kmpc_master(loc, global_tid);

  if (__kmp_env_consistency_check) {
    /* there's no __kmpc_end_master called; so the (stats) */
    /* actions of __kmpc_end_master are done here */

    if (global_tid < 0) {
      KMP_WARNING(ThreadIdentInvalid);
    }
    if (ret) {
      /* only one thread should do the pop since only */
      /* one did the push (see __kmpc_master()) */

      __kmp_pop_sync(global_tid, ct_master, loc);
    }
  }

  return (ret);
}

/* The BARRIER for a SINGLE process section is always explicit */
/*!
@ingroup WORK_SHARING
@param loc source location information
@param global_tid global thread number
@return One if this thread should execute the single construct, zero otherwise.

Test whether to execute a <tt>single</tt> construct.
There are no implicit barriers in the two "single" calls; rather, the compiler
should introduce an explicit barrier if one is required.
*/

kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
  kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);

  if (rc) {
    // We are going to execute the single statement, so we should count it.
    KMP_COUNT_BLOCK(OMP_SINGLE);
    KMP_PUSH_PARTITIONED_TIMER(OMP_single);
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_info_t *this_thr = __kmp_threads[global_tid];
  kmp_team_t *team = this_thr->th.th_team;
  int tid = __kmp_tid_from_gtid(global_tid);

  if (ompt_enabled.enabled) {
    if (rc) {
      if (ompt_enabled.ompt_callback_work) {
        ompt_callbacks.ompt_callback(ompt_callback_work)(
            ompt_work_single_executor, ompt_scope_begin,
            &(team->t.ompt_team_info.parallel_data),
            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
            1, OMPT_GET_RETURN_ADDRESS(0));
      }
    } else {
      if (ompt_enabled.ompt_callback_work) {
        ompt_callbacks.ompt_callback(ompt_callback_work)(
            ompt_work_single_other, ompt_scope_begin,
            &(team->t.ompt_team_info.parallel_data),
            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
            1, OMPT_GET_RETURN_ADDRESS(0));
        ompt_callbacks.ompt_callback(ompt_callback_work)(
            ompt_work_single_other, ompt_scope_end,
            &(team->t.ompt_team_info.parallel_data),
            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
            1, OMPT_GET_RETURN_ADDRESS(0));
      }
    }
  }
#endif

  return rc;
}
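/*
  Example (illustrative sketch only): a single construct with an implicit
  barrier,
      #pragma omp single
      { body(); }
  is conceptually lowered to

      if (__kmpc_single(&loc, gtid)) {
        body();
        __kmpc_end_single(&loc, gtid);
      }
      __kmpc_barrier(&loc, gtid); // omitted when nowait is specified

  matching the note above: the barrier is the compiler's responsibility, not
  part of the two "single" calls.
*/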
1783 */ 1784 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) { 1785 __kmp_exit_single(global_tid); 1786 KMP_POP_PARTITIONED_TIMER(); 1787 1788 #if OMPT_SUPPORT && OMPT_OPTIONAL 1789 kmp_info_t *this_thr = __kmp_threads[global_tid]; 1790 kmp_team_t *team = this_thr->th.th_team; 1791 int tid = __kmp_tid_from_gtid(global_tid); 1792 1793 if (ompt_enabled.ompt_callback_work) { 1794 ompt_callbacks.ompt_callback(ompt_callback_work)( 1795 ompt_work_single_executor, ompt_scope_end, 1796 &(team->t.ompt_team_info.parallel_data), 1797 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1, 1798 OMPT_GET_RETURN_ADDRESS(0)); 1799 } 1800 #endif 1801 } 1802 1803 /*! 1804 @ingroup WORK_SHARING 1805 @param loc Source location 1806 @param global_tid Global thread id 1807 1808 Mark the end of a statically scheduled loop. 1809 */ 1810 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) { 1811 KMP_POP_PARTITIONED_TIMER(); 1812 KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid)); 1813 1814 #if OMPT_SUPPORT && OMPT_OPTIONAL 1815 if (ompt_enabled.ompt_callback_work) { 1816 ompt_work_t ompt_work_type = ompt_work_loop; 1817 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 1818 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 1819 // Determine workshare type 1820 if (loc != NULL) { 1821 if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) { 1822 ompt_work_type = ompt_work_loop; 1823 } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) { 1824 ompt_work_type = ompt_work_sections; 1825 } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) { 1826 ompt_work_type = ompt_work_distribute; 1827 } else { 1828 // use default set above. 1829 // a warning about this case is provided in __kmpc_for_static_init 1830 } 1831 KMP_DEBUG_ASSERT(ompt_work_type); 1832 } 1833 ompt_callbacks.ompt_callback(ompt_callback_work)( 1834 ompt_work_type, ompt_scope_end, &(team_info->parallel_data), 1835 &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0)); 1836 } 1837 #endif 1838 if (__kmp_env_consistency_check) 1839 __kmp_pop_workshare(global_tid, ct_pdo, loc); 1840 } 1841 1842 // User routines which take C-style arguments (call by value) 1843 // different from the Fortran equivalent routines 1844 1845 void ompc_set_num_threads(int arg) { 1846 // !!!!! TODO: check the per-task binding 1847 __kmp_set_num_threads(arg, __kmp_entry_gtid()); 1848 } 1849 1850 void ompc_set_dynamic(int flag) { 1851 kmp_info_t *thread; 1852 1853 /* For the thread-private implementation of the internal controls */ 1854 thread = __kmp_entry_thread(); 1855 1856 __kmp_save_internal_controls(thread); 1857 1858 set__dynamic(thread, flag ? TRUE : FALSE); 1859 } 1860 1861 void ompc_set_nested(int flag) { 1862 kmp_info_t *thread; 1863 1864 /* For the thread-private internal controls implementation */ 1865 thread = __kmp_entry_thread(); 1866 1867 __kmp_save_internal_controls(thread); 1868 1869 set__nested(thread, flag ? TRUE : FALSE); 1870 } 1871 1872 void ompc_set_max_active_levels(int max_active_levels) { 1873 /* TO DO */ 1874 /* we want per-task implementation of this internal control */ 1875 1876 /* For the per-thread internal controls implementation */ 1877 __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels); 1878 } 1879 1880 void ompc_set_schedule(omp_sched_t kind, int modifier) { 1881 // !!!!! 
TODO: check the per-task binding 1882 __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier); 1883 } 1884 1885 int ompc_get_ancestor_thread_num(int level) { 1886 return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level); 1887 } 1888 1889 int ompc_get_team_size(int level) { 1890 return __kmp_get_team_size(__kmp_entry_gtid(), level); 1891 } 1892 1893 #if OMP_50_ENABLED 1894 /* OpenMP 5.0 Affinity Format API */ 1895 1896 void ompc_set_affinity_format(char const *format) { 1897 if (!__kmp_init_serial) { 1898 __kmp_serial_initialize(); 1899 } 1900 __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE, 1901 format, KMP_STRLEN(format) + 1); 1902 } 1903 1904 size_t ompc_get_affinity_format(char *buffer, size_t size) { 1905 size_t format_size; 1906 if (!__kmp_init_serial) { 1907 __kmp_serial_initialize(); 1908 } 1909 format_size = KMP_STRLEN(__kmp_affinity_format); 1910 if (buffer && size) { 1911 __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format, 1912 format_size + 1); 1913 } 1914 return format_size; 1915 } 1916 1917 void ompc_display_affinity(char const *format) { 1918 int gtid; 1919 if (!TCR_4(__kmp_init_middle)) { 1920 __kmp_middle_initialize(); 1921 } 1922 gtid = __kmp_get_gtid(); 1923 __kmp_aux_display_affinity(gtid, format); 1924 } 1925 1926 size_t ompc_capture_affinity(char *buffer, size_t buf_size, 1927 char const *format) { 1928 int gtid; 1929 size_t num_required; 1930 kmp_str_buf_t capture_buf; 1931 if (!TCR_4(__kmp_init_middle)) { 1932 __kmp_middle_initialize(); 1933 } 1934 gtid = __kmp_get_gtid(); 1935 __kmp_str_buf_init(&capture_buf); 1936 num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf); 1937 if (buffer && buf_size) { 1938 __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str, 1939 capture_buf.used + 1); 1940 } 1941 __kmp_str_buf_free(&capture_buf); 1942 return num_required; 1943 } 1944 #endif /* OMP_50_ENABLED */ 1945 1946 void kmpc_set_stacksize(int arg) { 1947 // __kmp_aux_set_stacksize initializes the library if needed 1948 __kmp_aux_set_stacksize(arg); 1949 } 1950 1951 void kmpc_set_stacksize_s(size_t arg) { 1952 // __kmp_aux_set_stacksize initializes the library if needed 1953 __kmp_aux_set_stacksize(arg); 1954 } 1955 1956 void kmpc_set_blocktime(int arg) { 1957 int gtid, tid; 1958 kmp_info_t *thread; 1959 1960 gtid = __kmp_entry_gtid(); 1961 tid = __kmp_tid_from_gtid(gtid); 1962 thread = __kmp_thread_from_gtid(gtid); 1963 1964 __kmp_aux_set_blocktime(arg, thread, tid); 1965 } 1966 1967 void kmpc_set_library(int arg) { 1968 // __kmp_user_set_library initializes the library if needed 1969 __kmp_user_set_library((enum library_type)arg); 1970 } 1971 1972 void kmpc_set_defaults(char const *str) { 1973 // __kmp_aux_set_defaults initializes the library if needed 1974 __kmp_aux_set_defaults(str, KMP_STRLEN(str)); 1975 } 1976 1977 void kmpc_set_disp_num_buffers(int arg) { 1978 // ignore after initialization because some teams have already 1979 // allocated dispatch buffers 1980 if (__kmp_init_serial == 0 && arg > 0) 1981 __kmp_dispatch_num_buffers = arg; 1982 } 1983 1984 int kmpc_set_affinity_mask_proc(int proc, void **mask) { 1985 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED 1986 return -1; 1987 #else 1988 if (!TCR_4(__kmp_init_middle)) { 1989 __kmp_middle_initialize(); 1990 } 1991 return __kmp_aux_set_affinity_mask_proc(proc, mask); 1992 #endif 1993 } 1994 1995 int kmpc_unset_affinity_mask_proc(int proc, void **mask) { 1996 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED 1997 return -1; 1998 #else 1999 if 
(!TCR_4(__kmp_init_middle)) { 2000 __kmp_middle_initialize(); 2001 } 2002 return __kmp_aux_unset_affinity_mask_proc(proc, mask); 2003 #endif 2004 } 2005 2006 int kmpc_get_affinity_mask_proc(int proc, void **mask) { 2007 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED 2008 return -1; 2009 #else 2010 if (!TCR_4(__kmp_init_middle)) { 2011 __kmp_middle_initialize(); 2012 } 2013 return __kmp_aux_get_affinity_mask_proc(proc, mask); 2014 #endif 2015 } 2016 2017 /* -------------------------------------------------------------------------- */ 2018 /*! 2019 @ingroup THREADPRIVATE 2020 @param loc source location information 2021 @param gtid global thread number 2022 @param cpy_size size of the cpy_data buffer 2023 @param cpy_data pointer to data to be copied 2024 @param cpy_func helper function to call for copying data 2025 @param didit flag variable: 1=single thread; 0=not single thread 2026 2027 __kmpc_copyprivate implements the interface for the private data broadcast 2028 needed for the copyprivate clause associated with a single region in an 2029 OpenMP<sup>*</sup> program (both C and Fortran). 2030 All threads participating in the parallel region call this routine. 2031 One of the threads (called the single thread) should have the <tt>didit</tt> 2032 variable set to 1 and all other threads should have that variable set to 0. 2033 All threads pass a pointer to a data buffer (cpy_data) that they have built. 2034 2035 The OpenMP specification forbids the use of nowait on the single region when a 2036 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a 2037 barrier internally to avoid race conditions, so the code generation for the 2038 single region should avoid generating a barrier after the call to @ref 2039 __kmpc_copyprivate. 2040 2041 The <tt>gtid</tt> parameter is the global thread id for the current thread. 2042 The <tt>loc</tt> parameter is a pointer to source location information. 2043 2044 Internal implementation: The single thread will first copy its descriptor 2045 address (cpy_data) to a team-private location, then the other threads will each 2046 call the function pointed to by the parameter cpy_func, which carries out the 2047 copy by copying the data using the cpy_data buffer. 2048 2049 The cpy_func routine used for the copy and the contents of the data area defined 2050 by cpy_data and cpy_size may be built in any fashion that will allow the copy 2051 to be done. For instance, the cpy_data buffer can hold the actual data to be 2052 copied or it may hold a list of pointers to the data. The cpy_func routine must 2053 interpret the cpy_data buffer appropriately. 2054 2055 The interface to cpy_func is as follows: 2056 @code 2057 void cpy_func( void *destination, void *source ) 2058 @endcode 2059 where void *destination is the cpy_data pointer for the thread being copied to 2060 and void *source is the cpy_data pointer for the thread being copied from. 
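As an illustration only (a hand-written sketch, not the code any particular
compiler is required to emit), broadcasting a single <tt>int</tt> produced
inside the single region might look like:

@code
static void copy_x(void *dst, void *src) {
  // each thread's cpy_data is simply the address of its copy of x
  *(int *)dst = *(int *)src;
}

int x;
kmp_int32 didit = __kmpc_single(loc, gtid);
if (didit) {
  x = 42; // value produced by the single thread
  __kmpc_end_single(loc, gtid);
}
// every thread of the team makes this call; no barrier should follow it
__kmpc_copyprivate(loc, gtid, sizeof(x), &x, copy_x, didit);
@endcode

The helper <tt>copy_x</tt> above is invented for the example.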
2061 */ 2062 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size, 2063 void *cpy_data, void (*cpy_func)(void *, void *), 2064 kmp_int32 didit) { 2065 void **data_ptr; 2066 2067 KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid)); 2068 2069 KMP_MB(); 2070 2071 data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data; 2072 2073 if (__kmp_env_consistency_check) { 2074 if (loc == 0) { 2075 KMP_WARNING(ConstructIdentInvalid); 2076 } 2077 } 2078 2079 // ToDo: Optimize the following two barriers into some kind of split barrier 2080 2081 if (didit) 2082 *data_ptr = cpy_data; 2083 2084 #if OMPT_SUPPORT 2085 ompt_frame_t *ompt_frame; 2086 if (ompt_enabled.enabled) { 2087 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); 2088 if (ompt_frame->enter_frame.ptr == NULL) 2089 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 2090 OMPT_STORE_RETURN_ADDRESS(gtid); 2091 } 2092 #endif 2093 /* This barrier is not a barrier region boundary */ 2094 #if USE_ITT_NOTIFY 2095 __kmp_threads[gtid]->th.th_ident = loc; 2096 #endif 2097 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); 2098 2099 if (!didit) 2100 (*cpy_func)(cpy_data, *data_ptr); 2101 2102 // Consider next barrier a user-visible barrier for barrier region boundaries 2103 // Nesting checks are already handled by the single construct checks 2104 2105 #if OMPT_SUPPORT 2106 if (ompt_enabled.enabled) { 2107 OMPT_STORE_RETURN_ADDRESS(gtid); 2108 } 2109 #endif 2110 #if USE_ITT_NOTIFY 2111 __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g. 2112 // tasks can overwrite the location) 2113 #endif 2114 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); 2115 #if OMPT_SUPPORT && OMPT_OPTIONAL 2116 if (ompt_enabled.enabled) { 2117 ompt_frame->enter_frame = ompt_data_none; 2118 } 2119 #endif 2120 } 2121 2122 /* -------------------------------------------------------------------------- */ 2123 2124 #define INIT_LOCK __kmp_init_user_lock_with_checks 2125 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks 2126 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks 2127 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed 2128 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks 2129 #define ACQUIRE_NESTED_LOCK_TIMED \ 2130 __kmp_acquire_nested_user_lock_with_checks_timed 2131 #define RELEASE_LOCK __kmp_release_user_lock_with_checks 2132 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks 2133 #define TEST_LOCK __kmp_test_user_lock_with_checks 2134 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks 2135 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks 2136 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks 2137 2138 // TODO: Make check abort messages use location info & pass it into 2139 // with_checks routines 2140 2141 #if KMP_USE_DYNAMIC_LOCK 2142 2143 // internal lock initializer 2144 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock, 2145 kmp_dyna_lockseq_t seq) { 2146 if (KMP_IS_D_LOCK(seq)) { 2147 KMP_INIT_D_LOCK(lock, seq); 2148 #if USE_ITT_BUILD 2149 __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL); 2150 #endif 2151 } else { 2152 KMP_INIT_I_LOCK(lock, seq); 2153 #if USE_ITT_BUILD 2154 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock); 2155 __kmp_itt_lock_creating(ilk->lock, loc); 2156 #endif 2157 } 2158 } 2159 2160 // internal nest lock initializer 2161 static __forceinline void 2162 __kmp_init_nest_lock_with_hint(ident_t *loc, 
void **lock, 2163 kmp_dyna_lockseq_t seq) { 2164 #if KMP_USE_TSX 2165 // Don't have nested lock implementation for speculative locks 2166 if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive) 2167 seq = __kmp_user_lock_seq; 2168 #endif 2169 switch (seq) { 2170 case lockseq_tas: 2171 seq = lockseq_nested_tas; 2172 break; 2173 #if KMP_USE_FUTEX 2174 case lockseq_futex: 2175 seq = lockseq_nested_futex; 2176 break; 2177 #endif 2178 case lockseq_ticket: 2179 seq = lockseq_nested_ticket; 2180 break; 2181 case lockseq_queuing: 2182 seq = lockseq_nested_queuing; 2183 break; 2184 case lockseq_drdpa: 2185 seq = lockseq_nested_drdpa; 2186 break; 2187 default: 2188 seq = lockseq_nested_queuing; 2189 } 2190 KMP_INIT_I_LOCK(lock, seq); 2191 #if USE_ITT_BUILD 2192 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock); 2193 __kmp_itt_lock_creating(ilk->lock, loc); 2194 #endif 2195 } 2196 2197 /* initialize the lock with a hint */ 2198 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock, 2199 uintptr_t hint) { 2200 KMP_DEBUG_ASSERT(__kmp_init_serial); 2201 if (__kmp_env_consistency_check && user_lock == NULL) { 2202 KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint"); 2203 } 2204 2205 __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint)); 2206 2207 #if OMPT_SUPPORT && OMPT_OPTIONAL 2208 // This is the case, if called from omp_init_lock_with_hint: 2209 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2210 if (!codeptr) 2211 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2212 if (ompt_enabled.ompt_callback_lock_init) { 2213 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2214 ompt_mutex_lock, (omp_lock_hint_t)hint, 2215 __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock, 2216 codeptr); 2217 } 2218 #endif 2219 } 2220 2221 /* initialize the lock with a hint */ 2222 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid, 2223 void **user_lock, uintptr_t hint) { 2224 KMP_DEBUG_ASSERT(__kmp_init_serial); 2225 if (__kmp_env_consistency_check && user_lock == NULL) { 2226 KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint"); 2227 } 2228 2229 __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint)); 2230 2231 #if OMPT_SUPPORT && OMPT_OPTIONAL 2232 // This is the case, if called from omp_init_lock_with_hint: 2233 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2234 if (!codeptr) 2235 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2236 if (ompt_enabled.ompt_callback_lock_init) { 2237 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2238 ompt_mutex_nest_lock, (omp_lock_hint_t)hint, 2239 __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock, 2240 codeptr); 2241 } 2242 #endif 2243 } 2244 2245 #endif // KMP_USE_DYNAMIC_LOCK 2246 2247 /* initialize the lock */ 2248 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2249 #if KMP_USE_DYNAMIC_LOCK 2250 2251 KMP_DEBUG_ASSERT(__kmp_init_serial); 2252 if (__kmp_env_consistency_check && user_lock == NULL) { 2253 KMP_FATAL(LockIsUninitialized, "omp_init_lock"); 2254 } 2255 __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq); 2256 2257 #if OMPT_SUPPORT && OMPT_OPTIONAL 2258 // This is the case, if called from omp_init_lock_with_hint: 2259 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2260 if (!codeptr) 2261 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2262 if (ompt_enabled.ompt_callback_lock_init) { 2263 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2264 ompt_mutex_lock, omp_lock_hint_none, 2265 
__ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock, 2266 codeptr); 2267 } 2268 #endif 2269 2270 #else // KMP_USE_DYNAMIC_LOCK 2271 2272 static char const *const func = "omp_init_lock"; 2273 kmp_user_lock_p lck; 2274 KMP_DEBUG_ASSERT(__kmp_init_serial); 2275 2276 if (__kmp_env_consistency_check) { 2277 if (user_lock == NULL) { 2278 KMP_FATAL(LockIsUninitialized, func); 2279 } 2280 } 2281 2282 KMP_CHECK_USER_LOCK_INIT(); 2283 2284 if ((__kmp_user_lock_kind == lk_tas) && 2285 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2286 lck = (kmp_user_lock_p)user_lock; 2287 } 2288 #if KMP_USE_FUTEX 2289 else if ((__kmp_user_lock_kind == lk_futex) && 2290 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2291 lck = (kmp_user_lock_p)user_lock; 2292 } 2293 #endif 2294 else { 2295 lck = __kmp_user_lock_allocate(user_lock, gtid, 0); 2296 } 2297 INIT_LOCK(lck); 2298 __kmp_set_user_lock_location(lck, loc); 2299 2300 #if OMPT_SUPPORT && OMPT_OPTIONAL 2301 // This is the case, if called from omp_init_lock_with_hint: 2302 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2303 if (!codeptr) 2304 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2305 if (ompt_enabled.ompt_callback_lock_init) { 2306 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2307 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), 2308 (ompt_wait_id_t)user_lock, codeptr); 2309 } 2310 #endif 2311 2312 #if USE_ITT_BUILD 2313 __kmp_itt_lock_creating(lck); 2314 #endif /* USE_ITT_BUILD */ 2315 2316 #endif // KMP_USE_DYNAMIC_LOCK 2317 } // __kmpc_init_lock 2318 2319 /* initialize the lock */ 2320 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2321 #if KMP_USE_DYNAMIC_LOCK 2322 2323 KMP_DEBUG_ASSERT(__kmp_init_serial); 2324 if (__kmp_env_consistency_check && user_lock == NULL) { 2325 KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock"); 2326 } 2327 __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq); 2328 2329 #if OMPT_SUPPORT && OMPT_OPTIONAL 2330 // This is the case, if called from omp_init_lock_with_hint: 2331 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2332 if (!codeptr) 2333 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2334 if (ompt_enabled.ompt_callback_lock_init) { 2335 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2336 ompt_mutex_nest_lock, omp_lock_hint_none, 2337 __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock, 2338 codeptr); 2339 } 2340 #endif 2341 2342 #else // KMP_USE_DYNAMIC_LOCK 2343 2344 static char const *const func = "omp_init_nest_lock"; 2345 kmp_user_lock_p lck; 2346 KMP_DEBUG_ASSERT(__kmp_init_serial); 2347 2348 if (__kmp_env_consistency_check) { 2349 if (user_lock == NULL) { 2350 KMP_FATAL(LockIsUninitialized, func); 2351 } 2352 } 2353 2354 KMP_CHECK_USER_LOCK_INIT(); 2355 2356 if ((__kmp_user_lock_kind == lk_tas) && 2357 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2358 OMP_NEST_LOCK_T_SIZE)) { 2359 lck = (kmp_user_lock_p)user_lock; 2360 } 2361 #if KMP_USE_FUTEX 2362 else if ((__kmp_user_lock_kind == lk_futex) && 2363 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2364 OMP_NEST_LOCK_T_SIZE)) { 2365 lck = (kmp_user_lock_p)user_lock; 2366 } 2367 #endif 2368 else { 2369 lck = __kmp_user_lock_allocate(user_lock, gtid, 0); 2370 } 2371 2372 INIT_NESTED_LOCK(lck); 2373 __kmp_set_user_lock_location(lck, loc); 2374 2375 #if OMPT_SUPPORT && OMPT_OPTIONAL 2376 // This is the case, if called from omp_init_lock_with_hint: 2377 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2378 if (!codeptr) 2379 codeptr = 
OMPT_GET_RETURN_ADDRESS(0); 2380 if (ompt_enabled.ompt_callback_lock_init) { 2381 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2382 ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), 2383 (ompt_wait_id_t)user_lock, codeptr); 2384 } 2385 #endif 2386 2387 #if USE_ITT_BUILD 2388 __kmp_itt_lock_creating(lck); 2389 #endif /* USE_ITT_BUILD */ 2390 2391 #endif // KMP_USE_DYNAMIC_LOCK 2392 } // __kmpc_init_nest_lock 2393 2394 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2395 #if KMP_USE_DYNAMIC_LOCK 2396 2397 #if USE_ITT_BUILD 2398 kmp_user_lock_p lck; 2399 if (KMP_EXTRACT_D_TAG(user_lock) == 0) { 2400 lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock; 2401 } else { 2402 lck = (kmp_user_lock_p)user_lock; 2403 } 2404 __kmp_itt_lock_destroyed(lck); 2405 #endif 2406 #if OMPT_SUPPORT && OMPT_OPTIONAL 2407 // This is the case, if called from omp_init_lock_with_hint: 2408 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2409 if (!codeptr) 2410 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2411 if (ompt_enabled.ompt_callback_lock_destroy) { 2412 kmp_user_lock_p lck; 2413 if (KMP_EXTRACT_D_TAG(user_lock) == 0) { 2414 lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock; 2415 } else { 2416 lck = (kmp_user_lock_p)user_lock; 2417 } 2418 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( 2419 ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr); 2420 } 2421 #endif 2422 KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock); 2423 #else 2424 kmp_user_lock_p lck; 2425 2426 if ((__kmp_user_lock_kind == lk_tas) && 2427 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2428 lck = (kmp_user_lock_p)user_lock; 2429 } 2430 #if KMP_USE_FUTEX 2431 else if ((__kmp_user_lock_kind == lk_futex) && 2432 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2433 lck = (kmp_user_lock_p)user_lock; 2434 } 2435 #endif 2436 else { 2437 lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock"); 2438 } 2439 2440 #if OMPT_SUPPORT && OMPT_OPTIONAL 2441 // This is the case, if called from omp_init_lock_with_hint: 2442 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2443 if (!codeptr) 2444 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2445 if (ompt_enabled.ompt_callback_lock_destroy) { 2446 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( 2447 ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr); 2448 } 2449 #endif 2450 2451 #if USE_ITT_BUILD 2452 __kmp_itt_lock_destroyed(lck); 2453 #endif /* USE_ITT_BUILD */ 2454 DESTROY_LOCK(lck); 2455 2456 if ((__kmp_user_lock_kind == lk_tas) && 2457 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2458 ; 2459 } 2460 #if KMP_USE_FUTEX 2461 else if ((__kmp_user_lock_kind == lk_futex) && 2462 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2463 ; 2464 } 2465 #endif 2466 else { 2467 __kmp_user_lock_free(user_lock, gtid, lck); 2468 } 2469 #endif // KMP_USE_DYNAMIC_LOCK 2470 } // __kmpc_destroy_lock 2471 2472 /* destroy the lock */ 2473 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2474 #if KMP_USE_DYNAMIC_LOCK 2475 2476 #if USE_ITT_BUILD 2477 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock); 2478 __kmp_itt_lock_destroyed(ilk->lock); 2479 #endif 2480 #if OMPT_SUPPORT && OMPT_OPTIONAL 2481 // This is the case, if called from omp_init_lock_with_hint: 2482 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2483 if (!codeptr) 2484 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2485 if (ompt_enabled.ompt_callback_lock_destroy) { 2486 
ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( 2487 ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr); 2488 } 2489 #endif 2490 KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock); 2491 2492 #else // KMP_USE_DYNAMIC_LOCK 2493 2494 kmp_user_lock_p lck; 2495 2496 if ((__kmp_user_lock_kind == lk_tas) && 2497 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2498 OMP_NEST_LOCK_T_SIZE)) { 2499 lck = (kmp_user_lock_p)user_lock; 2500 } 2501 #if KMP_USE_FUTEX 2502 else if ((__kmp_user_lock_kind == lk_futex) && 2503 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2504 OMP_NEST_LOCK_T_SIZE)) { 2505 lck = (kmp_user_lock_p)user_lock; 2506 } 2507 #endif 2508 else { 2509 lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock"); 2510 } 2511 2512 #if OMPT_SUPPORT && OMPT_OPTIONAL 2513 // This is the case, if called from omp_init_lock_with_hint: 2514 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2515 if (!codeptr) 2516 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2517 if (ompt_enabled.ompt_callback_lock_destroy) { 2518 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( 2519 ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr); 2520 } 2521 #endif 2522 2523 #if USE_ITT_BUILD 2524 __kmp_itt_lock_destroyed(lck); 2525 #endif /* USE_ITT_BUILD */ 2526 2527 DESTROY_NESTED_LOCK(lck); 2528 2529 if ((__kmp_user_lock_kind == lk_tas) && 2530 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2531 OMP_NEST_LOCK_T_SIZE)) { 2532 ; 2533 } 2534 #if KMP_USE_FUTEX 2535 else if ((__kmp_user_lock_kind == lk_futex) && 2536 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2537 OMP_NEST_LOCK_T_SIZE)) { 2538 ; 2539 } 2540 #endif 2541 else { 2542 __kmp_user_lock_free(user_lock, gtid, lck); 2543 } 2544 #endif // KMP_USE_DYNAMIC_LOCK 2545 } // __kmpc_destroy_nest_lock 2546 2547 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2548 KMP_COUNT_BLOCK(OMP_set_lock); 2549 #if KMP_USE_DYNAMIC_LOCK 2550 int tag = KMP_EXTRACT_D_TAG(user_lock); 2551 #if USE_ITT_BUILD 2552 __kmp_itt_lock_acquiring( 2553 (kmp_user_lock_p) 2554 user_lock); // itt function will get to the right lock object. 
2555 #endif 2556 #if OMPT_SUPPORT && OMPT_OPTIONAL 2557 // This is the case, if called from omp_init_lock_with_hint: 2558 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2559 if (!codeptr) 2560 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2561 if (ompt_enabled.ompt_callback_mutex_acquire) { 2562 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2563 ompt_mutex_lock, omp_lock_hint_none, 2564 __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock, 2565 codeptr); 2566 } 2567 #endif 2568 #if KMP_USE_INLINED_TAS 2569 if (tag == locktag_tas && !__kmp_env_consistency_check) { 2570 KMP_ACQUIRE_TAS_LOCK(user_lock, gtid); 2571 } else 2572 #elif KMP_USE_INLINED_FUTEX 2573 if (tag == locktag_futex && !__kmp_env_consistency_check) { 2574 KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid); 2575 } else 2576 #endif 2577 { 2578 __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid); 2579 } 2580 #if USE_ITT_BUILD 2581 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 2582 #endif 2583 #if OMPT_SUPPORT && OMPT_OPTIONAL 2584 if (ompt_enabled.ompt_callback_mutex_acquired) { 2585 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2586 ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr); 2587 } 2588 #endif 2589 2590 #else // KMP_USE_DYNAMIC_LOCK 2591 2592 kmp_user_lock_p lck; 2593 2594 if ((__kmp_user_lock_kind == lk_tas) && 2595 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2596 lck = (kmp_user_lock_p)user_lock; 2597 } 2598 #if KMP_USE_FUTEX 2599 else if ((__kmp_user_lock_kind == lk_futex) && 2600 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2601 lck = (kmp_user_lock_p)user_lock; 2602 } 2603 #endif 2604 else { 2605 lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock"); 2606 } 2607 2608 #if USE_ITT_BUILD 2609 __kmp_itt_lock_acquiring(lck); 2610 #endif /* USE_ITT_BUILD */ 2611 #if OMPT_SUPPORT && OMPT_OPTIONAL 2612 // This is the case, if called from omp_init_lock_with_hint: 2613 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2614 if (!codeptr) 2615 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2616 if (ompt_enabled.ompt_callback_mutex_acquire) { 2617 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2618 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), 2619 (ompt_wait_id_t)lck, codeptr); 2620 } 2621 #endif 2622 2623 ACQUIRE_LOCK(lck, gtid); 2624 2625 #if USE_ITT_BUILD 2626 __kmp_itt_lock_acquired(lck); 2627 #endif /* USE_ITT_BUILD */ 2628 2629 #if OMPT_SUPPORT && OMPT_OPTIONAL 2630 if (ompt_enabled.ompt_callback_mutex_acquired) { 2631 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2632 ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr); 2633 } 2634 #endif 2635 2636 #endif // KMP_USE_DYNAMIC_LOCK 2637 } 2638 2639 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2640 #if KMP_USE_DYNAMIC_LOCK 2641 2642 #if USE_ITT_BUILD 2643 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); 2644 #endif 2645 #if OMPT_SUPPORT && OMPT_OPTIONAL 2646 // This is the case, if called from omp_init_lock_with_hint: 2647 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2648 if (!codeptr) 2649 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2650 if (ompt_enabled.enabled) { 2651 if (ompt_enabled.ompt_callback_mutex_acquire) { 2652 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2653 ompt_mutex_nest_lock, omp_lock_hint_none, 2654 __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock, 2655 codeptr); 2656 } 2657 } 2658 #endif 2659 int acquire_status = 2660 KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid); 2661 (void) acquire_status; 
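// acquire_status distinguishes the first acquisition of the nest lock
// (KMP_LOCK_ACQUIRED_FIRST) from a nested re-acquisition. It is only consumed
// by the OMPT callbacks below, so the (void) cast keeps builds without
// OMPT_SUPPORT && OMPT_OPTIONAL free of unused-variable warnings.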
2662 #if USE_ITT_BUILD 2663 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 2664 #endif 2665 2666 #if OMPT_SUPPORT && OMPT_OPTIONAL 2667 if (ompt_enabled.enabled) { 2668 if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) { 2669 if (ompt_enabled.ompt_callback_mutex_acquired) { 2670 // lock_first 2671 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2672 ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr); 2673 } 2674 } else { 2675 if (ompt_enabled.ompt_callback_nest_lock) { 2676 // lock_next 2677 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 2678 ompt_scope_begin, (ompt_wait_id_t)user_lock, codeptr); 2679 } 2680 } 2681 } 2682 #endif 2683 2684 #else // KMP_USE_DYNAMIC_LOCK 2685 int acquire_status; 2686 kmp_user_lock_p lck; 2687 2688 if ((__kmp_user_lock_kind == lk_tas) && 2689 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2690 OMP_NEST_LOCK_T_SIZE)) { 2691 lck = (kmp_user_lock_p)user_lock; 2692 } 2693 #if KMP_USE_FUTEX 2694 else if ((__kmp_user_lock_kind == lk_futex) && 2695 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2696 OMP_NEST_LOCK_T_SIZE)) { 2697 lck = (kmp_user_lock_p)user_lock; 2698 } 2699 #endif 2700 else { 2701 lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock"); 2702 } 2703 2704 #if USE_ITT_BUILD 2705 __kmp_itt_lock_acquiring(lck); 2706 #endif /* USE_ITT_BUILD */ 2707 #if OMPT_SUPPORT && OMPT_OPTIONAL 2708 // This is the case, if called from omp_init_lock_with_hint: 2709 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2710 if (!codeptr) 2711 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2712 if (ompt_enabled.enabled) { 2713 if (ompt_enabled.ompt_callback_mutex_acquire) { 2714 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2715 ompt_mutex_nest_lock, omp_lock_hint_none, 2716 __ompt_get_mutex_impl_type(), (ompt_wait_id_t)lck, codeptr); 2717 } 2718 } 2719 #endif 2720 2721 ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status); 2722 2723 #if USE_ITT_BUILD 2724 __kmp_itt_lock_acquired(lck); 2725 #endif /* USE_ITT_BUILD */ 2726 2727 #if OMPT_SUPPORT && OMPT_OPTIONAL 2728 if (ompt_enabled.enabled) { 2729 if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) { 2730 if (ompt_enabled.ompt_callback_mutex_acquired) { 2731 // lock_first 2732 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2733 ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr); 2734 } 2735 } else { 2736 if (ompt_enabled.ompt_callback_nest_lock) { 2737 // lock_next 2738 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 2739 ompt_scope_begin, (ompt_wait_id_t)lck, codeptr); 2740 } 2741 } 2742 } 2743 #endif 2744 2745 #endif // KMP_USE_DYNAMIC_LOCK 2746 } 2747 2748 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2749 #if KMP_USE_DYNAMIC_LOCK 2750 2751 int tag = KMP_EXTRACT_D_TAG(user_lock); 2752 #if USE_ITT_BUILD 2753 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2754 #endif 2755 #if KMP_USE_INLINED_TAS 2756 if (tag == locktag_tas && !__kmp_env_consistency_check) { 2757 KMP_RELEASE_TAS_LOCK(user_lock, gtid); 2758 } else 2759 #elif KMP_USE_INLINED_FUTEX 2760 if (tag == locktag_futex && !__kmp_env_consistency_check) { 2761 KMP_RELEASE_FUTEX_LOCK(user_lock, gtid); 2762 } else 2763 #endif 2764 { 2765 __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid); 2766 } 2767 2768 #if OMPT_SUPPORT && OMPT_OPTIONAL 2769 // This is the case, if called from omp_init_lock_with_hint: 2770 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2771 if (!codeptr) 2772 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2773 if 
(ompt_enabled.ompt_callback_mutex_released) { 2774 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2775 ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr); 2776 } 2777 #endif 2778 2779 #else // KMP_USE_DYNAMIC_LOCK 2780 2781 kmp_user_lock_p lck; 2782 2783 /* Can't use serial interval since not block structured */ 2784 /* release the lock */ 2785 2786 if ((__kmp_user_lock_kind == lk_tas) && 2787 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2788 #if KMP_OS_LINUX && \ 2789 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 2790 // "fast" path implemented to fix customer performance issue 2791 #if USE_ITT_BUILD 2792 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2793 #endif /* USE_ITT_BUILD */ 2794 TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0); 2795 KMP_MB(); 2796 2797 #if OMPT_SUPPORT && OMPT_OPTIONAL 2798 // This is the case, if called from omp_init_lock_with_hint: 2799 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2800 if (!codeptr) 2801 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2802 if (ompt_enabled.ompt_callback_mutex_released) { 2803 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2804 ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr); 2805 } 2806 #endif 2807 2808 return; 2809 #else 2810 lck = (kmp_user_lock_p)user_lock; 2811 #endif 2812 } 2813 #if KMP_USE_FUTEX 2814 else if ((__kmp_user_lock_kind == lk_futex) && 2815 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2816 lck = (kmp_user_lock_p)user_lock; 2817 } 2818 #endif 2819 else { 2820 lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock"); 2821 } 2822 2823 #if USE_ITT_BUILD 2824 __kmp_itt_lock_releasing(lck); 2825 #endif /* USE_ITT_BUILD */ 2826 2827 RELEASE_LOCK(lck, gtid); 2828 2829 #if OMPT_SUPPORT && OMPT_OPTIONAL 2830 // This is the case, if called from omp_init_lock_with_hint: 2831 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2832 if (!codeptr) 2833 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2834 if (ompt_enabled.ompt_callback_mutex_released) { 2835 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2836 ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr); 2837 } 2838 #endif 2839 2840 #endif // KMP_USE_DYNAMIC_LOCK 2841 } 2842 2843 /* release the lock */ 2844 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2845 #if KMP_USE_DYNAMIC_LOCK 2846 2847 #if USE_ITT_BUILD 2848 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2849 #endif 2850 int release_status = 2851 KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid); 2852 (void) release_status; 2853 2854 #if OMPT_SUPPORT && OMPT_OPTIONAL 2855 // This is the case, if called from omp_init_lock_with_hint: 2856 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2857 if (!codeptr) 2858 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2859 if (ompt_enabled.enabled) { 2860 if (release_status == KMP_LOCK_RELEASED) { 2861 if (ompt_enabled.ompt_callback_mutex_released) { 2862 // release_lock_last 2863 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2864 ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr); 2865 } 2866 } else if (ompt_enabled.ompt_callback_nest_lock) { 2867 // release_lock_prev 2868 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 2869 ompt_scope_end, (ompt_wait_id_t)user_lock, codeptr); 2870 } 2871 } 2872 #endif 2873 2874 #else // KMP_USE_DYNAMIC_LOCK 2875 2876 kmp_user_lock_p lck; 2877 2878 /* Can't use serial interval since not block structured */ 2879 2880 if ((__kmp_user_lock_kind == lk_tas) && 2881 (sizeof(lck->tas.lk.poll) + 
sizeof(lck->tas.lk.depth_locked) <= 2882 OMP_NEST_LOCK_T_SIZE)) { 2883 #if KMP_OS_LINUX && \ 2884 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 2885 // "fast" path implemented to fix customer performance issue 2886 kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock; 2887 #if USE_ITT_BUILD 2888 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2889 #endif /* USE_ITT_BUILD */ 2890 2891 #if OMPT_SUPPORT && OMPT_OPTIONAL 2892 int release_status = KMP_LOCK_STILL_HELD; 2893 #endif 2894 2895 if (--(tl->lk.depth_locked) == 0) { 2896 TCW_4(tl->lk.poll, 0); 2897 #if OMPT_SUPPORT && OMPT_OPTIONAL 2898 release_status = KMP_LOCK_RELEASED; 2899 #endif 2900 } 2901 KMP_MB(); 2902 2903 #if OMPT_SUPPORT && OMPT_OPTIONAL 2904 // This is the case, if called from omp_init_lock_with_hint: 2905 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2906 if (!codeptr) 2907 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2908 if (ompt_enabled.enabled) { 2909 if (release_status == KMP_LOCK_RELEASED) { 2910 if (ompt_enabled.ompt_callback_mutex_released) { 2911 // release_lock_last 2912 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2913 ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr); 2914 } 2915 } else if (ompt_enabled.ompt_callback_nest_lock) { 2916 // release_lock_previous 2917 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 2918 ompt_mutex_scope_end, (ompt_wait_id_t)lck, codeptr); 2919 } 2920 } 2921 #endif 2922 2923 return; 2924 #else 2925 lck = (kmp_user_lock_p)user_lock; 2926 #endif 2927 } 2928 #if KMP_USE_FUTEX 2929 else if ((__kmp_user_lock_kind == lk_futex) && 2930 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2931 OMP_NEST_LOCK_T_SIZE)) { 2932 lck = (kmp_user_lock_p)user_lock; 2933 } 2934 #endif 2935 else { 2936 lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock"); 2937 } 2938 2939 #if USE_ITT_BUILD 2940 __kmp_itt_lock_releasing(lck); 2941 #endif /* USE_ITT_BUILD */ 2942 2943 int release_status; 2944 release_status = RELEASE_NESTED_LOCK(lck, gtid); 2945 #if OMPT_SUPPORT && OMPT_OPTIONAL 2946 // This is the case, if called from omp_init_lock_with_hint: 2947 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2948 if (!codeptr) 2949 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2950 if (ompt_enabled.enabled) { 2951 if (release_status == KMP_LOCK_RELEASED) { 2952 if (ompt_enabled.ompt_callback_mutex_released) { 2953 // release_lock_last 2954 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2955 ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr); 2956 } 2957 } else if (ompt_enabled.ompt_callback_nest_lock) { 2958 // release_lock_previous 2959 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 2960 ompt_mutex_scope_end, (ompt_wait_id_t)lck, codeptr); 2961 } 2962 } 2963 #endif 2964 2965 #endif // KMP_USE_DYNAMIC_LOCK 2966 } 2967 2968 /* try to acquire the lock */ 2969 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2970 KMP_COUNT_BLOCK(OMP_test_lock); 2971 2972 #if KMP_USE_DYNAMIC_LOCK 2973 int rc; 2974 int tag = KMP_EXTRACT_D_TAG(user_lock); 2975 #if USE_ITT_BUILD 2976 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); 2977 #endif 2978 #if OMPT_SUPPORT && OMPT_OPTIONAL 2979 // This is the case, if called from omp_init_lock_with_hint: 2980 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2981 if (!codeptr) 2982 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2983 if (ompt_enabled.ompt_callback_mutex_acquire) { 2984 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2985 ompt_mutex_lock, omp_lock_hint_none, 2986 
__ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock, 2987 codeptr); 2988 } 2989 #endif 2990 #if KMP_USE_INLINED_TAS 2991 if (tag == locktag_tas && !__kmp_env_consistency_check) { 2992 KMP_TEST_TAS_LOCK(user_lock, gtid, rc); 2993 } else 2994 #elif KMP_USE_INLINED_FUTEX 2995 if (tag == locktag_futex && !__kmp_env_consistency_check) { 2996 KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc); 2997 } else 2998 #endif 2999 { 3000 rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid); 3001 } 3002 if (rc) { 3003 #if USE_ITT_BUILD 3004 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 3005 #endif 3006 #if OMPT_SUPPORT && OMPT_OPTIONAL 3007 if (ompt_enabled.ompt_callback_mutex_acquired) { 3008 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 3009 ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr); 3010 } 3011 #endif 3012 return FTN_TRUE; 3013 } else { 3014 #if USE_ITT_BUILD 3015 __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock); 3016 #endif 3017 return FTN_FALSE; 3018 } 3019 3020 #else // KMP_USE_DYNAMIC_LOCK 3021 3022 kmp_user_lock_p lck; 3023 int rc; 3024 3025 if ((__kmp_user_lock_kind == lk_tas) && 3026 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 3027 lck = (kmp_user_lock_p)user_lock; 3028 } 3029 #if KMP_USE_FUTEX 3030 else if ((__kmp_user_lock_kind == lk_futex) && 3031 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 3032 lck = (kmp_user_lock_p)user_lock; 3033 } 3034 #endif 3035 else { 3036 lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock"); 3037 } 3038 3039 #if USE_ITT_BUILD 3040 __kmp_itt_lock_acquiring(lck); 3041 #endif /* USE_ITT_BUILD */ 3042 #if OMPT_SUPPORT && OMPT_OPTIONAL 3043 // This is the case, if called from omp_init_lock_with_hint: 3044 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 3045 if (!codeptr) 3046 codeptr = OMPT_GET_RETURN_ADDRESS(0); 3047 if (ompt_enabled.ompt_callback_mutex_acquire) { 3048 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 3049 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), 3050 (ompt_wait_id_t)lck, codeptr); 3051 } 3052 #endif 3053 3054 rc = TEST_LOCK(lck, gtid); 3055 #if USE_ITT_BUILD 3056 if (rc) { 3057 __kmp_itt_lock_acquired(lck); 3058 } else { 3059 __kmp_itt_lock_cancelled(lck); 3060 } 3061 #endif /* USE_ITT_BUILD */ 3062 #if OMPT_SUPPORT && OMPT_OPTIONAL 3063 if (rc && ompt_enabled.ompt_callback_mutex_acquired) { 3064 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 3065 ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr); 3066 } 3067 #endif 3068 3069 return (rc ? 
FTN_TRUE : FTN_FALSE); 3070 3071 /* Can't use serial interval since not block structured */ 3072 3073 #endif // KMP_USE_DYNAMIC_LOCK 3074 } 3075 3076 /* try to acquire the lock */ 3077 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 3078 #if KMP_USE_DYNAMIC_LOCK 3079 int rc; 3080 #if USE_ITT_BUILD 3081 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); 3082 #endif 3083 #if OMPT_SUPPORT && OMPT_OPTIONAL 3084 // This is the case, if called from omp_init_lock_with_hint: 3085 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 3086 if (!codeptr) 3087 codeptr = OMPT_GET_RETURN_ADDRESS(0); 3088 if (ompt_enabled.ompt_callback_mutex_acquire) { 3089 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 3090 ompt_mutex_nest_lock, omp_lock_hint_none, 3091 __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock, 3092 codeptr); 3093 } 3094 #endif 3095 rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid); 3096 #if USE_ITT_BUILD 3097 if (rc) { 3098 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 3099 } else { 3100 __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock); 3101 } 3102 #endif 3103 #if OMPT_SUPPORT && OMPT_OPTIONAL 3104 if (ompt_enabled.enabled && rc) { 3105 if (rc == 1) { 3106 if (ompt_enabled.ompt_callback_mutex_acquired) { 3107 // lock_first 3108 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 3109 ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr); 3110 } 3111 } else { 3112 if (ompt_enabled.ompt_callback_nest_lock) { 3113 // lock_next 3114 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 3115 ompt_scope_begin, (ompt_wait_id_t)user_lock, codeptr); 3116 } 3117 } 3118 } 3119 #endif 3120 return rc; 3121 3122 #else // KMP_USE_DYNAMIC_LOCK 3123 3124 kmp_user_lock_p lck; 3125 int rc; 3126 3127 if ((__kmp_user_lock_kind == lk_tas) && 3128 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 3129 OMP_NEST_LOCK_T_SIZE)) { 3130 lck = (kmp_user_lock_p)user_lock; 3131 } 3132 #if KMP_USE_FUTEX 3133 else if ((__kmp_user_lock_kind == lk_futex) && 3134 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 3135 OMP_NEST_LOCK_T_SIZE)) { 3136 lck = (kmp_user_lock_p)user_lock; 3137 } 3138 #endif 3139 else { 3140 lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock"); 3141 } 3142 3143 #if USE_ITT_BUILD 3144 __kmp_itt_lock_acquiring(lck); 3145 #endif /* USE_ITT_BUILD */ 3146 3147 #if OMPT_SUPPORT && OMPT_OPTIONAL 3148 // This is the case, if called from omp_init_lock_with_hint: 3149 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 3150 if (!codeptr) 3151 codeptr = OMPT_GET_RETURN_ADDRESS(0); 3152 if (ompt_enabled.enabled && 3153 ompt_enabled.ompt_callback_mutex_acquire) { 3154 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 3155 ompt_mutex_nest_lock, omp_lock_hint_none, 3156 __ompt_get_mutex_impl_type(), (ompt_wait_id_t)lck, codeptr); 3157 } 3158 #endif 3159 3160 rc = TEST_NESTED_LOCK(lck, gtid); 3161 #if USE_ITT_BUILD 3162 if (rc) { 3163 __kmp_itt_lock_acquired(lck); 3164 } else { 3165 __kmp_itt_lock_cancelled(lck); 3166 } 3167 #endif /* USE_ITT_BUILD */ 3168 #if OMPT_SUPPORT && OMPT_OPTIONAL 3169 if (ompt_enabled.enabled && rc) { 3170 if (rc == 1) { 3171 if (ompt_enabled.ompt_callback_mutex_acquired) { 3172 // lock_first 3173 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 3174 ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr); 3175 } 3176 } else { 3177 if (ompt_enabled.ompt_callback_nest_lock) { 3178 // lock_next 3179
ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 3180 ompt_mutex_scope_begin, (ompt_wait_id_t)lck, codeptr); 3181 } 3182 } 3183 } 3184 #endif 3185 return rc; 3186 3187 /* Can't use serial interval since not block structured */ 3188 3189 #endif // KMP_USE_DYNAMIC_LOCK 3190 } 3191 3192 // Interface to fast scalable reduce methods routines 3193 3194 // keep the selected method in a thread local structure for cross-function 3195 // usage: will be used in __kmpc_end_reduce* functions; 3196 // another solution: to re-determine the method one more time in 3197 // __kmpc_end_reduce* functions (new prototype required then) 3198 // AT: which solution is better? 3199 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod) \ 3200 ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod)) 3201 3202 #define __KMP_GET_REDUCTION_METHOD(gtid) \ 3203 (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) 3204 3205 // description of the packed_reduction_method variable: look at the macros in 3206 // kmp.h 3207 3208 // used in a critical section reduce block 3209 static __forceinline void 3210 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid, 3211 kmp_critical_name *crit) { 3212 3213 // this lock was visible to a customer and to the threading profile tool as a 3214 // serial overhead span (although it's used for an internal purpose only) 3215 // why was it visible in previous implementation? 3216 // should we keep it visible in new reduce block? 3217 kmp_user_lock_p lck; 3218 3219 #if KMP_USE_DYNAMIC_LOCK 3220 3221 kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit; 3222 // Check if it is initialized. 3223 if (*lk == 0) { 3224 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) { 3225 KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0, 3226 KMP_GET_D_TAG(__kmp_user_lock_seq)); 3227 } else { 3228 __kmp_init_indirect_csptr(crit, loc, global_tid, 3229 KMP_GET_I_TAG(__kmp_user_lock_seq)); 3230 } 3231 } 3232 // Branch for accessing the actual lock object and set operation. This 3233 // branching is inevitable since this lock initialization does not follow the 3234 // normal dispatch path (lock table is not used). 3235 if (KMP_EXTRACT_D_TAG(lk) != 0) { 3236 lck = (kmp_user_lock_p)lk; 3237 KMP_DEBUG_ASSERT(lck != NULL); 3238 if (__kmp_env_consistency_check) { 3239 __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq); 3240 } 3241 KMP_D_LOCK_FUNC(lk, set)(lk, global_tid); 3242 } else { 3243 kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk); 3244 lck = ilk->lock; 3245 KMP_DEBUG_ASSERT(lck != NULL); 3246 if (__kmp_env_consistency_check) { 3247 __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq); 3248 } 3249 KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid); 3250 } 3251 3252 #else // KMP_USE_DYNAMIC_LOCK 3253 3254 // We know that the fast reduction code is only emitted by Intel compilers 3255 // with 32 byte critical sections. If there isn't enough space, then we 3256 // have to use a pointer. 
3257 if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) { 3258 lck = (kmp_user_lock_p)crit; 3259 } else { 3260 lck = __kmp_get_critical_section_ptr(crit, loc, global_tid); 3261 } 3262 KMP_DEBUG_ASSERT(lck != NULL); 3263 3264 if (__kmp_env_consistency_check) 3265 __kmp_push_sync(global_tid, ct_critical, loc, lck); 3266 3267 __kmp_acquire_user_lock_with_checks(lck, global_tid); 3268 3269 #endif // KMP_USE_DYNAMIC_LOCK 3270 } 3271 3272 // used in a critical section reduce block 3273 static __forceinline void 3274 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid, 3275 kmp_critical_name *crit) { 3276 3277 kmp_user_lock_p lck; 3278 3279 #if KMP_USE_DYNAMIC_LOCK 3280 3281 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) { 3282 lck = (kmp_user_lock_p)crit; 3283 if (__kmp_env_consistency_check) 3284 __kmp_pop_sync(global_tid, ct_critical, loc); 3285 KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid); 3286 } else { 3287 kmp_indirect_lock_t *ilk = 3288 (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit)); 3289 if (__kmp_env_consistency_check) 3290 __kmp_pop_sync(global_tid, ct_critical, loc); 3291 KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid); 3292 } 3293 3294 #else // KMP_USE_DYNAMIC_LOCK 3295 3296 // We know that the fast reduction code is only emitted by Intel compilers 3297 // with 32 byte critical sections. If there isn't enough space, then we have 3298 // to use a pointer. 3299 if (__kmp_base_user_lock_size > 32) { 3300 lck = *((kmp_user_lock_p *)crit); 3301 KMP_ASSERT(lck != NULL); 3302 } else { 3303 lck = (kmp_user_lock_p)crit; 3304 } 3305 3306 if (__kmp_env_consistency_check) 3307 __kmp_pop_sync(global_tid, ct_critical, loc); 3308 3309 __kmp_release_user_lock_with_checks(lck, global_tid); 3310 3311 #endif // KMP_USE_DYNAMIC_LOCK 3312 } // __kmp_end_critical_section_reduce_block 3313 3314 #if OMP_40_ENABLED 3315 static __forceinline int 3316 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p, 3317 int *task_state) { 3318 kmp_team_t *team; 3319 3320 // Check if we are inside the teams construct? 3321 if (th->th.th_teams_microtask) { 3322 *team_p = team = th->th.th_team; 3323 if (team->t.t_level == th->th.th_teams_level) { 3324 // This is reduction at teams construct. 3325 KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0 3326 // Let's swap teams temporarily for the reduction. 3327 th->th.th_info.ds.ds_tid = team->t.t_master_tid; 3328 th->th.th_team = team->t.t_parent; 3329 th->th.th_team_nproc = th->th.th_team->t.t_nproc; 3330 th->th.th_task_team = th->th.th_team->t.t_task_team[0]; 3331 *task_state = th->th.th_task_state; 3332 th->th.th_task_state = 0; 3333 3334 return 1; 3335 } 3336 } 3337 return 0; 3338 } 3339 3340 static __forceinline void 3341 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) { 3342 // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction. 3343 th->th.th_info.ds.ds_tid = 0; 3344 th->th.th_team = team; 3345 th->th.th_team_nproc = team->t.t_nproc; 3346 th->th.th_task_team = team->t.t_task_team[task_state]; 3347 th->th.th_task_state = task_state; 3348 } 3349 #endif 3350 3351 /* 2.a.i. Reduce Block without a terminating barrier */ 3352 /*! 
3353 @ingroup SYNCHRONIZATION 3354 @param loc source location information 3355 @param global_tid global thread number 3356 @param num_vars number of items (variables) to be reduced 3357 @param reduce_size size of data in bytes to be reduced 3358 @param reduce_data pointer to data to be reduced 3359 @param reduce_func callback function providing reduction operation on two 3360 operands and returning result of reduction in lhs_data 3361 @param lck pointer to the unique lock data structure 3362 @result 1 for the master thread, 0 for all other team threads, 2 for all team 3363 threads if atomic reduction needed 3364 3365 The nowait version is used for a reduce clause with the nowait argument. 3366 */ 3367 kmp_int32 3368 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, 3369 size_t reduce_size, void *reduce_data, 3370 void (*reduce_func)(void *lhs_data, void *rhs_data), 3371 kmp_critical_name *lck) { 3372 3373 KMP_COUNT_BLOCK(REDUCE_nowait); 3374 int retval = 0; 3375 PACKED_REDUCTION_METHOD_T packed_reduction_method; 3376 #if OMP_40_ENABLED 3377 kmp_info_t *th; 3378 kmp_team_t *team; 3379 int teams_swapped = 0, task_state; 3380 #endif 3381 KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid)); 3382 3383 // why do we need this initialization here at all? 3384 // Reduction clause can not be used as a stand-alone directive. 3385 3386 // do not call __kmp_serial_initialize(), it will be called by 3387 // __kmp_parallel_initialize() if needed 3388 // possible detection of false-positive race by the threadchecker ??? 3389 if (!TCR_4(__kmp_init_parallel)) 3390 __kmp_parallel_initialize(); 3391 3392 #if OMP_50_ENABLED 3393 __kmp_resume_if_soft_paused(); 3394 #endif 3395 3396 // check correctness of reduce block nesting 3397 #if KMP_USE_DYNAMIC_LOCK 3398 if (__kmp_env_consistency_check) 3399 __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0); 3400 #else 3401 if (__kmp_env_consistency_check) 3402 __kmp_push_sync(global_tid, ct_reduce, loc, NULL); 3403 #endif 3404 3405 #if OMP_40_ENABLED 3406 th = __kmp_thread_from_gtid(global_tid); 3407 teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state); 3408 #endif // OMP_40_ENABLED 3409 3410 // packed_reduction_method value will be reused by __kmp_end_reduce* function, 3411 // the value should be kept in a variable 3412 // the variable should be either a construct-specific or thread-specific 3413 // property, not a team specific property 3414 // (a thread can reach the next reduce block on the next construct, reduce 3415 // method may differ on the next construct) 3416 // an ident_t "loc" parameter could be used as a construct-specific property 3417 // (what if loc == 0?) 
3418 // (if both construct-specific and team-specific variables were shared, 3419 // then unnecessary extra syncs would be needed) 3420 // a thread-specific variable is better regarding two issues above (next 3421 // construct and extra syncs) 3422 // a thread-specific "th_local.reduction_method" variable is used currently 3423 // each thread executes 'determine' and 'set' lines (no need to execute by one 3424 // thread, to avoid unnecessary extra syncs) 3425 3426 packed_reduction_method = __kmp_determine_reduction_method( 3427 loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck); 3428 __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method); 3429 3430 if (packed_reduction_method == critical_reduce_block) { 3431 3432 __kmp_enter_critical_section_reduce_block(loc, global_tid, lck); 3433 retval = 1; 3434 3435 } else if (packed_reduction_method == empty_reduce_block) { 3436 3437 // usage: if team size == 1, no synchronization is required ( Intel 3438 // platforms only ) 3439 retval = 1; 3440 3441 } else if (packed_reduction_method == atomic_reduce_block) { 3442 3443 retval = 2; 3444 3445 // all threads should do this pop here (because __kmpc_end_reduce_nowait() 3446 // won't be called by the code gen) 3447 // (it's not quite good, because the checking block has been closed by 3448 // this 'pop', 3449 // but atomic operation has not been executed yet, will be executed 3450 // slightly later, literally on next instruction) 3451 if (__kmp_env_consistency_check) 3452 __kmp_pop_sync(global_tid, ct_reduce, loc); 3453 3454 } else if (TEST_REDUCTION_METHOD(packed_reduction_method, 3455 tree_reduce_block)) { 3456 3457 // AT: performance issue: a real barrier here 3458 // AT: (if master goes slow, other threads are blocked here waiting for the 3459 // master to come and release them) 3460 // AT: (it's not what a customer might expect specifying NOWAIT clause) 3461 // AT: (specifying NOWAIT won't result in improvement of performance, it'll 3462 // be confusing to a customer) 3463 // AT: another implementation of *barrier_gather*nowait() (or some other design) 3464 // might go faster and be more in line with sense of NOWAIT 3465 // AT: TO DO: do epcc test and compare times 3466 3467 // this barrier should be invisible to a customer and to the threading profile 3468 // tool (it's neither a terminating barrier nor customer's code, it's 3469 // used for an internal purpose) 3470 #if OMPT_SUPPORT 3471 // JP: can this barrier potentially lead to task scheduling? 3472 // JP: as long as there is a barrier in the implementation, OMPT should and 3473 // will provide the barrier events 3474 // so we set-up the necessary frame/return addresses. 3475 ompt_frame_t *ompt_frame; 3476 if (ompt_enabled.enabled) { 3477 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); 3478 if (ompt_frame->enter_frame.ptr == NULL) 3479 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 3480 OMPT_STORE_RETURN_ADDRESS(global_tid); 3481 } 3482 #endif 3483 #if USE_ITT_NOTIFY 3484 __kmp_threads[global_tid]->th.th_ident = loc; 3485 #endif 3486 retval = 3487 __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method), 3488 global_tid, FALSE, reduce_size, reduce_data, reduce_func); 3489 retval = (retval != 0) ?
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = ompt_data_none;
    }
#endif

    // all other workers except master should do this pop here
    // (none of the other workers will get to __kmpc_end_reduce_nowait())
    if (__kmp_env_consistency_check) {
      if (retval == 0) {
        __kmp_pop_sync(global_tid, ct_reduce, loc);
      }
    }

  } else {

    // should never reach this block
    KMP_ASSERT(0); // "unexpected method"
  }
#if OMP_40_ENABLED
  if (teams_swapped) {
    __kmp_restore_swapped_teams(th, team, task_state);
  }
#endif
  KA_TRACE(
      10,
      ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
       global_tid, packed_reduction_method, retval));

  return retval;
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid global thread id.
@param lck pointer to the unique lock data structure

Finish the execution of a reduce nowait.
*/
void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
                              kmp_critical_name *lck) {

  PACKED_REDUCTION_METHOD_T packed_reduction_method;

  KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));

  packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);

  if (packed_reduction_method == critical_reduce_block) {

    __kmp_end_critical_section_reduce_block(loc, global_tid, lck);

  } else if (packed_reduction_method == empty_reduce_block) {

    // usage: if team size == 1, no synchronization is required ( on Intel
    // platforms only )

  } else if (packed_reduction_method == atomic_reduce_block) {

    // neither master nor other workers should get here
    // (code gen does not generate this call in case 2: atomic reduce block)
    // actually it would be better to remove this else-if entirely;
    // after removal this value would be checked by the 'else' and would assert

  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                   tree_reduce_block)) {

    // only master gets here

  } else {

    // should never reach this block
    KMP_ASSERT(0); // "unexpected method"
  }

  if (__kmp_env_consistency_check)
    __kmp_pop_sync(global_tid, ct_reduce, loc);

  KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
                global_tid, packed_reduction_method));

  return;
}

/* 2.a.ii. Reduce Block with a terminating barrier */

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid global thread number
@param num_vars number of items (variables) to be reduced
@param reduce_size size of data in bytes to be reduced
@param reduce_data pointer to data to be reduced
@param reduce_func callback function providing reduction operation on two
operands and returning result of reduction in lhs_data
@param lck pointer to the unique lock data structure
@result 1 for the master thread, 0 for all other team threads, 2 for all team
threads if atomic reduction needed

A blocking reduce that includes an implicit barrier.
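
As a purely illustrative sketch (not code generated by this file), compiler
output that consumes the return value described above, e.g. for
<tt>reduction(+:sum)</tt>, could take roughly the following shape. The names
<tt>priv_sum</tt>, <tt>shared_sum</tt> and <tt>add_fn</tt> are placeholders,
and whether the atomic path also calls <tt>__kmpc_end_reduce()</tt> is left to
the code generator. The nowait entry point above follows the same return-value
convention.

    kmp_int32 ret = __kmpc_reduce(loc, gtid, 1, sizeof(priv_sum), &priv_sum,
                                  add_fn, &crit);
    if (ret == 1) {        // master: combine serially, then finish
      shared_sum += priv_sum;
      __kmpc_end_reduce(loc, gtid, &crit);
    } else if (ret == 2) { // every thread: combine atomically
      // atomic update of shared_sum with priv_sum (e.g. an atomic RMW)
    }
    // ret == 0: this thread's value was already combined by the tree reduction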
*/
kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
                        size_t reduce_size, void *reduce_data,
                        void (*reduce_func)(void *lhs_data, void *rhs_data),
                        kmp_critical_name *lck) {
  KMP_COUNT_BLOCK(REDUCE_wait);
  int retval = 0;
  PACKED_REDUCTION_METHOD_T packed_reduction_method;
#if OMP_40_ENABLED
  kmp_info_t *th;
  kmp_team_t *team;
  int teams_swapped = 0, task_state;
#endif

  KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));

  // why do we need this initialization here at all?
  // Reduction clause can not be a stand-alone directive.

  // do not call __kmp_serial_initialize(), it will be called by
  // __kmp_parallel_initialize() if needed
  // possible detection of false-positive race by the threadchecker ???
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

#if OMP_50_ENABLED
  __kmp_resume_if_soft_paused();
#endif

// check correctness of reduce block nesting
#if KMP_USE_DYNAMIC_LOCK
  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
#else
  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
#endif

#if OMP_40_ENABLED
  th = __kmp_thread_from_gtid(global_tid);
  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
#endif // OMP_40_ENABLED

  packed_reduction_method = __kmp_determine_reduction_method(
      loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
  __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);

  if (packed_reduction_method == critical_reduce_block) {

    __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
    retval = 1;

  } else if (packed_reduction_method == empty_reduce_block) {

    // usage: if team size == 1, no synchronization is required ( Intel
    // platforms only )
    retval = 1;

  } else if (packed_reduction_method == atomic_reduce_block) {

    retval = 2;

  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                   tree_reduce_block)) {

    // case tree_reduce_block:
    // this barrier should be visible to a customer and to the threading profile
    // tool (it's a terminating barrier on constructs if NOWAIT not specified)
#if OMPT_SUPPORT
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame.ptr == NULL)
        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident =
        loc; // needed for correct notification of frames
#endif
    retval =
        __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
                      global_tid, TRUE, reduce_size, reduce_data, reduce_func);
    retval = (retval != 0) ? (0) : (1);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = ompt_data_none;
    }
#endif

    // all other workers except master should do this pop here
    // (none of the other workers except master will enter __kmpc_end_reduce())
    if (__kmp_env_consistency_check) {
      if (retval == 0) { // 0: all other workers; 1: master
        __kmp_pop_sync(global_tid, ct_reduce, loc);
      }
    }

  } else {

    // should never reach this block
    KMP_ASSERT(0); // "unexpected method"
  }
#if OMP_40_ENABLED
  if (teams_swapped) {
    __kmp_restore_swapped_teams(th, team, task_state);
  }
#endif

  KA_TRACE(10,
           ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
            global_tid, packed_reduction_method, retval));

  return retval;
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid global thread id.
@param lck pointer to the unique lock data structure

Finish the execution of a blocking reduce.
The <tt>lck</tt> pointer must be the same as that used in the corresponding
start function.
*/
void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
                       kmp_critical_name *lck) {

  PACKED_REDUCTION_METHOD_T packed_reduction_method;
#if OMP_40_ENABLED
  kmp_info_t *th;
  kmp_team_t *team;
  int teams_swapped = 0, task_state;
#endif

  KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));

#if OMP_40_ENABLED
  th = __kmp_thread_from_gtid(global_tid);
  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
#endif // OMP_40_ENABLED

  packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);

  // this barrier should be visible to a customer and to the threading profile
  // tool (it's a terminating barrier on constructs if NOWAIT not specified)

  if (packed_reduction_method == critical_reduce_block) {

    __kmp_end_critical_section_reduce_block(loc, global_tid, lck);

// TODO: implicit barrier: should be exposed
#if OMPT_SUPPORT
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame.ptr == NULL)
        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = ompt_data_none;
    }
#endif

  } else if (packed_reduction_method == empty_reduce_block) {

    // usage: if team size == 1, no synchronization is required (Intel
    // platforms only)

// TODO: implicit barrier: should be exposed
#if OMPT_SUPPORT
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame.ptr == NULL)
        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = ompt_data_none;
    }
#endif

  } else if (packed_reduction_method == atomic_reduce_block) {

#if OMPT_SUPPORT
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame.ptr == NULL)
        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
// TODO: implicit barrier: should be exposed
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = ompt_data_none;
    }
#endif

  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                   tree_reduce_block)) {

    // only master executes here (master releases all other workers)
    __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
                            global_tid);

  } else {

    // should never reach this block
    KMP_ASSERT(0); // "unexpected method"
  }
#if OMP_40_ENABLED
  if (teams_swapped) {
    __kmp_restore_swapped_teams(th, team, task_state);
  }
#endif

  if (__kmp_env_consistency_check)
    __kmp_pop_sync(global_tid, ct_reduce, loc);

  KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
                global_tid, packed_reduction_method));

  return;
}

#undef __KMP_GET_REDUCTION_METHOD
#undef __KMP_SET_REDUCTION_METHOD

/* end of interface to fast scalable reduce routines */

kmp_uint64 __kmpc_get_taskid() {

  kmp_int32 gtid;
  kmp_info_t *thread;

  gtid = __kmp_get_gtid();
  if (gtid < 0) {
    return 0;
  }
  thread = __kmp_thread_from_gtid(gtid);
  return thread->th.th_current_task->td_task_id;

} // __kmpc_get_taskid

kmp_uint64 __kmpc_get_parent_taskid() {

  kmp_int32 gtid;
  kmp_info_t *thread;
  kmp_taskdata_t *parent_task;

  gtid = __kmp_get_gtid();
  if (gtid < 0) {
    return 0;
  }
  thread = __kmp_thread_from_gtid(gtid);
  parent_task = thread->th.th_current_task->td_parent;
  return (parent_task == NULL ? 0 : parent_task->td_task_id);

} // __kmpc_get_parent_taskid

#if OMP_45_ENABLED
/*!
@ingroup WORK_SHARING
@param loc source location information.
@param gtid global thread number.
@param num_dims number of associated doacross loops.
@param dims info on loops bounds.

Initialize doacross loop information.
Expect the compiler to send us inclusive bounds,
e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
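
As an illustrative sketch only (placeholder names; the exact lowering is up to
the code generator, not this file), a compiler translating

    #pragma omp for ordered(1)
    for (i = 2; i < 9; i += 2) {
      #pragma omp ordered depend(sink: i - 2)
      // ...
      #pragma omp ordered depend(source)
    }

might emit, in each thread, a sequence along these lines:

    struct kmp_dim dim = {2, 8, 2}; // lo, up, st (assuming that field order)
    __kmpc_doacross_init(loc, gtid, 1, &dim);
    // for every iteration i assigned to this thread:
    kmp_int64 vec = i - 2;                 // iteration named by depend(sink)
    __kmpc_doacross_wait(loc, gtid, &vec); // block until it has been posted
    // ... ordered region body ...
    vec = i;
    __kmpc_doacross_post(loc, gtid, &vec); // publish depend(source)
    // after the loop:
    __kmpc_doacross_fini(loc, gtid);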
*/
void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
                          const struct kmp_dim *dims) {
  int j, idx;
  kmp_int64 last, trace_count;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_uint32 *flags;
  kmp_disp_t *pr_buf = th->th.th_dispatch;
  dispatch_shared_info_t *sh_buf;

  KA_TRACE(
      20,
      ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
       gtid, num_dims, !team->t.t_serialized));
  KMP_DEBUG_ASSERT(dims != NULL);
  KMP_DEBUG_ASSERT(num_dims > 0);

  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
    return; // no dependencies if team is serialized
  }
  KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
  idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
                                       // the next loop
  sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];

  // Save bounds info into allocated private buffer
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
  pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
      th, sizeof(kmp_int64) * (4 * num_dims + 1));
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
  pr_buf->th_doacross_info[0] =
      (kmp_int64)num_dims; // first element is number of dimensions
  // Save also address of num_done in order to access it later without knowing
  // the buffer index
  pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
  pr_buf->th_doacross_info[2] = dims[0].lo;
  pr_buf->th_doacross_info[3] = dims[0].up;
  pr_buf->th_doacross_info[4] = dims[0].st;
  last = 5;
  for (j = 1; j < num_dims; ++j) {
    kmp_int64
        range_length; // To keep ranges of all dimensions but the first dims[0]
    if (dims[j].st == 1) { // most common case
      // AC: should we care about ranges bigger than LLONG_MAX? (not for now)
      range_length = dims[j].up - dims[j].lo + 1;
    } else {
      if (dims[j].st > 0) {
        KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
        range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
      } else { // negative increment
        KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
        range_length =
            (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
      }
    }
    pr_buf->th_doacross_info[last++] = range_length;
    pr_buf->th_doacross_info[last++] = dims[j].lo;
    pr_buf->th_doacross_info[last++] = dims[j].up;
    pr_buf->th_doacross_info[last++] = dims[j].st;
  }

  // Compute total trip count.
  // Start with range of dims[0] which we don't need to keep in the buffer.
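  // Worked example (a sketch using the inclusive bounds from the header
  // comment, i.e. lo=2, up=8, st=2): the first dimension contributes
  // (8 - 2) / 2 + 1 = 4 iterations to trace_count.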
  if (dims[0].st == 1) { // most common case
    trace_count = dims[0].up - dims[0].lo + 1;
  } else if (dims[0].st > 0) {
    KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
    trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
  } else { // negative increment
    KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
    trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
  }
  for (j = 1; j < num_dims; ++j) {
    trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
  }
  KMP_DEBUG_ASSERT(trace_count > 0);

  // Check that the shared buffer is not still occupied by a previous loop
  // (one whose buffer index would be idx - __kmp_dispatch_num_buffers)
  if (idx != sh_buf->doacross_buf_idx) {
    // Shared buffer is occupied, wait for it to be free
    __kmp_wait_yield_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
                       __kmp_eq_4, NULL);
  }
#if KMP_32_BIT_ARCH
  // Check if we are the first thread. After the CAS the first thread gets 0,
  // others get 1 if initialization is in progress, allocated pointer otherwise.
  // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
      (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
#else
  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
      (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
#endif
  if (flags == NULL) {
    // we are the first thread, allocate the array of flags
    size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
    flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
    KMP_MB();
    sh_buf->doacross_flags = flags;
  } else if (flags == (kmp_uint32 *)1) {
#if KMP_32_BIT_ARCH
    // initialization is still in progress, need to wait
    while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
#else
    while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
#endif
      KMP_YIELD(TRUE);
    KMP_MB();
  } else {
    KMP_MB();
  }
  KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
  pr_buf->th_doacross_flags =
      sh_buf->doacross_flags; // save private copy in order to not
                              // touch shared buffer on each iteration
  KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
}

void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
  kmp_int32 shft, num_dims, i;
  kmp_uint32 flag;
  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_disp_t *pr_buf;
  kmp_int64 lo, up, st;

  KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
    return; // no dependencies if team is serialized
  }

  // calculate sequential iteration number and check out-of-bounds condition
  pr_buf = th->th.th_dispatch;
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
  num_dims = pr_buf->th_doacross_info[0];
  lo = pr_buf->th_doacross_info[2];
  up = pr_buf->th_doacross_info[3];
  st = pr_buf->th_doacross_info[4];
  if (st == 1) { // most common case
    if (vec[0] < lo || vec[0] > up) {
      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                    "bounds [%lld,%lld]\n",
                    gtid, vec[0], lo, up));
      return;
    }
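    // With a unit stride, the zero-based iteration number in dimension 0 is
    // simply the distance of vec[0] from the lower bound.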
    iter_number = vec[0] - lo;
  } else if (st > 0) {
    if (vec[0] < lo || vec[0] > up) {
      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                    "bounds [%lld,%lld]\n",
                    gtid, vec[0], lo, up));
      return;
    }
    iter_number = (kmp_uint64)(vec[0] - lo) / st;
  } else { // negative increment
    if (vec[0] > lo || vec[0] < up) {
      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                    "bounds [%lld,%lld]\n",
                    gtid, vec[0], lo, up));
      return;
    }
    iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
  }
  for (i = 1; i < num_dims; ++i) {
    kmp_int64 iter, ln;
    kmp_int32 j = i * 4;
    ln = pr_buf->th_doacross_info[j + 1];
    lo = pr_buf->th_doacross_info[j + 2];
    up = pr_buf->th_doacross_info[j + 3];
    st = pr_buf->th_doacross_info[j + 4];
    if (st == 1) {
      if (vec[i] < lo || vec[i] > up) {
        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                      "bounds [%lld,%lld]\n",
                      gtid, vec[i], lo, up));
        return;
      }
      iter = vec[i] - lo;
    } else if (st > 0) {
      if (vec[i] < lo || vec[i] > up) {
        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                      "bounds [%lld,%lld]\n",
                      gtid, vec[i], lo, up));
        return;
      }
      iter = (kmp_uint64)(vec[i] - lo) / st;
    } else { // st < 0
      if (vec[i] > lo || vec[i] < up) {
        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                      "bounds [%lld,%lld]\n",
                      gtid, vec[i], lo, up));
        return;
      }
      iter = (kmp_uint64)(lo - vec[i]) / (-st);
    }
    iter_number = iter + ln * iter_number;
  }
  shft = iter_number % 32; // use 32-bit granularity
  iter_number >>= 5; // divided by 32
  flag = 1 << shft;
  while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
    KMP_YIELD(TRUE);
  }
  KMP_MB();
  KA_TRACE(20,
           ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
            gtid, (iter_number << 5) + shft));
}

void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
  kmp_int32 shft, num_dims, i;
  kmp_uint32 flag;
  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_disp_t *pr_buf;
  kmp_int64 lo, st;

  KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
    return; // no dependencies if team is serialized
  }

  // calculate sequential iteration number (same as in "wait" but no
  // out-of-bounds checks)
  pr_buf = th->th.th_dispatch;
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
  num_dims = pr_buf->th_doacross_info[0];
  lo = pr_buf->th_doacross_info[2];
  st = pr_buf->th_doacross_info[4];
  if (st == 1) { // most common case
    iter_number = vec[0] - lo;
  } else if (st > 0) {
    iter_number = (kmp_uint64)(vec[0] - lo) / st;
  } else { // negative increment
    iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
  }
  for (i = 1; i < num_dims; ++i) {
    kmp_int64 iter, ln;
    kmp_int32 j = i * 4;
    ln = pr_buf->th_doacross_info[j + 1];
    lo = pr_buf->th_doacross_info[j + 2];
    st = pr_buf->th_doacross_info[j + 4];
    if (st == 1) {
      iter = vec[i] - lo;
    } else if (st > 0) {
      iter = (kmp_uint64)(vec[i] - lo) / st;
    } else { // st < 0
      iter = (kmp_uint64)(lo - vec[i]) / (-st);
    }
    iter_number = iter + ln * iter_number;
  }
  shft = iter_number % 32; // use 32-bit granularity
  iter_number >>= 5; // divided by 32
  flag = 1 << shft;
  KMP_MB();
  if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
    KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
  KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
                (iter_number << 5) + shft));
}

void __kmpc_doacross_fini(ident_t *loc, int gtid) {
  kmp_int32 num_done;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_disp_t *pr_buf = th->th.th_dispatch;

  KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
    return; // nothing to do
  }
  num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
  if (num_done == th->th.th_team_nproc) {
    // we are the last thread, need to free shared resources
    int idx = pr_buf->th_doacross_buf_idx - 1;
    dispatch_shared_info_t *sh_buf =
        &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
                     (kmp_int64)&sh_buf->doacross_num_done);
    KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
    KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
    __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
    sh_buf->doacross_flags = NULL;
    sh_buf->doacross_num_done = 0;
    sh_buf->doacross_buf_idx +=
        __kmp_dispatch_num_buffers; // free buffer for future re-use
  }
  // free private resources (need to keep buffer index forever)
  pr_buf->th_doacross_flags = NULL;
  __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
  pr_buf->th_doacross_info = NULL;
  KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
}
#endif

#if OMP_50_ENABLED
int __kmpc_get_target_offload(void) {
  if (!__kmp_init_serial) {
    __kmp_serial_initialize();
  }
  return __kmp_target_offload;
}

int __kmpc_pause_resource(kmp_pause_status_t level) {
  if (!__kmp_init_serial) {
    return 1; // Can't pause if runtime is not initialized
  }
  return __kmp_pause_resource(level);
}
#endif // OMP_50_ENABLED

// end of file //