/*
 * kmp_csupport.cpp -- kfront linkage support for OpenMP.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#define __KMP_IMP
#include "omp.h" /* extern "C" declarations of user-visible routines */
#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_lock.h"
#include "kmp_stats.h"
#include "ompt-specific.h"

#define MAX_MESSAGE 512

// flags will be used in future, e.g. to implement openmp_strict library
// restrictions

/*!
 * @ingroup STARTUP_SHUTDOWN
 * @param loc   in   source location information
 * @param flags in   for future use (currently ignored)
 *
 * Initialize the runtime library. This call is optional; if it is not made then
 * it will be implicitly called by attempts to use other library functions.
 */
void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
  // By default __kmpc_begin() is no-op.
  char *env;
  if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
      __kmp_str_match_true(env)) {
    __kmp_middle_initialize();
    KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
  } else if (__kmp_ignore_mppbeg() == FALSE) {
    // By default __kmp_ignore_mppbeg() returns TRUE.
    __kmp_internal_begin();
    KC_TRACE(10, ("__kmpc_begin: called\n"));
  }
}

/*!
 * @ingroup STARTUP_SHUTDOWN
 * @param loc source location information
 *
 * Shutdown the runtime library. This is also optional, and even if called will
 * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
 * zero.
 */
void __kmpc_end(ident_t *loc) {
  // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
  // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
  // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
  // returns FALSE and __kmpc_end() will unregister this root (it can cause
  // library shut down).
  if (__kmp_ignore_mppend() == FALSE) {
    KC_TRACE(10, ("__kmpc_end: called\n"));
    KA_TRACE(30, ("__kmpc_end\n"));

    __kmp_internal_end_thread(-1);
  }
#if KMP_OS_WINDOWS && OMPT_SUPPORT
  // Normal exit process on Windows does not allow worker threads of the final
  // parallel region to finish reporting their events, so shutting down the
  // library here fixes the issue at least for the cases where __kmpc_end() is
  // placed properly.
  if (ompt_enabled.enabled)
    __kmp_internal_end_library(__kmp_gtid_get_specific());
#endif
}

/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return The global thread index of the active thread.

This function can be called in any context.

If the runtime has only been entered at the outermost level from a
single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
that which would be returned by omp_get_thread_num() in the outermost
active parallel construct. (Or zero if there is no active parallel
construct, since the master thread is necessarily thread zero).

If multiple non-OpenMP threads all enter an OpenMP construct then this
will be a unique thread identifier among all the threads created by
the OpenMP runtime (but the value cannot be defined in terms of
OpenMP thread ids returned by omp_get_thread_num()).
*/
kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
  kmp_int32 gtid = __kmp_entry_gtid();

  KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));

  return gtid;
}

/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return The number of threads under control of the OpenMP<sup>*</sup> runtime

This function can be called in any context.
It returns the total number of threads under the control of the OpenMP runtime.
That is not a number that can be determined by any OpenMP standard calls, since
the library may be called from more than one non-OpenMP thread, and this
reflects the total over all such calls. Similarly, the runtime maintains
underlying threads even when they are not active (since the cost of creating
and destroying OS threads is high); this call counts all such threads even if
they are not waiting for work.
*/
kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
  KC_TRACE(10,
           ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));

  return TCR_4(__kmp_all_nth);
}

/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return The thread number of the calling thread in the innermost active parallel
construct.
*/
kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
  KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
  return __kmp_tid_from_gtid(__kmp_entry_gtid());
}

/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return The number of threads in the innermost active parallel construct.
*/
kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
  KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));

  return __kmp_entry_thread()->th.th_team->t.t_nproc;
}

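// Illustrative sketch (editor's addition, not in the original source): inside
// an outlined parallel-region body the "bound" entry points mirror the
// user-level API, e.g.
//   kmp_int32 tid  = __kmpc_bound_thread_num(&loc);  // == omp_get_thread_num()
//   kmp_int32 nthr = __kmpc_bound_num_threads(&loc); // == omp_get_num_threads()
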
/*!
 * @ingroup DEPRECATED
 * @param loc location description
 *
 * This function need not be called. It always returns TRUE.
 */
kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
#ifndef KMP_DEBUG

  return TRUE;

#else

  const char *semi2;
  const char *semi3;
  int line_no;

  if (__kmp_par_range == 0) {
    return TRUE;
  }
  semi2 = loc->psource;
  if (semi2 == NULL) {
    return TRUE;
  }
  semi2 = strchr(semi2, ';');
  if (semi2 == NULL) {
    return TRUE;
  }
  semi2 = strchr(semi2 + 1, ';');
  if (semi2 == NULL) {
    return TRUE;
  }
  if (__kmp_par_range_filename[0]) {
    const char *name = semi2 - 1;
    while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
      name--;
    }
    if ((*name == '/') || (*name == ';')) {
      name++;
    }
    if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
      return __kmp_par_range < 0;
    }
  }
  semi3 = strchr(semi2 + 1, ';');
  if (__kmp_par_range_routine[0]) {
    if ((semi3 != NULL) && (semi3 > semi2) &&
        (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
      return __kmp_par_range < 0;
    }
  }
  if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
    if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
      return __kmp_par_range > 0;
    }
    return __kmp_par_range < 0;
  }
  return TRUE;

#endif /* KMP_DEBUG */
}

/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return 1 if this thread is executing inside an active parallel region, zero if
not.
*/
kmp_int32 __kmpc_in_parallel(ident_t *loc) {
  return __kmp_entry_thread()->th.th_root->r.r_active;
}

/*!
@ingroup PARALLEL
@param loc source location information
@param global_tid global thread number
@param num_threads number of threads requested for this parallel construct

Set the number of threads to be used by the next fork spawned by this thread.
This call is only required if the parallel construct has a `num_threads` clause.
*/
void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
                             kmp_int32 num_threads) {
  KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
                global_tid, num_threads));
  __kmp_assert_valid_gtid(global_tid);
  __kmp_push_num_threads(loc, global_tid, num_threads);
}

void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
  KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
  /* the num_threads are automatically popped */
}

void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
                           kmp_int32 proc_bind) {
  KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
                proc_bind));
  __kmp_assert_valid_gtid(global_tid);
  __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
}

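// Illustrative sketch (editor's addition, not in the original source): a
// compiler lowering
//   #pragma omp parallel num_threads(4) proc_bind(close)
// typically pushes the clause values immediately before the fork, e.g.
//   kmp_int32 gtid = __kmpc_global_thread_num(&loc);
//   __kmpc_push_num_threads(&loc, gtid, 4);
//   __kmpc_push_proc_bind(&loc, gtid, proc_bind_close); // enum value illustrative
//   __kmpc_fork_call(&loc, /*argc=*/0, (kmpc_micro)outlined_fn);
// where `outlined_fn` is a hypothetical outlined parallel-region body.
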
/*!
@ingroup PARALLEL
@param loc source location information
@param argc total number of arguments in the ellipsis
@param microtask pointer to callback routine consisting of outlined parallel
construct
@param ... pointers to shared variables that aren't global

Do the actual fork and call the microtask in the relevant number of threads.
*/
void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
  int gtid = __kmp_entry_gtid();

#if (KMP_STATS_ENABLED)
  // If we were in a serial region, then stop the serial timer, record
  // the event, and start parallel region timer
  stats_state_e previous_state = KMP_GET_THREAD_STATE();
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
  } else {
    KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
  }
  int inParallel = __kmpc_in_parallel(loc);
  if (inParallel) {
    KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
  } else {
    KMP_COUNT_BLOCK(OMP_PARALLEL);
  }
#endif

  // maybe to save thr_state is enough here
  {
    va_list ap;
    va_start(ap, microtask);

#if OMPT_SUPPORT
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      kmp_info_t *master_th = __kmp_threads[gtid];
      kmp_team_t *parent_team = master_th->th.th_team;
      ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
      if (lwt)
        ompt_frame = &(lwt->ompt_task_info.frame);
      else {
        int tid = __kmp_tid_from_gtid(gtid);
        ompt_frame = &(
            parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
      }
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      OMPT_STORE_RETURN_ADDRESS(gtid);
    }
#endif

#if INCLUDE_SSC_MARKS
    SSC_MARK_FORKING();
#endif
    __kmp_fork_call(loc, gtid, fork_context_intel, argc,
                    VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
                    VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
/* TODO: revert workaround for Intel(R) 64 tracker #96 */
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
                    &ap
#else
                    ap
#endif
                    );
#if INCLUDE_SSC_MARKS
    SSC_MARK_JOINING();
#endif
    __kmp_join_call(loc, gtid
#if OMPT_SUPPORT
                    ,
                    fork_context_intel
#endif
                    );

    va_end(ap);
  }

#if KMP_STATS_ENABLED
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
  } else {
    KMP_POP_PARTITIONED_TIMER();
  }
#endif // KMP_STATS_ENABLED
}

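// Illustrative sketch (editor's addition, not in the original source): for
//   int x;
//   #pragma omp parallel shared(x)
//   { /* body */ }
// a compiler typically outlines the body as
//   void outlined_fn(kmp_int32 *gtid, kmp_int32 *bound_tid, int *x);
// and replaces the construct with
//   __kmpc_fork_call(&loc, /*argc=*/1, (kmpc_micro)outlined_fn, &x);
// The runtime then invokes `outlined_fn` once in every thread of the new team.
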
/*!
@ingroup PARALLEL
@param loc source location information
@param global_tid global thread number
@param num_teams number of teams requested for the teams construct
@param num_threads number of threads per team requested for the teams construct

Set the number of teams to be used by the teams construct.
This call is only required if the teams construct has a `num_teams` clause
or a `thread_limit` clause (or both).
*/
void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
                           kmp_int32 num_teams, kmp_int32 num_threads) {
  KA_TRACE(20,
           ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
            global_tid, num_teams, num_threads));
  __kmp_assert_valid_gtid(global_tid);
  __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
}

/*!
@ingroup PARALLEL
@param loc source location information
@param argc total number of arguments in the ellipsis
@param microtask pointer to callback routine consisting of outlined teams
construct
@param ... pointers to shared variables that aren't global

Do the actual fork and call the microtask in the relevant number of threads.
*/
void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
                       ...) {
  int gtid = __kmp_entry_gtid();
  kmp_info_t *this_thr = __kmp_threads[gtid];
  va_list ap;
  va_start(ap, microtask);

#if KMP_STATS_ENABLED
  KMP_COUNT_BLOCK(OMP_TEAMS);
  stats_state_e previous_state = KMP_GET_THREAD_STATE();
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
  } else {
    KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
  }
#endif

  // remember teams entry point and nesting level
  this_thr->th.th_teams_microtask = microtask;
  this_thr->th.th_teams_level =
      this_thr->th.th_team->t.t_level; // AC: can be >0 on host

#if OMPT_SUPPORT
  kmp_team_t *parent_team = this_thr->th.th_team;
  int tid = __kmp_tid_from_gtid(gtid);
  if (ompt_enabled.enabled) {
    parent_team->t.t_implicit_task_taskdata[tid]
        .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
  }
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif

  // check if __kmpc_push_num_teams called, set default number of teams
  // otherwise
  if (this_thr->th.th_teams_size.nteams == 0) {
    __kmp_push_num_teams(loc, gtid, 0, 0);
  }
  KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);

  __kmp_fork_call(loc, gtid, fork_context_intel, argc,
                  VOLATILE_CAST(microtask_t)
                      __kmp_teams_master, // "wrapped" task
                  VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
                  &ap
#else
                  ap
#endif
                  );
  __kmp_join_call(loc, gtid
#if OMPT_SUPPORT
                  ,
                  fork_context_intel
#endif
                  );

  // Pop current CG root off list
  KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
  kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
  this_thr->th.th_cg_roots = tmp->up;
  KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
                 " to node %p. cg_nthreads was %d\n",
                 this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
  KMP_DEBUG_ASSERT(tmp->cg_nthreads);
  int i = tmp->cg_nthreads--;
  if (i == 1) { // check if we are the last thread in CG (not always the case)
    __kmp_free(tmp);
  }
  // Restore current task's thread_limit from CG root
  KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
  this_thr->th.th_current_task->td_icvs.thread_limit =
      this_thr->th.th_cg_roots->cg_thread_limit;

  this_thr->th.th_teams_microtask = NULL;
  this_thr->th.th_teams_level = 0;
  *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
  va_end(ap);
#if KMP_STATS_ENABLED
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
  } else {
    KMP_POP_PARTITIONED_TIMER();
  }
#endif // KMP_STATS_ENABLED
}

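// Illustrative sketch (editor's addition, not in the original source): for
//   #pragma omp teams num_teams(4) thread_limit(8)
// a compiler typically emits
//   __kmpc_push_num_teams(&loc, gtid, 4, 8);
//   __kmpc_fork_teams(&loc, /*argc=*/1, (kmpc_micro)teams_outlined_fn, &x);
// where `teams_outlined_fn` is a hypothetical outlined teams-region body.
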
// I don't think this function should ever have been exported.
// The __kmpc_ prefix was misapplied. I'm fairly certain that no generated
// openmp code ever called it, but it's been exported from the RTL for so
// long that I'm afraid to remove the definition.
int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }

/*!
@ingroup PARALLEL
@param loc source location information
@param global_tid global thread number

Enter a serialized parallel construct. This interface is used to handle a
conditional parallel region, like this,
@code
#pragma omp parallel if (condition)
@endcode
when the condition is false.
*/
void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  // The implementation is now in kmp_runtime.cpp so that it can share static
  // functions with kmp_fork_call since the tasks to be done are similar in
  // each case.
  __kmp_assert_valid_gtid(global_tid);
#if OMPT_SUPPORT
  OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif
  __kmp_serialized_parallel(loc, global_tid);
}

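// Illustrative sketch (editor's addition, not in the original source): when
// the if-clause condition is false, the encountering thread runs the outlined
// body itself between a serialized begin/end pair, e.g.
//   if (cond) {
//     __kmpc_fork_call(&loc, 1, (kmpc_micro)outlined_fn, &x);
//   } else {
//     kmp_int32 bound_tid = 0;
//     __kmpc_serialized_parallel(&loc, gtid);
//     outlined_fn(&gtid, &bound_tid, &x);
//     __kmpc_end_serialized_parallel(&loc, gtid);
//   }
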
/*!
@ingroup PARALLEL
@param loc source location information
@param global_tid global thread number

Leave a serialized parallel construct.
*/
void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  kmp_internal_control_t *top;
  kmp_info_t *this_thr;
  kmp_team_t *serial_team;

  KC_TRACE(10,
           ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));

  /* skip all this code for autopar serialized loops since it results in
     unacceptable overhead */
  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
    return;

  // Not autopar code
  __kmp_assert_valid_gtid(global_tid);
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

  this_thr = __kmp_threads[global_tid];
  serial_team = this_thr->th.th_serial_team;

  kmp_task_team_t *task_team = this_thr->th.th_task_team;
  // we need to wait for the proxy tasks before finishing the thread
  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
    __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));

  KMP_MB();
  KMP_DEBUG_ASSERT(serial_team);
  KMP_ASSERT(serial_team->t.t_serialized);
  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
  KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);

#if OMPT_SUPPORT
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
          OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
    }

    // clear the task id only after unlinking the task
    ompt_data_t *parent_task_data;
    __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);

    if (ompt_enabled.ompt_callback_parallel_end) {
      ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
          &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
          ompt_parallel_invoker_program | ompt_parallel_team,
          OMPT_LOAD_RETURN_ADDRESS(global_tid));
    }
    __ompt_lw_taskteam_unlink(this_thr);
    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
  }
#endif

  /* If necessary, pop the internal control stack values and replace the team
   * values */
  top = serial_team->t.t_control_stack_top;
  if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
    copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
    serial_team->t.t_control_stack_top = top->next;
    __kmp_free(top);
  }

  // if( serial_team -> t.t_serialized > 1 )
  serial_team->t.t_level--;

  /* pop dispatch buffers stack */
  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
  {
    dispatch_private_info_t *disp_buffer =
        serial_team->t.t_dispatch->th_disp_buffer;
    serial_team->t.t_dispatch->th_disp_buffer =
        serial_team->t.t_dispatch->th_disp_buffer->next;
    __kmp_free(disp_buffer);
  }
  this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore

  --serial_team->t.t_serialized;
  if (serial_team->t.t_serialized == 0) {

    /* return to the parallel section */

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
    if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
      __kmp_clear_x87_fpu_status_word();
      __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
      __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
    }
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

    this_thr->th.th_team = serial_team->t.t_parent;
    this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;

    /* restore values cached in the thread */
    this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /* JPH */
    this_thr->th.th_team_master =
        serial_team->t.t_parent->t.t_threads[0]; /* JPH */
    this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;

    /* TODO the below shouldn't need to be adjusted for serialized teams */
    this_thr->th.th_dispatch =
        &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];

    __kmp_pop_current_task_from_thread(this_thr);

    KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
    this_thr->th.th_current_task->td_flags.executing = 1;

    if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Copy the task team from the new child / old parent team to the thread.
      this_thr->th.th_task_team =
          this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
      KA_TRACE(20,
               ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
                "team %p\n",
                global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
    }
  } else {
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
                    "depth of serial team %p to %d\n",
                    global_tid, serial_team, serial_team->t.t_serialized));
    }
  }

  if (__kmp_env_consistency_check)
    __kmp_pop_parallel(global_tid, NULL);
#if OMPT_SUPPORT
  if (ompt_enabled.enabled)
    this_thr->th.ompt_thread_info.state =
        ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
                                           : ompt_state_work_parallel);
#endif
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information.

Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
depending on the memory ordering convention obeyed by the compiler
even that may not be necessary).
*/
void __kmpc_flush(ident_t *loc) {
  KC_TRACE(10, ("__kmpc_flush: called\n"));

  /* need explicit __mf() here since use volatile instead in library */
  KMP_MB(); /* Flush all pending memory write invalidates. */

#if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
#if KMP_MIC
// fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
// We shouldn't need it, though, since the ABI rules require that
// * If the compiler generates NGO stores it also generates the fence
// * If users hand-code NGO stores they should insert the fence
// therefore no incomplete unordered stores should be visible.
#else
  // C74404
  // This is to address non-temporal store instructions (sfence needed).
  // The clflush instruction is addressed as well (mfence needed).
  // Probably the non-temporal load movntdqa instruction should also be
  // addressed.
  // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
  if (!__kmp_cpuinfo.initialized) {
    __kmp_query_cpuid(&__kmp_cpuinfo);
  }
  if (!__kmp_cpuinfo.sse2) {
    // CPU cannot execute SSE2 instructions.
  } else {
#if KMP_COMPILER_ICC
    _mm_mfence();
#elif KMP_COMPILER_MSVC
    MemoryBarrier();
#else
    __sync_synchronize();
#endif // KMP_COMPILER_ICC
  }
#endif // KMP_MIC
#elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64 || \
       KMP_ARCH_RISCV64)
// Nothing to see here; move along.
#elif KMP_ARCH_PPC64
// Nothing needed here (we have a real MB above).
#if KMP_OS_CNK
  // The flushing thread needs to yield here; this prevents a
  // busy-waiting thread from saturating the pipeline. flush is
  // often used in loops like this:
  // while (!flag) {
  //   #pragma omp flush(flag)
  // }
  // and adding the yield here is good for at least a 10x speedup
  // when running >2 threads per core (on the NAS LU benchmark).
  __kmp_yield();
#endif
#else
#error Unknown or unsupported architecture
#endif

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_flush) {
    ompt_callbacks.ompt_callback(ompt_callback_flush)(
        __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
}

/* -------------------------------------------------------------------------- */
/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.

Execute a barrier.
*/
void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
  KMP_COUNT_BLOCK(OMP_BARRIER);
  KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
  __kmp_assert_valid_gtid(global_tid);

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
    }
    __kmp_check_barrier(global_tid, ct_barrier, loc);
  }

#if OMPT_SUPPORT
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame.ptr == NULL)
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    OMPT_STORE_RETURN_ADDRESS(global_tid);
  }
#endif
  __kmp_threads[global_tid]->th.th_ident = loc;
  // TODO: explicit barrier_wait_id:
  //   this function is called when 'barrier' directive is present or
  //   implicit barrier at the end of a worksharing construct.
  // 1) better to add a per-thread barrier counter to a thread data structure
  // 2) set to 0 when a new team is created
  // 3) no sync is required

  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = ompt_data_none;
  }
#endif
}

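// Illustrative sketch (editor's addition, not in the original source): both an
// explicit
//   #pragma omp barrier
// and the implicit barrier at the end of a worksharing construct lower to a
// single call:
//   __kmpc_barrier(&loc, gtid);
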
/* The BARRIER for a MASTER section is always explicit */
/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number.
@return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
*/
kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
  int status = 0;

  KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
  __kmp_assert_valid_gtid(global_tid);

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

  if (KMP_MASTER_GTID(global_tid)) {
    KMP_COUNT_BLOCK(OMP_MASTER);
    KMP_PUSH_PARTITIONED_TIMER(OMP_master);
    status = 1;
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (status) {
    if (ompt_enabled.ompt_callback_master) {
      kmp_info_t *this_thr = __kmp_threads[global_tid];
      kmp_team_t *team = this_thr->th.th_team;

      int tid = __kmp_tid_from_gtid(global_tid);
      ompt_callbacks.ompt_callback(ompt_callback_master)(
          ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
          &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
          OMPT_GET_RETURN_ADDRESS(0));
    }
  }
#endif

  if (__kmp_env_consistency_check) {
#if KMP_USE_DYNAMIC_LOCK
    if (status)
      __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
    else
      __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
#else
    if (status)
      __kmp_push_sync(global_tid, ct_master, loc, NULL);
    else
      __kmp_check_sync(global_tid, ct_master, loc, NULL);
#endif
  }

  return status;
}

/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number.

Mark the end of a <tt>master</tt> region. This should only be called by the
thread that executes the <tt>master</tt> region.
*/
void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
  KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
  __kmp_assert_valid_gtid(global_tid);
  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
  KMP_POP_PARTITIONED_TIMER();

#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_info_t *this_thr = __kmp_threads[global_tid];
  kmp_team_t *team = this_thr->th.th_team;
  if (ompt_enabled.ompt_callback_master) {
    int tid = __kmp_tid_from_gtid(global_tid);
    ompt_callbacks.ompt_callback(ompt_callback_master)(
        ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
        &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
        OMPT_GET_RETURN_ADDRESS(0));
  }
#endif

  if (__kmp_env_consistency_check) {
    if (KMP_MASTER_GTID(global_tid))
      __kmp_pop_sync(global_tid, ct_master, loc);
  }
}

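// Illustrative sketch (editor's addition, not in the original source): for
//   #pragma omp master
//   { /* body */ }
// the compiler typically emits
//   if (__kmpc_master(&loc, gtid)) {
//     /* master-only body */
//     __kmpc_end_master(&loc, gtid);
//   }
// Note that no barrier is implied; the other threads simply skip the block.
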
/*!
@ingroup WORK_SHARING
@param loc source location information.
@param gtid global thread number.

Start execution of an <tt>ordered</tt> construct.
*/
void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
  int cid = 0;
  kmp_info_t *th;
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
  __kmp_assert_valid_gtid(gtid);

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

#if USE_ITT_BUILD
  __kmp_itt_ordered_prep(gtid);
// TODO: ordered_wait_id
#endif /* USE_ITT_BUILD */

  th = __kmp_threads[gtid];

#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_team_t *team;
  ompt_wait_id_t lck;
  void *codeptr_ra;
  if (ompt_enabled.enabled) {
    OMPT_STORE_RETURN_ADDRESS(gtid);
    team = __kmp_team_from_gtid(gtid);
    lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
    /* OMPT state update */
    th->th.ompt_thread_info.wait_id = lck;
    th->th.ompt_thread_info.state = ompt_state_wait_ordered;

    /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
    if (ompt_enabled.ompt_callback_mutex_acquire) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
          ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
          codeptr_ra);
    }
  }
#endif

  if (th->th.th_dispatch->th_deo_fcn != 0)
    (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
  else
    __kmp_parallel_deo(&gtid, &cid, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    /* OMPT state update */
    th->th.ompt_thread_info.state = ompt_state_work_parallel;
    th->th.ompt_thread_info.wait_id = 0;

    /* OMPT event callback */
    if (ompt_enabled.ompt_callback_mutex_acquired) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
          ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
    }
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_ordered_start(gtid);
#endif /* USE_ITT_BUILD */
}

/*!
@ingroup WORK_SHARING
@param loc source location information.
@param gtid global thread number.

End execution of an <tt>ordered</tt> construct.
*/
void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
  int cid = 0;
  kmp_info_t *th;

  KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
  __kmp_assert_valid_gtid(gtid);

#if USE_ITT_BUILD
  __kmp_itt_ordered_end(gtid);
// TODO: ordered_wait_id
#endif /* USE_ITT_BUILD */

  th = __kmp_threads[gtid];

  if (th->th.th_dispatch->th_dxo_fcn != 0)
    (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
  else
    __kmp_parallel_dxo(&gtid, &cid, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
  if (ompt_enabled.ompt_callback_mutex_released) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
        ompt_mutex_ordered,
        (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
            ->t.t_ordered.dt.t_value,
        OMPT_LOAD_RETURN_ADDRESS(gtid));
  }
#endif
}

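// Illustrative sketch (editor's addition, not in the original source): inside
// a loop compiled with an ordered clause, the body of
//   #pragma omp ordered
// is bracketed as
//   __kmpc_ordered(&loc, gtid);
//   /* code that must run in iteration order */
//   __kmpc_end_ordered(&loc, gtid);
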
#if KMP_USE_DYNAMIC_LOCK

static __forceinline void
__kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
                          kmp_int32 gtid, kmp_indirect_locktag_t tag) {
  // Pointer to the allocated indirect lock is written to crit, while indexing
  // is ignored.
  void *idx;
  kmp_indirect_lock_t **lck;
  lck = (kmp_indirect_lock_t **)crit;
  kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
  KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
  KMP_SET_I_LOCK_LOCATION(ilk, loc);
  KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
  KA_TRACE(20,
           ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
#if USE_ITT_BUILD
  __kmp_itt_critical_creating(ilk->lock, loc);
#endif
  int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
  if (status == 0) {
#if USE_ITT_BUILD
    __kmp_itt_critical_destroyed(ilk->lock);
#endif
    // We don't really need to destroy the unclaimed lock here since it will be
    // cleaned up at program exit.
    // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
  }
  KMP_DEBUG_ASSERT(*lck != NULL);
}

// Fast-path acquire tas lock
#define KMP_ACQUIRE_TAS_LOCK(lock, gtid) \
  { \
    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \
    kmp_int32 tas_free = KMP_LOCK_FREE(tas); \
    kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas); \
    if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free || \
        !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) { \
      kmp_uint32 spins; \
      KMP_FSYNC_PREPARE(l); \
      KMP_INIT_YIELD(spins); \
      kmp_backoff_t backoff = __kmp_spin_backoff_params; \
      do { \
        if (TCR_4(__kmp_nth) > \
            (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \
          KMP_YIELD(TRUE); \
        } else { \
          KMP_YIELD_SPIN(spins); \
        } \
        __kmp_spin_backoff(&backoff); \
      } while ( \
          KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free || \
          !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)); \
    } \
    KMP_FSYNC_ACQUIRED(l); \
  }

// Fast-path test tas lock
#define KMP_TEST_TAS_LOCK(lock, gtid, rc) \
  { \
    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \
    kmp_int32 tas_free = KMP_LOCK_FREE(tas); \
    kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas); \
    rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free && \
         __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy); \
  }

// Fast-path release tas lock
#define KMP_RELEASE_TAS_LOCK(lock, gtid) \
  { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }

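// Usage note (editor's addition, not in the original source): the TAS fast
// paths above are expanded inline where KMP_USE_INLINED_TAS is enabled; e.g.
// __kmpc_critical_with_hint acquires with
//   KMP_ACQUIRE_TAS_LOCK(lck, global_tid);  // CAS free -> gtid+1, spin/backoff
// and __kmpc_end_critical releases with
//   KMP_RELEASE_TAS_LOCK(lck, global_tid);  // release-store KMP_LOCK_FREE(tas)
// KMP_TEST_TAS_LOCK makes a single acquire attempt and reports success in rc.
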
#if KMP_USE_FUTEX

#include <sys/syscall.h>
#include <unistd.h>
#ifndef FUTEX_WAIT
#define FUTEX_WAIT 0
#endif
#ifndef FUTEX_WAKE
#define FUTEX_WAKE 1
#endif

// Fast-path acquire futex lock
#define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid) \
  { \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
    kmp_int32 gtid_code = (gtid + 1) << 1; \
    KMP_MB(); \
    KMP_FSYNC_PREPARE(ftx); \
    kmp_int32 poll_val; \
    while ((poll_val = KMP_COMPARE_AND_STORE_RET32( \
                &(ftx->lk.poll), KMP_LOCK_FREE(futex), \
                KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) { \
      kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1; \
      if (!cond) { \
        if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val, \
                                         poll_val | \
                                             KMP_LOCK_BUSY(1, futex))) { \
          continue; \
        } \
        poll_val |= KMP_LOCK_BUSY(1, futex); \
      } \
      kmp_int32 rc; \
      if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val, \
                        NULL, NULL, 0)) != 0) { \
        continue; \
      } \
      gtid_code |= 1; \
    } \
    KMP_FSYNC_ACQUIRED(ftx); \
  }

// Fast-path test futex lock
#define KMP_TEST_FUTEX_LOCK(lock, gtid, rc) \
  { \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
    if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex), \
                                    KMP_LOCK_BUSY((gtid + 1) << 1, futex))) { \
      KMP_FSYNC_ACQUIRED(ftx); \
      rc = TRUE; \
    } else { \
      rc = FALSE; \
    } \
  }

// Fast-path release futex lock
#define KMP_RELEASE_FUTEX_LOCK(lock, gtid) \
  { \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
    KMP_MB(); \
    KMP_FSYNC_RELEASING(ftx); \
    kmp_int32 poll_val = \
        KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex)); \
    if (KMP_LOCK_STRIP(poll_val) & 1) { \
      syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE, \
              KMP_LOCK_BUSY(1, futex), NULL, NULL, 0); \
    } \
    KMP_MB(); \
    KMP_YIELD_OVERSUB(); \
  }

#endif // KMP_USE_FUTEX

#else // KMP_USE_DYNAMIC_LOCK

static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
                                                      ident_t const *loc,
                                                      kmp_int32 gtid) {
  kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;

  // Because of the double-check, the following load doesn't need to be volatile
  kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);

  if (lck == NULL) {
    void *idx;

    // Allocate & initialize the lock.
    // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
    lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
    __kmp_init_user_lock_with_checks(lck);
    __kmp_set_user_lock_location(lck, loc);
#if USE_ITT_BUILD
    __kmp_itt_critical_creating(lck);
// __kmp_itt_critical_creating() should be called *before* the first usage
// of the underlying lock. It is the only place where we can guarantee it.
// There is a chance the lock will be destroyed with no usage, but that is not
// a problem, because this is not a real event seen by the user but rather a
// setting of a name for the object (lock). See more details in kmp_itt.h.
#endif /* USE_ITT_BUILD */

    // Use a cmpxchg instruction to slam the start of the critical section with
    // the lock pointer. If another thread beat us to it, deallocate the lock,
    // and use the lock that the other thread allocated.
    int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);

    if (status == 0) {
      // Deallocate the lock and reload the value.
#if USE_ITT_BUILD
      __kmp_itt_critical_destroyed(lck);
// Let ITT know the lock is destroyed and the same memory location may be
// reused for another purpose.
#endif /* USE_ITT_BUILD */
      __kmp_destroy_user_lock_with_checks(lck);
      __kmp_user_lock_free(&idx, gtid, lck);
      lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
      KMP_DEBUG_ASSERT(lck != NULL);
    }
  }
  return lck;
}

#endif // KMP_USE_DYNAMIC_LOCK

/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number.
@param crit identity of the critical section. This could be a pointer to a lock
associated with the critical section, or some other suitably unique value.

Enter code protected by a `critical` construct.
This function blocks until the executing thread can enter the critical section.
*/
void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
                     kmp_critical_name *crit) {
#if KMP_USE_DYNAMIC_LOCK
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif // OMPT_SUPPORT
  __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
#else
  KMP_COUNT_BLOCK(OMP_CRITICAL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  ompt_state_t prev_state = ompt_state_undefined;
  ompt_thread_info_t ti;
#endif
  kmp_user_lock_p lck;

  KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
  __kmp_assert_valid_gtid(global_tid);

  // TODO: add THR_OVHD_STATE

  KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
  KMP_CHECK_USER_LOCK_INIT();

  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#endif
  else { // ticket, queuing or drdpa
    lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
  }

  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_critical, loc, lck);

  // since the critical directive binds to all threads, not just the current
  // team we have to check this even if we are in a serialized team.
  // also, even if we are the uber thread, we still have to conduct the lock,
  // as we have to contend with sibling threads.

#if USE_ITT_BUILD
  __kmp_itt_critical_acquiring(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(global_tid);
  void *codeptr_ra = NULL;
  if (ompt_enabled.enabled) {
    ti = __kmp_threads[global_tid]->th.ompt_thread_info;
    /* OMPT state update */
    prev_state = ti.state;
    ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
    ti.state = ompt_state_wait_critical;

    /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
    if (ompt_enabled.ompt_callback_mutex_acquire) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
          ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
          (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
    }
  }
#endif
  // Value of 'crit' should be good for using as a critical_id of the critical
  // section directive.
  __kmp_acquire_user_lock_with_checks(lck, global_tid);

#if USE_ITT_BUILD
  __kmp_itt_critical_acquired(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    /* OMPT state update */
    ti.state = prev_state;
    ti.wait_id = 0;

    /* OMPT event callback */
    if (ompt_enabled.ompt_callback_mutex_acquired) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
          ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
    }
  }
#endif
  KMP_POP_PARTITIONED_TIMER();

  KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
  KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
#endif // KMP_USE_DYNAMIC_LOCK
}

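// Illustrative sketch (editor's addition, not in the original source): for
//   #pragma omp critical (name)
//   { /* body */ }
// the compiler emits a zero-initialized lock word shared by all uses of the
// same name and brackets the body with
//   static kmp_critical_name crit_name = {}; // identifier is illustrative
//   __kmpc_critical(&loc, gtid, &crit_name);
//   /* protected body */
//   __kmpc_end_critical(&loc, gtid, &crit_name);
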
#if KMP_USE_DYNAMIC_LOCK

// Converts the given hint to an internal lock implementation
static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
#if KMP_USE_TSX
#define KMP_TSX_LOCK(seq) lockseq_##seq
#else
#define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
#endif

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
#define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
#else
#define KMP_CPUINFO_RTM 0
#endif

  // Hints that do not require further logic
  if (hint & kmp_lock_hint_hle)
    return KMP_TSX_LOCK(hle);
  if (hint & kmp_lock_hint_rtm)
    return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
  if (hint & kmp_lock_hint_adaptive)
    return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;

  // Rule out conflicting hints first by returning the default lock
  if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
    return __kmp_user_lock_seq;
  if ((hint & omp_lock_hint_speculative) &&
      (hint & omp_lock_hint_nonspeculative))
    return __kmp_user_lock_seq;

  // Do not even consider speculation when it appears to be contended
  if (hint & omp_lock_hint_contended)
    return lockseq_queuing;

  // Uncontended lock without speculation
  if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
    return lockseq_tas;

  // HLE lock for speculation
  if (hint & omp_lock_hint_speculative)
    return KMP_TSX_LOCK(hle);

  return __kmp_user_lock_seq;
}

#if OMPT_SUPPORT && OMPT_OPTIONAL
#if KMP_USE_DYNAMIC_LOCK
static kmp_mutex_impl_t
__ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
  if (user_lock) {
    switch (KMP_EXTRACT_D_TAG(user_lock)) {
    case 0:
      break;
#if KMP_USE_FUTEX
    case locktag_futex:
      return kmp_mutex_impl_queuing;
#endif
    case locktag_tas:
      return kmp_mutex_impl_spin;
#if KMP_USE_TSX
    case locktag_hle:
      return kmp_mutex_impl_speculative;
#endif
    default:
      return kmp_mutex_impl_none;
    }
    ilock = KMP_LOOKUP_I_LOCK(user_lock);
  }
  KMP_ASSERT(ilock);
  switch (ilock->type) {
#if KMP_USE_TSX
  case locktag_adaptive:
  case locktag_rtm:
    return kmp_mutex_impl_speculative;
#endif
  case locktag_nested_tas:
    return kmp_mutex_impl_spin;
#if KMP_USE_FUTEX
  case locktag_nested_futex:
#endif
  case locktag_ticket:
  case locktag_queuing:
  case locktag_drdpa:
  case locktag_nested_ticket:
  case locktag_nested_queuing:
  case locktag_nested_drdpa:
    return kmp_mutex_impl_queuing;
  default:
    return kmp_mutex_impl_none;
  }
}
#else
// For locks without dynamic binding
static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
  switch (__kmp_user_lock_kind) {
  case lk_tas:
    return kmp_mutex_impl_spin;
#if KMP_USE_FUTEX
  case lk_futex:
#endif
  case lk_ticket:
  case lk_queuing:
  case lk_drdpa:
    return kmp_mutex_impl_queuing;
#if KMP_USE_TSX
  case lk_hle:
  case lk_rtm:
  case lk_adaptive:
    return kmp_mutex_impl_speculative;
#endif
  default:
    return kmp_mutex_impl_none;
  }
}
#endif // KMP_USE_DYNAMIC_LOCK
#endif // OMPT_SUPPORT && OMPT_OPTIONAL

/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number.
@param crit identity of the critical section. This could be a pointer to a lock
associated with the critical section, or some other suitably unique value.
@param hint the lock hint.

Enter code protected by a `critical` construct with a hint. The hint value is
used to suggest a lock implementation. This function blocks until the executing
thread can enter the critical section unless the hint suggests use of
speculative execution and the hardware supports it.
*/
void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
                               kmp_critical_name *crit, uint32_t hint) {
  KMP_COUNT_BLOCK(OMP_CRITICAL);
  kmp_user_lock_p lck;
#if OMPT_SUPPORT && OMPT_OPTIONAL
  ompt_state_t prev_state = ompt_state_undefined;
  ompt_thread_info_t ti;
  // This is the case if called from __kmpc_critical:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
#endif

  KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
  __kmp_assert_valid_gtid(global_tid);

  kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
  // Check if it is initialized.
  KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
  if (*lk == 0) {
    kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
    if (KMP_IS_D_LOCK(lckseq)) {
      KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
                                  KMP_GET_D_TAG(lckseq));
    } else {
      __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
    }
  }
  // Branch for accessing the actual lock object and set operation. This
  // branching is inevitable since this lock initialization does not follow the
  // normal dispatch path (lock table is not used).
  if (KMP_EXTRACT_D_TAG(lk) != 0) {
    lck = (kmp_user_lock_p)lk;
    if (__kmp_env_consistency_check) {
      __kmp_push_sync(global_tid, ct_critical, loc, lck,
                      __kmp_map_hint_to_lock(hint));
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_acquiring(lck);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ti = __kmp_threads[global_tid]->th.ompt_thread_info;
      /* OMPT state update */
      prev_state = ti.state;
      ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
      ti.state = ompt_state_wait_critical;

      /* OMPT event callback */
      if (ompt_enabled.ompt_callback_mutex_acquire) {
        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
            ompt_mutex_critical, (unsigned int)hint,
            __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
            codeptr);
      }
    }
#endif
#if KMP_USE_INLINED_TAS
    if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
      KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
    } else
#elif KMP_USE_INLINED_FUTEX
    if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
      KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
    } else
#endif
    {
      KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
    }
  } else {
    kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
    lck = ilk->lock;
    if (__kmp_env_consistency_check) {
      __kmp_push_sync(global_tid, ct_critical, loc, lck,
                      __kmp_map_hint_to_lock(hint));
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_acquiring(lck);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ti = __kmp_threads[global_tid]->th.ompt_thread_info;
      /* OMPT state update */
      prev_state = ti.state;
      ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
      ti.state = ompt_state_wait_critical;

      /* OMPT event callback */
      if (ompt_enabled.ompt_callback_mutex_acquire) {
        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
            ompt_mutex_critical, (unsigned int)hint,
            __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
            codeptr);
      }
    }
#endif
    KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
  }
  KMP_POP_PARTITIONED_TIMER();

#if USE_ITT_BUILD
  __kmp_itt_critical_acquired(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    /* OMPT state update */
    ti.state = prev_state;
    ti.wait_id = 0;

    /* OMPT event callback */
    if (ompt_enabled.ompt_callback_mutex_acquired) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
          ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
    }
  }
#endif

  KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
  KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
} // __kmpc_critical_with_hint

#endif // KMP_USE_DYNAMIC_LOCK

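// Illustrative sketch (editor's addition, not in the original source): with a
// hint clause, e.g.
//   #pragma omp critical (name) hint(omp_sync_hint_contended)
// the compiler calls the hinted entry point instead, and the hint is passed
// through to __kmp_map_hint_to_lock to choose a lock implementation:
//   __kmpc_critical_with_hint(&loc, gtid, &crit_name, omp_lock_hint_contended);
//   /* protected body */
//   __kmpc_end_critical(&loc, gtid, &crit_name);
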
/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number.
@param crit identity of the critical section. This could be a pointer to a lock
associated with the critical section, or some other suitably unique value.

Leave a critical section, releasing any lock that was held during its execution.
*/
void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
                         kmp_critical_name *crit) {
  kmp_user_lock_p lck;

  KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));

#if KMP_USE_DYNAMIC_LOCK
  if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
    lck = (kmp_user_lock_p)crit;
    KMP_ASSERT(lck != NULL);
    if (__kmp_env_consistency_check) {
      __kmp_pop_sync(global_tid, ct_critical, loc);
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_releasing(lck);
#endif
#if KMP_USE_INLINED_TAS
    if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
      KMP_RELEASE_TAS_LOCK(lck, global_tid);
    } else
#elif KMP_USE_INLINED_FUTEX
    if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
      KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
    } else
#endif
    {
      KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
    }
  } else {
    kmp_indirect_lock_t *ilk =
        (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
    KMP_ASSERT(ilk != NULL);
    lck = ilk->lock;
    if (__kmp_env_consistency_check) {
      __kmp_pop_sync(global_tid, ct_critical, loc);
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_releasing(lck);
#endif
    KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
  }

#else // KMP_USE_DYNAMIC_LOCK

  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#endif
  else { // ticket, queuing or drdpa
    lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
  }

  KMP_ASSERT(lck != NULL);

  if (__kmp_env_consistency_check)
    __kmp_pop_sync(global_tid, ct_critical, loc);

#if USE_ITT_BUILD
  __kmp_itt_critical_releasing(lck);
#endif /* USE_ITT_BUILD */
  // Value of 'crit' should be good for using as a critical_id of the critical
  // section directive.
  __kmp_release_user_lock_with_checks(lck, global_tid);

#endif // KMP_USE_DYNAMIC_LOCK

#if OMPT_SUPPORT && OMPT_OPTIONAL
  /* OMPT release event triggers after lock is released; place here to trigger
   * for all #if branches */
  OMPT_STORE_RETURN_ADDRESS(global_tid);
  if (ompt_enabled.ompt_callback_mutex_released) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
        ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
        OMPT_LOAD_RETURN_ADDRESS(0));
  }
#endif

  KMP_POP_PARTITIONED_TIMER();
  KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.
@return one if the thread should execute the master block, zero otherwise

Start execution of a combined barrier and master. The barrier is executed inside
this function.
*/
kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
  int status;
  KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
  __kmp_assert_valid_gtid(global_tid);

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

  if (__kmp_env_consistency_check)
    __kmp_check_barrier(global_tid, ct_barrier, loc);

#if OMPT_SUPPORT
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame.ptr == NULL)
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    OMPT_STORE_RETURN_ADDRESS(global_tid);
  }
#endif
#if USE_ITT_NOTIFY
  __kmp_threads[global_tid]->th.th_ident = loc;
#endif
  status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = ompt_data_none;
  }
#endif

  return (status != 0) ? 0 : 1;
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.

Complete the execution of a combined barrier and master. This function should
only be called at the completion of the <tt>master</tt> code. Other threads will
still be waiting at the barrier and this call releases them.
*/
void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
  KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
  __kmp_assert_valid_gtid(global_tid);
  __kmp_end_split_barrier(bs_plain_barrier, global_tid);
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.
@return one if the thread should execute the master block, zero otherwise

Start execution of a combined barrier and master(nowait) construct.
The barrier is executed inside this function.
There is no equivalent "end" function, since the construct needs no closing
barrier; the clean-up that __kmpc_end_master would otherwise do is performed
at the end of this function instead.
*/
kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
  kmp_int32 ret;
  KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
  __kmp_assert_valid_gtid(global_tid);

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
    }
    __kmp_check_barrier(global_tid, ct_barrier, loc);
  }

#if OMPT_SUPPORT
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame.ptr == NULL)
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    OMPT_STORE_RETURN_ADDRESS(global_tid);
  }
#endif
#if USE_ITT_NOTIFY
  __kmp_threads[global_tid]->th.th_ident = loc;
#endif
  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = ompt_data_none;
  }
#endif

  ret = __kmpc_master(loc, global_tid);

  if (__kmp_env_consistency_check) {
    /* there's no __kmpc_end_master called; so the (stats) */
    /* actions of __kmpc_end_master are done here */
    if (ret) {
      /* only one thread should do the pop since only */
      /* one did the push (see __kmpc_master()) */
      __kmp_pop_sync(global_tid, ct_master, loc);
    }
  }

  return (ret);
}

1685 } 1686 __kmp_check_barrier(global_tid, ct_barrier, loc); 1687 } 1688 1689 #if OMPT_SUPPORT 1690 ompt_frame_t *ompt_frame; 1691 if (ompt_enabled.enabled) { 1692 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); 1693 if (ompt_frame->enter_frame.ptr == NULL) 1694 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 1695 OMPT_STORE_RETURN_ADDRESS(global_tid); 1696 } 1697 #endif 1698 #if USE_ITT_NOTIFY 1699 __kmp_threads[global_tid]->th.th_ident = loc; 1700 #endif 1701 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL); 1702 #if OMPT_SUPPORT && OMPT_OPTIONAL 1703 if (ompt_enabled.enabled) { 1704 ompt_frame->enter_frame = ompt_data_none; 1705 } 1706 #endif 1707 1708 ret = __kmpc_master(loc, global_tid); 1709 1710 if (__kmp_env_consistency_check) { 1711 /* there's no __kmpc_end_master called; so the (stats) */ 1712 /* actions of __kmpc_end_master are done here */ 1713 if (ret) { 1714 /* only one thread should do the pop since only */ 1715 /* one did the push (see __kmpc_master()) */ 1716 __kmp_pop_sync(global_tid, ct_master, loc); 1717 } 1718 } 1719 1720 return (ret); 1721 } 1722 1723 /* The BARRIER for a SINGLE process section is always explicit */ 1724 /*! 1725 @ingroup WORK_SHARING 1726 @param loc source location information 1727 @param global_tid global thread number 1728 @return One if this thread should execute the single construct, zero otherwise. 1729 1730 Test whether to execute a <tt>single</tt> construct. 1731 There are no implicit barriers in the two "single" calls, rather the compiler 1732 should introduce an explicit barrier if it is required. 1733 */ 1734 1735 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) { 1736 __kmp_assert_valid_gtid(global_tid); 1737 kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE); 1738 1739 if (rc) { 1740 // We are going to execute the single statement, so we should count it. 1741 KMP_COUNT_BLOCK(OMP_SINGLE); 1742 KMP_PUSH_PARTITIONED_TIMER(OMP_single); 1743 } 1744 1745 #if OMPT_SUPPORT && OMPT_OPTIONAL 1746 kmp_info_t *this_thr = __kmp_threads[global_tid]; 1747 kmp_team_t *team = this_thr->th.th_team; 1748 int tid = __kmp_tid_from_gtid(global_tid); 1749 1750 if (ompt_enabled.enabled) { 1751 if (rc) { 1752 if (ompt_enabled.ompt_callback_work) { 1753 ompt_callbacks.ompt_callback(ompt_callback_work)( 1754 ompt_work_single_executor, ompt_scope_begin, 1755 &(team->t.ompt_team_info.parallel_data), 1756 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1757 1, OMPT_GET_RETURN_ADDRESS(0)); 1758 } 1759 } else { 1760 if (ompt_enabled.ompt_callback_work) { 1761 ompt_callbacks.ompt_callback(ompt_callback_work)( 1762 ompt_work_single_other, ompt_scope_begin, 1763 &(team->t.ompt_team_info.parallel_data), 1764 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1765 1, OMPT_GET_RETURN_ADDRESS(0)); 1766 ompt_callbacks.ompt_callback(ompt_callback_work)( 1767 ompt_work_single_other, ompt_scope_end, 1768 &(team->t.ompt_team_info.parallel_data), 1769 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1770 1, OMPT_GET_RETURN_ADDRESS(0)); 1771 } 1772 } 1773 } 1774 #endif 1775 1776 return rc; 1777 } 1778 1779 /*! 1780 @ingroup WORK_SHARING 1781 @param loc source location information 1782 @param global_tid global thread number 1783 1784 Mark the end of a <tt>single</tt> construct. This function should 1785 only be called by the thread that executed the block of code protected 1786 by the `single` construct. 
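A hedged sketch of the calls a compiler might emit for a <tt>single</tt> region (the trailing explicit barrier is an assumption for the no-nowait case, since neither entry point contains a barrier of its own):
@code
if (__kmpc_single(&loc, gtid)) {
  // ... code in the single region, executed by exactly one thread ...
  __kmpc_end_single(&loc, gtid);
}
__kmpc_barrier(&loc, gtid); // omitted when the construct has a nowait clause
@endcode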
1787 */ 1788 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) { 1789 __kmp_assert_valid_gtid(global_tid); 1790 __kmp_exit_single(global_tid); 1791 KMP_POP_PARTITIONED_TIMER(); 1792 1793 #if OMPT_SUPPORT && OMPT_OPTIONAL 1794 kmp_info_t *this_thr = __kmp_threads[global_tid]; 1795 kmp_team_t *team = this_thr->th.th_team; 1796 int tid = __kmp_tid_from_gtid(global_tid); 1797 1798 if (ompt_enabled.ompt_callback_work) { 1799 ompt_callbacks.ompt_callback(ompt_callback_work)( 1800 ompt_work_single_executor, ompt_scope_end, 1801 &(team->t.ompt_team_info.parallel_data), 1802 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1, 1803 OMPT_GET_RETURN_ADDRESS(0)); 1804 } 1805 #endif 1806 } 1807 1808 /*! 1809 @ingroup WORK_SHARING 1810 @param loc Source location 1811 @param global_tid Global thread id 1812 1813 Mark the end of a statically scheduled loop. 1814 */ 1815 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) { 1816 KMP_POP_PARTITIONED_TIMER(); 1817 KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid)); 1818 1819 #if OMPT_SUPPORT && OMPT_OPTIONAL 1820 if (ompt_enabled.ompt_callback_work) { 1821 ompt_work_t ompt_work_type = ompt_work_loop; 1822 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 1823 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 1824 // Determine workshare type 1825 if (loc != NULL) { 1826 if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) { 1827 ompt_work_type = ompt_work_loop; 1828 } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) { 1829 ompt_work_type = ompt_work_sections; 1830 } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) { 1831 ompt_work_type = ompt_work_distribute; 1832 } else { 1833 // use default set above. 1834 // a warning about this case is provided in __kmpc_for_static_init 1835 } 1836 KMP_DEBUG_ASSERT(ompt_work_type); 1837 } 1838 ompt_callbacks.ompt_callback(ompt_callback_work)( 1839 ompt_work_type, ompt_scope_end, &(team_info->parallel_data), 1840 &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0)); 1841 } 1842 #endif 1843 if (__kmp_env_consistency_check) 1844 __kmp_pop_workshare(global_tid, ct_pdo, loc); 1845 } 1846 1847 // User routines which take C-style arguments (call by value) 1848 // different from the Fortran equivalent routines 1849 1850 void ompc_set_num_threads(int arg) { 1851 // !!!!! TODO: check the per-task binding 1852 __kmp_set_num_threads(arg, __kmp_entry_gtid()); 1853 } 1854 1855 void ompc_set_dynamic(int flag) { 1856 kmp_info_t *thread; 1857 1858 /* For the thread-private implementation of the internal controls */ 1859 thread = __kmp_entry_thread(); 1860 1861 __kmp_save_internal_controls(thread); 1862 1863 set__dynamic(thread, flag ? TRUE : FALSE); 1864 } 1865 1866 void ompc_set_nested(int flag) { 1867 kmp_info_t *thread; 1868 1869 /* For the thread-private internal controls implementation */ 1870 thread = __kmp_entry_thread(); 1871 1872 __kmp_save_internal_controls(thread); 1873 1874 set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1); 1875 } 1876 1877 void ompc_set_max_active_levels(int max_active_levels) { 1878 /* TO DO */ 1879 /* we want per-task implementation of this internal control */ 1880 1881 /* For the per-thread internal controls implementation */ 1882 __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels); 1883 } 1884 1885 void ompc_set_schedule(omp_sched_t kind, int modifier) { 1886 // !!!!! 
TODO: check the per-task binding 1887 __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier); 1888 } 1889 1890 int ompc_get_ancestor_thread_num(int level) { 1891 return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level); 1892 } 1893 1894 int ompc_get_team_size(int level) { 1895 return __kmp_get_team_size(__kmp_entry_gtid(), level); 1896 } 1897 1898 /* OpenMP 5.0 Affinity Format API */ 1899 1900 void ompc_set_affinity_format(char const *format) { 1901 if (!__kmp_init_serial) { 1902 __kmp_serial_initialize(); 1903 } 1904 __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE, 1905 format, KMP_STRLEN(format) + 1); 1906 } 1907 1908 size_t ompc_get_affinity_format(char *buffer, size_t size) { 1909 size_t format_size; 1910 if (!__kmp_init_serial) { 1911 __kmp_serial_initialize(); 1912 } 1913 format_size = KMP_STRLEN(__kmp_affinity_format); 1914 if (buffer && size) { 1915 __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format, 1916 format_size + 1); 1917 } 1918 return format_size; 1919 } 1920 1921 void ompc_display_affinity(char const *format) { 1922 int gtid; 1923 if (!TCR_4(__kmp_init_middle)) { 1924 __kmp_middle_initialize(); 1925 } 1926 gtid = __kmp_get_gtid(); 1927 __kmp_aux_display_affinity(gtid, format); 1928 } 1929 1930 size_t ompc_capture_affinity(char *buffer, size_t buf_size, 1931 char const *format) { 1932 int gtid; 1933 size_t num_required; 1934 kmp_str_buf_t capture_buf; 1935 if (!TCR_4(__kmp_init_middle)) { 1936 __kmp_middle_initialize(); 1937 } 1938 gtid = __kmp_get_gtid(); 1939 __kmp_str_buf_init(&capture_buf); 1940 num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf); 1941 if (buffer && buf_size) { 1942 __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str, 1943 capture_buf.used + 1); 1944 } 1945 __kmp_str_buf_free(&capture_buf); 1946 return num_required; 1947 } 1948 1949 void kmpc_set_stacksize(int arg) { 1950 // __kmp_aux_set_stacksize initializes the library if needed 1951 __kmp_aux_set_stacksize(arg); 1952 } 1953 1954 void kmpc_set_stacksize_s(size_t arg) { 1955 // __kmp_aux_set_stacksize initializes the library if needed 1956 __kmp_aux_set_stacksize(arg); 1957 } 1958 1959 void kmpc_set_blocktime(int arg) { 1960 int gtid, tid; 1961 kmp_info_t *thread; 1962 1963 gtid = __kmp_entry_gtid(); 1964 tid = __kmp_tid_from_gtid(gtid); 1965 thread = __kmp_thread_from_gtid(gtid); 1966 1967 __kmp_aux_set_blocktime(arg, thread, tid); 1968 } 1969 1970 void kmpc_set_library(int arg) { 1971 // __kmp_user_set_library initializes the library if needed 1972 __kmp_user_set_library((enum library_type)arg); 1973 } 1974 1975 void kmpc_set_defaults(char const *str) { 1976 // __kmp_aux_set_defaults initializes the library if needed 1977 __kmp_aux_set_defaults(str, KMP_STRLEN(str)); 1978 } 1979 1980 void kmpc_set_disp_num_buffers(int arg) { 1981 // ignore after initialization because some teams have already 1982 // allocated dispatch buffers 1983 if (__kmp_init_serial == 0 && arg > 0) 1984 __kmp_dispatch_num_buffers = arg; 1985 } 1986 1987 int kmpc_set_affinity_mask_proc(int proc, void **mask) { 1988 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED 1989 return -1; 1990 #else 1991 if (!TCR_4(__kmp_init_middle)) { 1992 __kmp_middle_initialize(); 1993 } 1994 return __kmp_aux_set_affinity_mask_proc(proc, mask); 1995 #endif 1996 } 1997 1998 int kmpc_unset_affinity_mask_proc(int proc, void **mask) { 1999 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED 2000 return -1; 2001 #else 2002 if (!TCR_4(__kmp_init_middle)) { 2003 __kmp_middle_initialize(); 
2004 } 2005 return __kmp_aux_unset_affinity_mask_proc(proc, mask); 2006 #endif 2007 } 2008 2009 int kmpc_get_affinity_mask_proc(int proc, void **mask) { 2010 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED 2011 return -1; 2012 #else 2013 if (!TCR_4(__kmp_init_middle)) { 2014 __kmp_middle_initialize(); 2015 } 2016 return __kmp_aux_get_affinity_mask_proc(proc, mask); 2017 #endif 2018 } 2019 2020 /* -------------------------------------------------------------------------- */ 2021 /*! 2022 @ingroup THREADPRIVATE 2023 @param loc source location information 2024 @param gtid global thread number 2025 @param cpy_size size of the cpy_data buffer 2026 @param cpy_data pointer to data to be copied 2027 @param cpy_func helper function to call for copying data 2028 @param didit flag variable: 1=single thread; 0=not single thread 2029 2030 __kmpc_copyprivate implements the interface for the private data broadcast 2031 needed for the copyprivate clause associated with a single region in an 2032 OpenMP<sup>*</sup> program (both C and Fortran). 2033 All threads participating in the parallel region call this routine. 2034 One of the threads (called the single thread) should have the <tt>didit</tt> 2035 variable set to 1 and all other threads should have that variable set to 0. 2036 All threads pass a pointer to a data buffer (cpy_data) that they have built. 2037 2038 The OpenMP specification forbids the use of nowait on the single region when a 2039 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a 2040 barrier internally to avoid race conditions, so the code generation for the 2041 single region should avoid generating a barrier after the call to @ref 2042 __kmpc_copyprivate. 2043 2044 The <tt>gtid</tt> parameter is the global thread id for the current thread. 2045 The <tt>loc</tt> parameter is a pointer to source location information. 2046 2047 Internal implementation: The single thread will first copy its descriptor 2048 address (cpy_data) to a team-private location, then the other threads will each 2049 call the function pointed to by the parameter cpy_func, which carries out the 2050 copy by copying the data using the cpy_data buffer. 2051 2052 The cpy_func routine used for the copy and the contents of the data area defined 2053 by cpy_data and cpy_size may be built in any fashion that will allow the copy 2054 to be done. For instance, the cpy_data buffer can hold the actual data to be 2055 copied or it may hold a list of pointers to the data. The cpy_func routine must 2056 interpret the cpy_data buffer appropriately. 2057 2058 The interface to cpy_func is as follows: 2059 @code 2060 void cpy_func( void *destination, void *source ) 2061 @endcode 2062 where void *destination is the cpy_data pointer for the thread being copied to 2063 and void *source is the cpy_data pointer for the thread being copied from. 
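For illustration only (the assumption here is that cpy_data points directly at a single scalar variable; generated code may instead pass a list of pointers), a helper might look like:
@code
static void cpy_func(void *destination, void *source) {
  // destination: cpy_data of the receiving thread
  // source:      cpy_data of the thread that executed the single region
  *(int *)destination = *(int *)source;
}
@endcode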
2064 */ 2065 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size, 2066 void *cpy_data, void (*cpy_func)(void *, void *), 2067 kmp_int32 didit) { 2068 void **data_ptr; 2069 KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid)); 2070 __kmp_assert_valid_gtid(gtid); 2071 2072 KMP_MB(); 2073 2074 data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data; 2075 2076 if (__kmp_env_consistency_check) { 2077 if (loc == 0) { 2078 KMP_WARNING(ConstructIdentInvalid); 2079 } 2080 } 2081 2082 // ToDo: Optimize the following two barriers into some kind of split barrier 2083 2084 if (didit) 2085 *data_ptr = cpy_data; 2086 2087 #if OMPT_SUPPORT 2088 ompt_frame_t *ompt_frame; 2089 if (ompt_enabled.enabled) { 2090 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); 2091 if (ompt_frame->enter_frame.ptr == NULL) 2092 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 2093 OMPT_STORE_RETURN_ADDRESS(gtid); 2094 } 2095 #endif 2096 /* This barrier is not a barrier region boundary */ 2097 #if USE_ITT_NOTIFY 2098 __kmp_threads[gtid]->th.th_ident = loc; 2099 #endif 2100 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); 2101 2102 if (!didit) 2103 (*cpy_func)(cpy_data, *data_ptr); 2104 2105 // Consider next barrier a user-visible barrier for barrier region boundaries 2106 // Nesting checks are already handled by the single construct checks 2107 2108 #if OMPT_SUPPORT 2109 if (ompt_enabled.enabled) { 2110 OMPT_STORE_RETURN_ADDRESS(gtid); 2111 } 2112 #endif 2113 #if USE_ITT_NOTIFY 2114 __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g. 2115 // tasks can overwrite the location) 2116 #endif 2117 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); 2118 #if OMPT_SUPPORT && OMPT_OPTIONAL 2119 if (ompt_enabled.enabled) { 2120 ompt_frame->enter_frame = ompt_data_none; 2121 } 2122 #endif 2123 } 2124 2125 /* -------------------------------------------------------------------------- */ 2126 2127 #define INIT_LOCK __kmp_init_user_lock_with_checks 2128 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks 2129 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks 2130 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed 2131 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks 2132 #define ACQUIRE_NESTED_LOCK_TIMED \ 2133 __kmp_acquire_nested_user_lock_with_checks_timed 2134 #define RELEASE_LOCK __kmp_release_user_lock_with_checks 2135 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks 2136 #define TEST_LOCK __kmp_test_user_lock_with_checks 2137 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks 2138 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks 2139 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks 2140 2141 // TODO: Make check abort messages use location info & pass it into 2142 // with_checks routines 2143 2144 #if KMP_USE_DYNAMIC_LOCK 2145 2146 // internal lock initializer 2147 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock, 2148 kmp_dyna_lockseq_t seq) { 2149 if (KMP_IS_D_LOCK(seq)) { 2150 KMP_INIT_D_LOCK(lock, seq); 2151 #if USE_ITT_BUILD 2152 __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL); 2153 #endif 2154 } else { 2155 KMP_INIT_I_LOCK(lock, seq); 2156 #if USE_ITT_BUILD 2157 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock); 2158 __kmp_itt_lock_creating(ilk->lock, loc); 2159 #endif 2160 } 2161 } 2162 2163 // internal nest lock initializer 2164 static __forceinline void 2165 
__kmp_init_nest_lock_with_hint(ident_t *loc, void **lock, 2166 kmp_dyna_lockseq_t seq) { 2167 #if KMP_USE_TSX 2168 // Don't have nested lock implementation for speculative locks 2169 if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive) 2170 seq = __kmp_user_lock_seq; 2171 #endif 2172 switch (seq) { 2173 case lockseq_tas: 2174 seq = lockseq_nested_tas; 2175 break; 2176 #if KMP_USE_FUTEX 2177 case lockseq_futex: 2178 seq = lockseq_nested_futex; 2179 break; 2180 #endif 2181 case lockseq_ticket: 2182 seq = lockseq_nested_ticket; 2183 break; 2184 case lockseq_queuing: 2185 seq = lockseq_nested_queuing; 2186 break; 2187 case lockseq_drdpa: 2188 seq = lockseq_nested_drdpa; 2189 break; 2190 default: 2191 seq = lockseq_nested_queuing; 2192 } 2193 KMP_INIT_I_LOCK(lock, seq); 2194 #if USE_ITT_BUILD 2195 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock); 2196 __kmp_itt_lock_creating(ilk->lock, loc); 2197 #endif 2198 } 2199 2200 /* initialize the lock with a hint */ 2201 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock, 2202 uintptr_t hint) { 2203 KMP_DEBUG_ASSERT(__kmp_init_serial); 2204 if (__kmp_env_consistency_check && user_lock == NULL) { 2205 KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint"); 2206 } 2207 2208 __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint)); 2209 2210 #if OMPT_SUPPORT && OMPT_OPTIONAL 2211 // This is the case, if called from omp_init_lock_with_hint: 2212 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2213 if (!codeptr) 2214 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2215 if (ompt_enabled.ompt_callback_lock_init) { 2216 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2217 ompt_mutex_lock, (omp_lock_hint_t)hint, 2218 __ompt_get_mutex_impl_type(user_lock), 2219 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 2220 } 2221 #endif 2222 } 2223 2224 /* initialize the lock with a hint */ 2225 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid, 2226 void **user_lock, uintptr_t hint) { 2227 KMP_DEBUG_ASSERT(__kmp_init_serial); 2228 if (__kmp_env_consistency_check && user_lock == NULL) { 2229 KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint"); 2230 } 2231 2232 __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint)); 2233 2234 #if OMPT_SUPPORT && OMPT_OPTIONAL 2235 // This is the case, if called from omp_init_lock_with_hint: 2236 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2237 if (!codeptr) 2238 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2239 if (ompt_enabled.ompt_callback_lock_init) { 2240 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2241 ompt_mutex_nest_lock, (omp_lock_hint_t)hint, 2242 __ompt_get_mutex_impl_type(user_lock), 2243 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 2244 } 2245 #endif 2246 } 2247 2248 #endif // KMP_USE_DYNAMIC_LOCK 2249 2250 /* initialize the lock */ 2251 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2252 #if KMP_USE_DYNAMIC_LOCK 2253 2254 KMP_DEBUG_ASSERT(__kmp_init_serial); 2255 if (__kmp_env_consistency_check && user_lock == NULL) { 2256 KMP_FATAL(LockIsUninitialized, "omp_init_lock"); 2257 } 2258 __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq); 2259 2260 #if OMPT_SUPPORT && OMPT_OPTIONAL 2261 // This is the case, if called from omp_init_lock_with_hint: 2262 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2263 if (!codeptr) 2264 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2265 if (ompt_enabled.ompt_callback_lock_init) { 2266 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2267 
ompt_mutex_lock, omp_lock_hint_none, 2268 __ompt_get_mutex_impl_type(user_lock), 2269 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 2270 } 2271 #endif 2272 2273 #else // KMP_USE_DYNAMIC_LOCK 2274 2275 static char const *const func = "omp_init_lock"; 2276 kmp_user_lock_p lck; 2277 KMP_DEBUG_ASSERT(__kmp_init_serial); 2278 2279 if (__kmp_env_consistency_check) { 2280 if (user_lock == NULL) { 2281 KMP_FATAL(LockIsUninitialized, func); 2282 } 2283 } 2284 2285 KMP_CHECK_USER_LOCK_INIT(); 2286 2287 if ((__kmp_user_lock_kind == lk_tas) && 2288 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2289 lck = (kmp_user_lock_p)user_lock; 2290 } 2291 #if KMP_USE_FUTEX 2292 else if ((__kmp_user_lock_kind == lk_futex) && 2293 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2294 lck = (kmp_user_lock_p)user_lock; 2295 } 2296 #endif 2297 else { 2298 lck = __kmp_user_lock_allocate(user_lock, gtid, 0); 2299 } 2300 INIT_LOCK(lck); 2301 __kmp_set_user_lock_location(lck, loc); 2302 2303 #if OMPT_SUPPORT && OMPT_OPTIONAL 2304 // This is the case, if called from omp_init_lock_with_hint: 2305 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2306 if (!codeptr) 2307 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2308 if (ompt_enabled.ompt_callback_lock_init) { 2309 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2310 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), 2311 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 2312 } 2313 #endif 2314 2315 #if USE_ITT_BUILD 2316 __kmp_itt_lock_creating(lck); 2317 #endif /* USE_ITT_BUILD */ 2318 2319 #endif // KMP_USE_DYNAMIC_LOCK 2320 } // __kmpc_init_lock 2321 2322 /* initialize the lock */ 2323 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2324 #if KMP_USE_DYNAMIC_LOCK 2325 2326 KMP_DEBUG_ASSERT(__kmp_init_serial); 2327 if (__kmp_env_consistency_check && user_lock == NULL) { 2328 KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock"); 2329 } 2330 __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq); 2331 2332 #if OMPT_SUPPORT && OMPT_OPTIONAL 2333 // This is the case, if called from omp_init_lock_with_hint: 2334 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2335 if (!codeptr) 2336 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2337 if (ompt_enabled.ompt_callback_lock_init) { 2338 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2339 ompt_mutex_nest_lock, omp_lock_hint_none, 2340 __ompt_get_mutex_impl_type(user_lock), 2341 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 2342 } 2343 #endif 2344 2345 #else // KMP_USE_DYNAMIC_LOCK 2346 2347 static char const *const func = "omp_init_nest_lock"; 2348 kmp_user_lock_p lck; 2349 KMP_DEBUG_ASSERT(__kmp_init_serial); 2350 2351 if (__kmp_env_consistency_check) { 2352 if (user_lock == NULL) { 2353 KMP_FATAL(LockIsUninitialized, func); 2354 } 2355 } 2356 2357 KMP_CHECK_USER_LOCK_INIT(); 2358 2359 if ((__kmp_user_lock_kind == lk_tas) && 2360 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2361 OMP_NEST_LOCK_T_SIZE)) { 2362 lck = (kmp_user_lock_p)user_lock; 2363 } 2364 #if KMP_USE_FUTEX 2365 else if ((__kmp_user_lock_kind == lk_futex) && 2366 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2367 OMP_NEST_LOCK_T_SIZE)) { 2368 lck = (kmp_user_lock_p)user_lock; 2369 } 2370 #endif 2371 else { 2372 lck = __kmp_user_lock_allocate(user_lock, gtid, 0); 2373 } 2374 2375 INIT_NESTED_LOCK(lck); 2376 __kmp_set_user_lock_location(lck, loc); 2377 2378 #if OMPT_SUPPORT && OMPT_OPTIONAL 2379 // This is the case, if called from omp_init_lock_with_hint: 2380 void 
*codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2381 if (!codeptr) 2382 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2383 if (ompt_enabled.ompt_callback_lock_init) { 2384 ompt_callbacks.ompt_callback(ompt_callback_lock_init)( 2385 ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), 2386 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 2387 } 2388 #endif 2389 2390 #if USE_ITT_BUILD 2391 __kmp_itt_lock_creating(lck); 2392 #endif /* USE_ITT_BUILD */ 2393 2394 #endif // KMP_USE_DYNAMIC_LOCK 2395 } // __kmpc_init_nest_lock 2396 2397 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2398 #if KMP_USE_DYNAMIC_LOCK 2399 2400 #if USE_ITT_BUILD 2401 kmp_user_lock_p lck; 2402 if (KMP_EXTRACT_D_TAG(user_lock) == 0) { 2403 lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock; 2404 } else { 2405 lck = (kmp_user_lock_p)user_lock; 2406 } 2407 __kmp_itt_lock_destroyed(lck); 2408 #endif 2409 #if OMPT_SUPPORT && OMPT_OPTIONAL 2410 // This is the case, if called from omp_init_lock_with_hint: 2411 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2412 if (!codeptr) 2413 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2414 if (ompt_enabled.ompt_callback_lock_destroy) { 2415 kmp_user_lock_p lck; 2416 if (KMP_EXTRACT_D_TAG(user_lock) == 0) { 2417 lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock; 2418 } else { 2419 lck = (kmp_user_lock_p)user_lock; 2420 } 2421 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( 2422 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 2423 } 2424 #endif 2425 KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock); 2426 #else 2427 kmp_user_lock_p lck; 2428 2429 if ((__kmp_user_lock_kind == lk_tas) && 2430 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2431 lck = (kmp_user_lock_p)user_lock; 2432 } 2433 #if KMP_USE_FUTEX 2434 else if ((__kmp_user_lock_kind == lk_futex) && 2435 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2436 lck = (kmp_user_lock_p)user_lock; 2437 } 2438 #endif 2439 else { 2440 lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock"); 2441 } 2442 2443 #if OMPT_SUPPORT && OMPT_OPTIONAL 2444 // This is the case, if called from omp_init_lock_with_hint: 2445 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2446 if (!codeptr) 2447 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2448 if (ompt_enabled.ompt_callback_lock_destroy) { 2449 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( 2450 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 2451 } 2452 #endif 2453 2454 #if USE_ITT_BUILD 2455 __kmp_itt_lock_destroyed(lck); 2456 #endif /* USE_ITT_BUILD */ 2457 DESTROY_LOCK(lck); 2458 2459 if ((__kmp_user_lock_kind == lk_tas) && 2460 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2461 ; 2462 } 2463 #if KMP_USE_FUTEX 2464 else if ((__kmp_user_lock_kind == lk_futex) && 2465 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2466 ; 2467 } 2468 #endif 2469 else { 2470 __kmp_user_lock_free(user_lock, gtid, lck); 2471 } 2472 #endif // KMP_USE_DYNAMIC_LOCK 2473 } // __kmpc_destroy_lock 2474 2475 /* destroy the lock */ 2476 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2477 #if KMP_USE_DYNAMIC_LOCK 2478 2479 #if USE_ITT_BUILD 2480 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock); 2481 __kmp_itt_lock_destroyed(ilk->lock); 2482 #endif 2483 #if OMPT_SUPPORT && OMPT_OPTIONAL 2484 // This is the case, if called from omp_init_lock_with_hint: 2485 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2486 if (!codeptr) 2487 codeptr = 
OMPT_GET_RETURN_ADDRESS(0); 2488 if (ompt_enabled.ompt_callback_lock_destroy) { 2489 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( 2490 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 2491 } 2492 #endif 2493 KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock); 2494 2495 #else // KMP_USE_DYNAMIC_LOCK 2496 2497 kmp_user_lock_p lck; 2498 2499 if ((__kmp_user_lock_kind == lk_tas) && 2500 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2501 OMP_NEST_LOCK_T_SIZE)) { 2502 lck = (kmp_user_lock_p)user_lock; 2503 } 2504 #if KMP_USE_FUTEX 2505 else if ((__kmp_user_lock_kind == lk_futex) && 2506 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2507 OMP_NEST_LOCK_T_SIZE)) { 2508 lck = (kmp_user_lock_p)user_lock; 2509 } 2510 #endif 2511 else { 2512 lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock"); 2513 } 2514 2515 #if OMPT_SUPPORT && OMPT_OPTIONAL 2516 // This is the case, if called from omp_init_lock_with_hint: 2517 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2518 if (!codeptr) 2519 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2520 if (ompt_enabled.ompt_callback_lock_destroy) { 2521 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( 2522 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 2523 } 2524 #endif 2525 2526 #if USE_ITT_BUILD 2527 __kmp_itt_lock_destroyed(lck); 2528 #endif /* USE_ITT_BUILD */ 2529 2530 DESTROY_NESTED_LOCK(lck); 2531 2532 if ((__kmp_user_lock_kind == lk_tas) && 2533 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2534 OMP_NEST_LOCK_T_SIZE)) { 2535 ; 2536 } 2537 #if KMP_USE_FUTEX 2538 else if ((__kmp_user_lock_kind == lk_futex) && 2539 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2540 OMP_NEST_LOCK_T_SIZE)) { 2541 ; 2542 } 2543 #endif 2544 else { 2545 __kmp_user_lock_free(user_lock, gtid, lck); 2546 } 2547 #endif // KMP_USE_DYNAMIC_LOCK 2548 } // __kmpc_destroy_nest_lock 2549 2550 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2551 KMP_COUNT_BLOCK(OMP_set_lock); 2552 #if KMP_USE_DYNAMIC_LOCK 2553 int tag = KMP_EXTRACT_D_TAG(user_lock); 2554 #if USE_ITT_BUILD 2555 __kmp_itt_lock_acquiring( 2556 (kmp_user_lock_p) 2557 user_lock); // itt function will get to the right lock object. 
2558 #endif 2559 #if OMPT_SUPPORT && OMPT_OPTIONAL 2560 // This is the case, if called from omp_init_lock_with_hint: 2561 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2562 if (!codeptr) 2563 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2564 if (ompt_enabled.ompt_callback_mutex_acquire) { 2565 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2566 ompt_mutex_lock, omp_lock_hint_none, 2567 __ompt_get_mutex_impl_type(user_lock), 2568 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 2569 } 2570 #endif 2571 #if KMP_USE_INLINED_TAS 2572 if (tag == locktag_tas && !__kmp_env_consistency_check) { 2573 KMP_ACQUIRE_TAS_LOCK(user_lock, gtid); 2574 } else 2575 #elif KMP_USE_INLINED_FUTEX 2576 if (tag == locktag_futex && !__kmp_env_consistency_check) { 2577 KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid); 2578 } else 2579 #endif 2580 { 2581 __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid); 2582 } 2583 #if USE_ITT_BUILD 2584 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 2585 #endif 2586 #if OMPT_SUPPORT && OMPT_OPTIONAL 2587 if (ompt_enabled.ompt_callback_mutex_acquired) { 2588 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2589 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 2590 } 2591 #endif 2592 2593 #else // KMP_USE_DYNAMIC_LOCK 2594 2595 kmp_user_lock_p lck; 2596 2597 if ((__kmp_user_lock_kind == lk_tas) && 2598 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2599 lck = (kmp_user_lock_p)user_lock; 2600 } 2601 #if KMP_USE_FUTEX 2602 else if ((__kmp_user_lock_kind == lk_futex) && 2603 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2604 lck = (kmp_user_lock_p)user_lock; 2605 } 2606 #endif 2607 else { 2608 lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock"); 2609 } 2610 2611 #if USE_ITT_BUILD 2612 __kmp_itt_lock_acquiring(lck); 2613 #endif /* USE_ITT_BUILD */ 2614 #if OMPT_SUPPORT && OMPT_OPTIONAL 2615 // This is the case, if called from omp_init_lock_with_hint: 2616 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2617 if (!codeptr) 2618 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2619 if (ompt_enabled.ompt_callback_mutex_acquire) { 2620 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2621 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), 2622 (ompt_wait_id_t)(uintptr_t)lck, codeptr); 2623 } 2624 #endif 2625 2626 ACQUIRE_LOCK(lck, gtid); 2627 2628 #if USE_ITT_BUILD 2629 __kmp_itt_lock_acquired(lck); 2630 #endif /* USE_ITT_BUILD */ 2631 2632 #if OMPT_SUPPORT && OMPT_OPTIONAL 2633 if (ompt_enabled.ompt_callback_mutex_acquired) { 2634 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2635 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr); 2636 } 2637 #endif 2638 2639 #endif // KMP_USE_DYNAMIC_LOCK 2640 } 2641 2642 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2643 #if KMP_USE_DYNAMIC_LOCK 2644 2645 #if USE_ITT_BUILD 2646 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); 2647 #endif 2648 #if OMPT_SUPPORT && OMPT_OPTIONAL 2649 // This is the case, if called from omp_init_lock_with_hint: 2650 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2651 if (!codeptr) 2652 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2653 if (ompt_enabled.enabled) { 2654 if (ompt_enabled.ompt_callback_mutex_acquire) { 2655 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2656 ompt_mutex_nest_lock, omp_lock_hint_none, 2657 __ompt_get_mutex_impl_type(user_lock), 2658 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 2659 } 2660 } 2661 #endif 2662 int acquire_status = 2663 KMP_D_LOCK_FUNC(user_lock, 
set)((kmp_dyna_lock_t *)user_lock, gtid); 2664 (void) acquire_status; 2665 #if USE_ITT_BUILD 2666 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 2667 #endif 2668 2669 #if OMPT_SUPPORT && OMPT_OPTIONAL 2670 if (ompt_enabled.enabled) { 2671 if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) { 2672 if (ompt_enabled.ompt_callback_mutex_acquired) { 2673 // lock_first 2674 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2675 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, 2676 codeptr); 2677 } 2678 } else { 2679 if (ompt_enabled.ompt_callback_nest_lock) { 2680 // lock_next 2681 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 2682 ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 2683 } 2684 } 2685 } 2686 #endif 2687 2688 #else // KMP_USE_DYNAMIC_LOCK 2689 int acquire_status; 2690 kmp_user_lock_p lck; 2691 2692 if ((__kmp_user_lock_kind == lk_tas) && 2693 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2694 OMP_NEST_LOCK_T_SIZE)) { 2695 lck = (kmp_user_lock_p)user_lock; 2696 } 2697 #if KMP_USE_FUTEX 2698 else if ((__kmp_user_lock_kind == lk_futex) && 2699 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2700 OMP_NEST_LOCK_T_SIZE)) { 2701 lck = (kmp_user_lock_p)user_lock; 2702 } 2703 #endif 2704 else { 2705 lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock"); 2706 } 2707 2708 #if USE_ITT_BUILD 2709 __kmp_itt_lock_acquiring(lck); 2710 #endif /* USE_ITT_BUILD */ 2711 #if OMPT_SUPPORT && OMPT_OPTIONAL 2712 // This is the case, if called from omp_init_lock_with_hint: 2713 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2714 if (!codeptr) 2715 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2716 if (ompt_enabled.enabled) { 2717 if (ompt_enabled.ompt_callback_mutex_acquire) { 2718 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2719 ompt_mutex_nest_lock, omp_lock_hint_none, 2720 __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck, 2721 codeptr); 2722 } 2723 } 2724 #endif 2725 2726 ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status); 2727 2728 #if USE_ITT_BUILD 2729 __kmp_itt_lock_acquired(lck); 2730 #endif /* USE_ITT_BUILD */ 2731 2732 #if OMPT_SUPPORT && OMPT_OPTIONAL 2733 if (ompt_enabled.enabled) { 2734 if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) { 2735 if (ompt_enabled.ompt_callback_mutex_acquired) { 2736 // lock_first 2737 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2738 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr); 2739 } 2740 } else { 2741 if (ompt_enabled.ompt_callback_nest_lock) { 2742 // lock_next 2743 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 2744 ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr); 2745 } 2746 } 2747 } 2748 #endif 2749 2750 #endif // KMP_USE_DYNAMIC_LOCK 2751 } 2752 2753 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2754 #if KMP_USE_DYNAMIC_LOCK 2755 2756 int tag = KMP_EXTRACT_D_TAG(user_lock); 2757 #if USE_ITT_BUILD 2758 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2759 #endif 2760 #if KMP_USE_INLINED_TAS 2761 if (tag == locktag_tas && !__kmp_env_consistency_check) { 2762 KMP_RELEASE_TAS_LOCK(user_lock, gtid); 2763 } else 2764 #elif KMP_USE_INLINED_FUTEX 2765 if (tag == locktag_futex && !__kmp_env_consistency_check) { 2766 KMP_RELEASE_FUTEX_LOCK(user_lock, gtid); 2767 } else 2768 #endif 2769 { 2770 __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid); 2771 } 2772 2773 #if OMPT_SUPPORT && OMPT_OPTIONAL 2774 // This is the case, if called from omp_init_lock_with_hint: 2775 void 
*codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2776 if (!codeptr) 2777 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2778 if (ompt_enabled.ompt_callback_mutex_released) { 2779 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2780 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 2781 } 2782 #endif 2783 2784 #else // KMP_USE_DYNAMIC_LOCK 2785 2786 kmp_user_lock_p lck; 2787 2788 /* Can't use serial interval since not block structured */ 2789 /* release the lock */ 2790 2791 if ((__kmp_user_lock_kind == lk_tas) && 2792 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2793 #if KMP_OS_LINUX && \ 2794 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 2795 // "fast" path implemented to fix customer performance issue 2796 #if USE_ITT_BUILD 2797 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2798 #endif /* USE_ITT_BUILD */ 2799 TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0); 2800 KMP_MB(); 2801 2802 #if OMPT_SUPPORT && OMPT_OPTIONAL 2803 // This is the case, if called from omp_init_lock_with_hint: 2804 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2805 if (!codeptr) 2806 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2807 if (ompt_enabled.ompt_callback_mutex_released) { 2808 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2809 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr); 2810 } 2811 #endif 2812 2813 return; 2814 #else 2815 lck = (kmp_user_lock_p)user_lock; 2816 #endif 2817 } 2818 #if KMP_USE_FUTEX 2819 else if ((__kmp_user_lock_kind == lk_futex) && 2820 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2821 lck = (kmp_user_lock_p)user_lock; 2822 } 2823 #endif 2824 else { 2825 lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock"); 2826 } 2827 2828 #if USE_ITT_BUILD 2829 __kmp_itt_lock_releasing(lck); 2830 #endif /* USE_ITT_BUILD */ 2831 2832 RELEASE_LOCK(lck, gtid); 2833 2834 #if OMPT_SUPPORT && OMPT_OPTIONAL 2835 // This is the case, if called from omp_init_lock_with_hint: 2836 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2837 if (!codeptr) 2838 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2839 if (ompt_enabled.ompt_callback_mutex_released) { 2840 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2841 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr); 2842 } 2843 #endif 2844 2845 #endif // KMP_USE_DYNAMIC_LOCK 2846 } 2847 2848 /* release the lock */ 2849 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2850 #if KMP_USE_DYNAMIC_LOCK 2851 2852 #if USE_ITT_BUILD 2853 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2854 #endif 2855 int release_status = 2856 KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid); 2857 (void) release_status; 2858 2859 #if OMPT_SUPPORT && OMPT_OPTIONAL 2860 // This is the case, if called from omp_init_lock_with_hint: 2861 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2862 if (!codeptr) 2863 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2864 if (ompt_enabled.enabled) { 2865 if (release_status == KMP_LOCK_RELEASED) { 2866 if (ompt_enabled.ompt_callback_mutex_released) { 2867 // release_lock_last 2868 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2869 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, 2870 codeptr); 2871 } 2872 } else if (ompt_enabled.ompt_callback_nest_lock) { 2873 // release_lock_prev 2874 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 2875 ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 2876 } 2877 } 2878 #endif 2879 2880 #else // KMP_USE_DYNAMIC_LOCK 2881 2882 kmp_user_lock_p lck; 
2883
2884 /* Can't use serial interval since not block structured */
2885
2886 if ((__kmp_user_lock_kind == lk_tas) &&
2887 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2888 OMP_NEST_LOCK_T_SIZE)) {
2889 #if KMP_OS_LINUX && \
2890 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2891 // "fast" path implemented to fix customer performance issue
2892 kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2893 #if USE_ITT_BUILD
2894 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2895 #endif /* USE_ITT_BUILD */
2896
2897 #if OMPT_SUPPORT && OMPT_OPTIONAL
2898 int release_status = KMP_LOCK_STILL_HELD;
2899 #endif
2900
2901 if (--(tl->lk.depth_locked) == 0) {
2902 TCW_4(tl->lk.poll, 0);
2903 #if OMPT_SUPPORT && OMPT_OPTIONAL
2904 release_status = KMP_LOCK_RELEASED;
2905 #endif
2906 }
2907 KMP_MB();
2908
2909 #if OMPT_SUPPORT && OMPT_OPTIONAL
2910 // This is the case, if called from omp_init_lock_with_hint:
2911 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2912 if (!codeptr)
2913 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2914 if (ompt_enabled.enabled) {
2915 if (release_status == KMP_LOCK_RELEASED) {
2916 if (ompt_enabled.ompt_callback_mutex_released) {
2917 // release_lock_last
2918 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2919 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2920 }
2921 } else if (ompt_enabled.ompt_callback_nest_lock) {
2922 // release_lock_previous
2923 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2924 ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2925 }
2926 }
2927 #endif
2928
2929 return;
2930 #else
2931 lck = (kmp_user_lock_p)user_lock;
2932 #endif
2933 }
2934 #if KMP_USE_FUTEX
2935 else if ((__kmp_user_lock_kind == lk_futex) &&
2936 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2937 OMP_NEST_LOCK_T_SIZE)) {
2938 lck = (kmp_user_lock_p)user_lock;
2939 }
2940 #endif
2941 else {
2942 lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2943 }
2944
2945 #if USE_ITT_BUILD
2946 __kmp_itt_lock_releasing(lck);
2947 #endif /* USE_ITT_BUILD */
2948
2949 int release_status;
2950 release_status = RELEASE_NESTED_LOCK(lck, gtid);
2951 #if OMPT_SUPPORT && OMPT_OPTIONAL
2952 // This is the case, if called from omp_init_lock_with_hint:
2953 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2954 if (!codeptr)
2955 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2956 if (ompt_enabled.enabled) {
2957 if (release_status == KMP_LOCK_RELEASED) {
2958 if (ompt_enabled.ompt_callback_mutex_released) {
2959 // release_lock_last
2960 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2961 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2962 }
2963 } else if (ompt_enabled.ompt_callback_nest_lock) {
2964 // release_lock_previous
2965 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2966 ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2967 }
2968 }
2969 #endif
2970
2971 #endif // KMP_USE_DYNAMIC_LOCK
2972 }
2973
2974 /* try to acquire the lock */
2975 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2976 KMP_COUNT_BLOCK(OMP_test_lock);
2977
2978 #if KMP_USE_DYNAMIC_LOCK
2979 int rc;
2980 int tag = KMP_EXTRACT_D_TAG(user_lock);
2981 #if USE_ITT_BUILD
2982 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2983 #endif
2984 #if OMPT_SUPPORT && OMPT_OPTIONAL
2985 // This is the case, if called from omp_init_lock_with_hint:
2986 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2987 if (!codeptr)
2988 codeptr =
OMPT_GET_RETURN_ADDRESS(0); 2989 if (ompt_enabled.ompt_callback_mutex_acquire) { 2990 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2991 ompt_mutex_lock, omp_lock_hint_none, 2992 __ompt_get_mutex_impl_type(user_lock), 2993 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 2994 } 2995 #endif 2996 #if KMP_USE_INLINED_TAS 2997 if (tag == locktag_tas && !__kmp_env_consistency_check) { 2998 KMP_TEST_TAS_LOCK(user_lock, gtid, rc); 2999 } else 3000 #elif KMP_USE_INLINED_FUTEX 3001 if (tag == locktag_futex && !__kmp_env_consistency_check) { 3002 KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc); 3003 } else 3004 #endif 3005 { 3006 rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid); 3007 } 3008 if (rc) { 3009 #if USE_ITT_BUILD 3010 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 3011 #endif 3012 #if OMPT_SUPPORT && OMPT_OPTIONAL 3013 if (ompt_enabled.ompt_callback_mutex_acquired) { 3014 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 3015 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); 3016 } 3017 #endif 3018 return FTN_TRUE; 3019 } else { 3020 #if USE_ITT_BUILD 3021 __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock); 3022 #endif 3023 return FTN_FALSE; 3024 } 3025 3026 #else // KMP_USE_DYNAMIC_LOCK 3027 3028 kmp_user_lock_p lck; 3029 int rc; 3030 3031 if ((__kmp_user_lock_kind == lk_tas) && 3032 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 3033 lck = (kmp_user_lock_p)user_lock; 3034 } 3035 #if KMP_USE_FUTEX 3036 else if ((__kmp_user_lock_kind == lk_futex) && 3037 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 3038 lck = (kmp_user_lock_p)user_lock; 3039 } 3040 #endif 3041 else { 3042 lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock"); 3043 } 3044 3045 #if USE_ITT_BUILD 3046 __kmp_itt_lock_acquiring(lck); 3047 #endif /* USE_ITT_BUILD */ 3048 #if OMPT_SUPPORT && OMPT_OPTIONAL 3049 // This is the case, if called from omp_init_lock_with_hint: 3050 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 3051 if (!codeptr) 3052 codeptr = OMPT_GET_RETURN_ADDRESS(0); 3053 if (ompt_enabled.ompt_callback_mutex_acquire) { 3054 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 3055 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), 3056 (ompt_wait_id_t)(uintptr_t)lck, codeptr); 3057 } 3058 #endif 3059 3060 rc = TEST_LOCK(lck, gtid); 3061 #if USE_ITT_BUILD 3062 if (rc) { 3063 __kmp_itt_lock_acquired(lck); 3064 } else { 3065 __kmp_itt_lock_cancelled(lck); 3066 } 3067 #endif /* USE_ITT_BUILD */ 3068 #if OMPT_SUPPORT && OMPT_OPTIONAL 3069 if (rc && ompt_enabled.ompt_callback_mutex_acquired) { 3070 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 3071 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr); 3072 } 3073 #endif 3074 3075 return (rc ? 
FTN_TRUE : FTN_FALSE);
3076
3077 /* Can't use serial interval since not block structured */
3078
3079 #endif // KMP_USE_DYNAMIC_LOCK
3080 }
3081
3082 /* try to acquire the lock */
3083 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3084 #if KMP_USE_DYNAMIC_LOCK
3085 int rc;
3086 #if USE_ITT_BUILD
3087 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3088 #endif
3089 #if OMPT_SUPPORT && OMPT_OPTIONAL
3090 // This is the case, if called from omp_init_lock_with_hint:
3091 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3092 if (!codeptr)
3093 codeptr = OMPT_GET_RETURN_ADDRESS(0);
3094 if (ompt_enabled.ompt_callback_mutex_acquire) {
3095 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3096 ompt_mutex_nest_lock, omp_lock_hint_none,
3097 __ompt_get_mutex_impl_type(user_lock),
3098 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3099 }
3100 #endif
3101 rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3102 #if USE_ITT_BUILD
3103 if (rc) {
3104 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3105 } else {
3106 __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3107 }
3108 #endif
3109 #if OMPT_SUPPORT && OMPT_OPTIONAL
3110 if (ompt_enabled.enabled && rc) {
3111 if (rc == 1) {
3112 if (ompt_enabled.ompt_callback_mutex_acquired) {
3113 // lock_first
3114 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3115 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3116 codeptr);
3117 }
3118 } else {
3119 if (ompt_enabled.ompt_callback_nest_lock) {
3120 // lock_next
3121 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3122 ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3123 }
3124 }
3125 }
3126 #endif
3127 return rc;
3128
3129 #else // KMP_USE_DYNAMIC_LOCK
3130
3131 kmp_user_lock_p lck;
3132 int rc;
3133
3134 if ((__kmp_user_lock_kind == lk_tas) &&
3135 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3136 OMP_NEST_LOCK_T_SIZE)) {
3137 lck = (kmp_user_lock_p)user_lock;
3138 }
3139 #if KMP_USE_FUTEX
3140 else if ((__kmp_user_lock_kind == lk_futex) &&
3141 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3142 OMP_NEST_LOCK_T_SIZE)) {
3143 lck = (kmp_user_lock_p)user_lock;
3144 }
3145 #endif
3146 else {
3147 lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3148 }
3149
3150 #if USE_ITT_BUILD
3151 __kmp_itt_lock_acquiring(lck);
3152 #endif /* USE_ITT_BUILD */
3153
3154 #if OMPT_SUPPORT && OMPT_OPTIONAL
3155 // This is the case, if called from omp_init_lock_with_hint:
3156 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3157 if (!codeptr)
3158 codeptr = OMPT_GET_RETURN_ADDRESS(0);
3159 if (ompt_enabled.enabled &&
3160 ompt_enabled.ompt_callback_mutex_acquire) {
3161 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3162 ompt_mutex_nest_lock, omp_lock_hint_none,
3163 __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
3164 codeptr);
3165 }
3166 #endif
3167
3168 rc = TEST_NESTED_LOCK(lck, gtid);
3169 #if USE_ITT_BUILD
3170 if (rc) {
3171 __kmp_itt_lock_acquired(lck);
3172 } else {
3173 __kmp_itt_lock_cancelled(lck);
3174 }
3175 #endif /* USE_ITT_BUILD */
3176 #if OMPT_SUPPORT && OMPT_OPTIONAL
3177 if (ompt_enabled.enabled && rc) {
3178 if (rc == 1) {
3179 if (ompt_enabled.ompt_callback_mutex_acquired) {
3180 // lock_first
3181 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3182 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3183 }
3184 } else {
3185 if (ompt_enabled.ompt_callback_nest_lock) {
3186 // lock_next
3187 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3188 ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3189 }
3190 }
3191 }
3192 #endif
3193 return rc;
3194
3195 /* Can't use serial interval since not block structured */
3196
3197 #endif // KMP_USE_DYNAMIC_LOCK
3198 }
3199
3200 // Interface to fast scalable reduce methods routines
3201
3202 // keep the selected method in a thread local structure for cross-function
3203 // usage: will be used in __kmpc_end_reduce* functions;
3204 // another solution: to re-determine the method one more time in
3205 // __kmpc_end_reduce* functions (new prototype required then)
3206 // AT: which solution is better?
3207 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod) \
3208 ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3209
3210 #define __KMP_GET_REDUCTION_METHOD(gtid) \
3211 (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3212
3213 // description of the packed_reduction_method variable: look at the macros in
3214 // kmp.h
3215
3216 // used in a critical section reduce block
3217 static __forceinline void
3218 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3219 kmp_critical_name *crit) {
3220
3221 // this lock was visible to a customer and to the threading profile tool as a
3222 // serial overhead span (although it's used for an internal purpose only)
3223 // why was it visible in previous implementation?
3224 // should we keep it visible in new reduce block?
3225 kmp_user_lock_p lck;
3226
3227 #if KMP_USE_DYNAMIC_LOCK
3228
3229 kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3230 // Check if it is initialized.
3231 if (*lk == 0) {
3232 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3233 KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3234 KMP_GET_D_TAG(__kmp_user_lock_seq));
3235 } else {
3236 __kmp_init_indirect_csptr(crit, loc, global_tid,
3237 KMP_GET_I_TAG(__kmp_user_lock_seq));
3238 }
3239 }
3240 // Branch for accessing the actual lock object and set operation. This
3241 // branching is inevitable since this lock initialization does not follow the
3242 // normal dispatch path (lock table is not used).
3243 if (KMP_EXTRACT_D_TAG(lk) != 0) {
3244 lck = (kmp_user_lock_p)lk;
3245 KMP_DEBUG_ASSERT(lck != NULL);
3246 if (__kmp_env_consistency_check) {
3247 __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3248 }
3249 KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3250 } else {
3251 kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3252 lck = ilk->lock;
3253 KMP_DEBUG_ASSERT(lck != NULL);
3254 if (__kmp_env_consistency_check) {
3255 __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3256 }
3257 KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3258 }
3259
3260 #else // KMP_USE_DYNAMIC_LOCK
3261
3262 // We know that the fast reduction code is only emitted by Intel compilers
3263 // with 32 byte critical sections. If there isn't enough space, then we
3264 // have to use a pointer.
3265 if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) { 3266 lck = (kmp_user_lock_p)crit; 3267 } else { 3268 lck = __kmp_get_critical_section_ptr(crit, loc, global_tid); 3269 } 3270 KMP_DEBUG_ASSERT(lck != NULL); 3271 3272 if (__kmp_env_consistency_check) 3273 __kmp_push_sync(global_tid, ct_critical, loc, lck); 3274 3275 __kmp_acquire_user_lock_with_checks(lck, global_tid); 3276 3277 #endif // KMP_USE_DYNAMIC_LOCK 3278 } 3279 3280 // used in a critical section reduce block 3281 static __forceinline void 3282 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid, 3283 kmp_critical_name *crit) { 3284 3285 kmp_user_lock_p lck; 3286 3287 #if KMP_USE_DYNAMIC_LOCK 3288 3289 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) { 3290 lck = (kmp_user_lock_p)crit; 3291 if (__kmp_env_consistency_check) 3292 __kmp_pop_sync(global_tid, ct_critical, loc); 3293 KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid); 3294 } else { 3295 kmp_indirect_lock_t *ilk = 3296 (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit)); 3297 if (__kmp_env_consistency_check) 3298 __kmp_pop_sync(global_tid, ct_critical, loc); 3299 KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid); 3300 } 3301 3302 #else // KMP_USE_DYNAMIC_LOCK 3303 3304 // We know that the fast reduction code is only emitted by Intel compilers 3305 // with 32 byte critical sections. If there isn't enough space, then we have 3306 // to use a pointer. 3307 if (__kmp_base_user_lock_size > 32) { 3308 lck = *((kmp_user_lock_p *)crit); 3309 KMP_ASSERT(lck != NULL); 3310 } else { 3311 lck = (kmp_user_lock_p)crit; 3312 } 3313 3314 if (__kmp_env_consistency_check) 3315 __kmp_pop_sync(global_tid, ct_critical, loc); 3316 3317 __kmp_release_user_lock_with_checks(lck, global_tid); 3318 3319 #endif // KMP_USE_DYNAMIC_LOCK 3320 } // __kmp_end_critical_section_reduce_block 3321 3322 static __forceinline int 3323 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p, 3324 int *task_state) { 3325 kmp_team_t *team; 3326 3327 // Check if we are inside the teams construct? 3328 if (th->th.th_teams_microtask) { 3329 *team_p = team = th->th.th_team; 3330 if (team->t.t_level == th->th.th_teams_level) { 3331 // This is reduction at teams construct. 3332 KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0 3333 // Let's swap teams temporarily for the reduction. 3334 th->th.th_info.ds.ds_tid = team->t.t_master_tid; 3335 th->th.th_team = team->t.t_parent; 3336 th->th.th_team_nproc = th->th.th_team->t.t_nproc; 3337 th->th.th_task_team = th->th.th_team->t.t_task_team[0]; 3338 *task_state = th->th.th_task_state; 3339 th->th.th_task_state = 0; 3340 3341 return 1; 3342 } 3343 } 3344 return 0; 3345 } 3346 3347 static __forceinline void 3348 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) { 3349 // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction. 3350 th->th.th_info.ds.ds_tid = 0; 3351 th->th.th_team = team; 3352 th->th.th_team_nproc = team->t.t_nproc; 3353 th->th.th_task_team = team->t.t_task_team[task_state]; 3354 th->th.th_task_state = task_state; 3355 } 3356 3357 /* 2.a.i. Reduce Block without a terminating barrier */ 3358 /*! 
3359 @ingroup SYNCHRONIZATION 3360 @param loc source location information 3361 @param global_tid global thread number 3362 @param num_vars number of items (variables) to be reduced 3363 @param reduce_size size of data in bytes to be reduced 3364 @param reduce_data pointer to data to be reduced 3365 @param reduce_func callback function providing reduction operation on two 3366 operands and returning result of reduction in lhs_data 3367 @param lck pointer to the unique lock data structure 3368 @result 1 for the master thread, 0 for all other team threads, 2 for all team 3369 threads if atomic reduction needed 3370 3371 The nowait version is used for a reduce clause with the nowait argument. 3372 */ 3373 kmp_int32 3374 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, 3375 size_t reduce_size, void *reduce_data, 3376 void (*reduce_func)(void *lhs_data, void *rhs_data), 3377 kmp_critical_name *lck) { 3378 3379 KMP_COUNT_BLOCK(REDUCE_nowait); 3380 int retval = 0; 3381 PACKED_REDUCTION_METHOD_T packed_reduction_method; 3382 kmp_info_t *th; 3383 kmp_team_t *team; 3384 int teams_swapped = 0, task_state; 3385 KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid)); 3386 __kmp_assert_valid_gtid(global_tid); 3387 3388 // why do we need this initialization here at all? 3389 // Reduction clause can not be used as a stand-alone directive. 3390 3391 // do not call __kmp_serial_initialize(), it will be called by 3392 // __kmp_parallel_initialize() if needed 3393 // possible detection of false-positive race by the threadchecker ??? 3394 if (!TCR_4(__kmp_init_parallel)) 3395 __kmp_parallel_initialize(); 3396 3397 __kmp_resume_if_soft_paused(); 3398 3399 // check correctness of reduce block nesting 3400 #if KMP_USE_DYNAMIC_LOCK 3401 if (__kmp_env_consistency_check) 3402 __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0); 3403 #else 3404 if (__kmp_env_consistency_check) 3405 __kmp_push_sync(global_tid, ct_reduce, loc, NULL); 3406 #endif 3407 3408 th = __kmp_thread_from_gtid(global_tid); 3409 teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state); 3410 3411 // packed_reduction_method value will be reused by __kmp_end_reduce* function, 3412 // the value should be kept in a variable 3413 // the variable should be either a construct-specific or thread-specific 3414 // property, not a team specific property 3415 // (a thread can reach the next reduce block on the next construct, reduce 3416 // method may differ on the next construct) 3417 // an ident_t "loc" parameter could be used as a construct-specific property 3418 // (what if loc == 0?) 
  // (if both construct-specific and team-specific variables were shared,
  // then unnecessary extra syncs would be needed)
  // a thread-specific variable is better regarding two issues above (next
  // construct and extra syncs)
  // a thread-specific "th_local.reduction_method" variable is used currently
  // each thread executes 'determine' and 'set' lines (no need to execute by one
  // thread, to avoid unnecessary extra syncs)

  packed_reduction_method = __kmp_determine_reduction_method(
      loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
  __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);

  OMPT_REDUCTION_DECL(th, global_tid);
  if (packed_reduction_method == critical_reduce_block) {

    OMPT_REDUCTION_BEGIN;

    __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
    retval = 1;

  } else if (packed_reduction_method == empty_reduce_block) {

    OMPT_REDUCTION_BEGIN;

    // usage: if team size == 1, no synchronization is required (Intel
    // platforms only)
    retval = 1;

  } else if (packed_reduction_method == atomic_reduce_block) {

    retval = 2;

    // all threads should do this pop here (because __kmpc_end_reduce_nowait()
    // won't be called by the code gen)
    // (not ideal: the checking block has already been closed by this 'pop',
    // but the atomic operation has not been executed yet; it will be executed
    // slightly later, literally on the next instruction)
    if (__kmp_env_consistency_check)
      __kmp_pop_sync(global_tid, ct_reduce, loc);

  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                   tree_reduce_block)) {

    // AT: performance issue: a real barrier here
    // AT: (if the master is slow, other threads are blocked here waiting for
    // the master to come and release them)
    // AT: (it's not what a customer might expect when specifying the NOWAIT
    // clause)
    // AT: (specifying NOWAIT won't improve performance here, which may be
    // confusing to a customer)
    // AT: another implementation of *barrier_gather*nowait() (or some other
    // design) might go faster and be more in line with the sense of NOWAIT
    // AT: TO DO: do epcc test and compare times

    // this barrier should be invisible to a customer and to the threading
    // profile tool (it's neither a terminating barrier nor customer's code;
    // it's used for an internal purpose)
#if OMPT_SUPPORT
    // JP: can this barrier potentially lead to task scheduling?
    // JP: as long as there is a barrier in the implementation, OMPT should and
    // will provide the barrier events, so we set up the necessary frame/return
    // addresses.
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame.ptr == NULL)
        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    retval =
        __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
                      global_tid, FALSE, reduce_size, reduce_data, reduce_func);
    retval = (retval != 0) ?
        (0) : (1);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = ompt_data_none;
    }
#endif

    // all workers except the master should do this pop here
    // (none of the other workers will get to __kmpc_end_reduce_nowait())
    if (__kmp_env_consistency_check) {
      if (retval == 0) {
        __kmp_pop_sync(global_tid, ct_reduce, loc);
      }
    }

  } else {

    // should never reach this block
    KMP_ASSERT(0); // "unexpected method"
  }
  if (teams_swapped) {
    __kmp_restore_swapped_teams(th, team, task_state);
  }
  KA_TRACE(
      10,
      ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
       global_tid, packed_reduction_method, retval));

  return retval;
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid global thread id.
@param lck pointer to the unique lock data structure

Finish the execution of a reduce nowait.
*/
void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
                              kmp_critical_name *lck) {

  PACKED_REDUCTION_METHOD_T packed_reduction_method;

  KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
  __kmp_assert_valid_gtid(global_tid);

  packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);

  OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);

  if (packed_reduction_method == critical_reduce_block) {

    __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
    OMPT_REDUCTION_END;

  } else if (packed_reduction_method == empty_reduce_block) {

    // usage: if team size == 1, no synchronization is required (Intel
    // platforms only)

    OMPT_REDUCTION_END;

  } else if (packed_reduction_method == atomic_reduce_block) {

    // neither the master nor the other workers should get here
    // (the code generator does not emit this call in case 2: atomic reduce
    // block); it would be better to remove this branch entirely, after which
    // the value would be caught by the 'else' below and trigger the assert

  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                   tree_reduce_block)) {

    // only the master gets here
    // OMPT: tree reduction is annotated in the barrier code

  } else {

    // should never reach this block
    KMP_ASSERT(0); // "unexpected method"
  }

  if (__kmp_env_consistency_check)
    __kmp_pop_sync(global_tid, ct_reduce, loc);

  KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
                global_tid, packed_reduction_method));

  return;
}

/* 2.a.ii. Reduce Block with a terminating barrier */

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid global thread number
@param num_vars number of items (variables) to be reduced
@param reduce_size size of data in bytes to be reduced
@param reduce_data pointer to data to be reduced
@param reduce_func callback function providing reduction operation on two
operands and returning result of reduction in lhs_data
@param lck pointer to the unique lock data structure
@result 1 for the master thread, 0 for all other team threads, 2 for all team
threads if atomic reduction needed

A blocking reduce that includes an implicit barrier.
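For illustration, a hedged sketch of how generated code might consume the
documented return values (the names local_sum, shared_sum, add_doubles and
crit_name are placeholders, not part of this interface, and the exact lowering
is up to the compiler):

@code
kmp_int32 ret = __kmpc_reduce(loc, gtid, 1, sizeof(local_sum), &local_sum,
                              add_doubles, &crit_name);
if (ret == 1) {
  shared_sum += local_sum;                  // combine this thread's part
  __kmpc_end_reduce(loc, gtid, &crit_name); // must pass the same lck
} else if (ret == 2) {
#pragma omp atomic
  shared_sum += local_sum;                  // combine atomically instead
  __kmpc_end_reduce(loc, gtid, &crit_name);
}
// ret == 0: nothing to combine here; this thread's contribution was already
// gathered by the runtime
@endcode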
3602 */ 3603 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, 3604 size_t reduce_size, void *reduce_data, 3605 void (*reduce_func)(void *lhs_data, void *rhs_data), 3606 kmp_critical_name *lck) { 3607 KMP_COUNT_BLOCK(REDUCE_wait); 3608 int retval = 0; 3609 PACKED_REDUCTION_METHOD_T packed_reduction_method; 3610 kmp_info_t *th; 3611 kmp_team_t *team; 3612 int teams_swapped = 0, task_state; 3613 3614 KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid)); 3615 __kmp_assert_valid_gtid(global_tid); 3616 3617 // why do we need this initialization here at all? 3618 // Reduction clause can not be a stand-alone directive. 3619 3620 // do not call __kmp_serial_initialize(), it will be called by 3621 // __kmp_parallel_initialize() if needed 3622 // possible detection of false-positive race by the threadchecker ??? 3623 if (!TCR_4(__kmp_init_parallel)) 3624 __kmp_parallel_initialize(); 3625 3626 __kmp_resume_if_soft_paused(); 3627 3628 // check correctness of reduce block nesting 3629 #if KMP_USE_DYNAMIC_LOCK 3630 if (__kmp_env_consistency_check) 3631 __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0); 3632 #else 3633 if (__kmp_env_consistency_check) 3634 __kmp_push_sync(global_tid, ct_reduce, loc, NULL); 3635 #endif 3636 3637 th = __kmp_thread_from_gtid(global_tid); 3638 teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state); 3639 3640 packed_reduction_method = __kmp_determine_reduction_method( 3641 loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck); 3642 __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method); 3643 3644 OMPT_REDUCTION_DECL(th, global_tid); 3645 3646 if (packed_reduction_method == critical_reduce_block) { 3647 3648 OMPT_REDUCTION_BEGIN; 3649 __kmp_enter_critical_section_reduce_block(loc, global_tid, lck); 3650 retval = 1; 3651 3652 } else if (packed_reduction_method == empty_reduce_block) { 3653 3654 OMPT_REDUCTION_BEGIN; 3655 // usage: if team size == 1, no synchronization is required ( Intel 3656 // platforms only ) 3657 retval = 1; 3658 3659 } else if (packed_reduction_method == atomic_reduce_block) { 3660 3661 retval = 2; 3662 3663 } else if (TEST_REDUCTION_METHOD(packed_reduction_method, 3664 tree_reduce_block)) { 3665 3666 // case tree_reduce_block: 3667 // this barrier should be visible to a customer and to the threading profile 3668 // tool (it's a terminating barrier on constructs if NOWAIT not specified) 3669 #if OMPT_SUPPORT 3670 ompt_frame_t *ompt_frame; 3671 if (ompt_enabled.enabled) { 3672 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); 3673 if (ompt_frame->enter_frame.ptr == NULL) 3674 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 3675 OMPT_STORE_RETURN_ADDRESS(global_tid); 3676 } 3677 #endif 3678 #if USE_ITT_NOTIFY 3679 __kmp_threads[global_tid]->th.th_ident = 3680 loc; // needed for correct notification of frames 3681 #endif 3682 retval = 3683 __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method), 3684 global_tid, TRUE, reduce_size, reduce_data, reduce_func); 3685 retval = (retval != 0) ? 
(0) : (1); 3686 #if OMPT_SUPPORT && OMPT_OPTIONAL 3687 if (ompt_enabled.enabled) { 3688 ompt_frame->enter_frame = ompt_data_none; 3689 } 3690 #endif 3691 3692 // all other workers except master should do this pop here 3693 // ( none of other workers except master will enter __kmpc_end_reduce() ) 3694 if (__kmp_env_consistency_check) { 3695 if (retval == 0) { // 0: all other workers; 1: master 3696 __kmp_pop_sync(global_tid, ct_reduce, loc); 3697 } 3698 } 3699 3700 } else { 3701 3702 // should never reach this block 3703 KMP_ASSERT(0); // "unexpected method" 3704 } 3705 if (teams_swapped) { 3706 __kmp_restore_swapped_teams(th, team, task_state); 3707 } 3708 3709 KA_TRACE(10, 3710 ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n", 3711 global_tid, packed_reduction_method, retval)); 3712 return retval; 3713 } 3714 3715 /*! 3716 @ingroup SYNCHRONIZATION 3717 @param loc source location information 3718 @param global_tid global thread id. 3719 @param lck pointer to the unique lock data structure 3720 3721 Finish the execution of a blocking reduce. 3722 The <tt>lck</tt> pointer must be the same as that used in the corresponding 3723 start function. 3724 */ 3725 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid, 3726 kmp_critical_name *lck) { 3727 3728 PACKED_REDUCTION_METHOD_T packed_reduction_method; 3729 kmp_info_t *th; 3730 kmp_team_t *team; 3731 int teams_swapped = 0, task_state; 3732 3733 KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid)); 3734 __kmp_assert_valid_gtid(global_tid); 3735 3736 th = __kmp_thread_from_gtid(global_tid); 3737 teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state); 3738 3739 packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid); 3740 3741 // this barrier should be visible to a customer and to the threading profile 3742 // tool (it's a terminating barrier on constructs if NOWAIT not specified) 3743 OMPT_REDUCTION_DECL(th, global_tid); 3744 3745 if (packed_reduction_method == critical_reduce_block) { 3746 __kmp_end_critical_section_reduce_block(loc, global_tid, lck); 3747 3748 OMPT_REDUCTION_END; 3749 3750 // TODO: implicit barrier: should be exposed 3751 #if OMPT_SUPPORT 3752 ompt_frame_t *ompt_frame; 3753 if (ompt_enabled.enabled) { 3754 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); 3755 if (ompt_frame->enter_frame.ptr == NULL) 3756 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 3757 OMPT_STORE_RETURN_ADDRESS(global_tid); 3758 } 3759 #endif 3760 #if USE_ITT_NOTIFY 3761 __kmp_threads[global_tid]->th.th_ident = loc; 3762 #endif 3763 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL); 3764 #if OMPT_SUPPORT && OMPT_OPTIONAL 3765 if (ompt_enabled.enabled) { 3766 ompt_frame->enter_frame = ompt_data_none; 3767 } 3768 #endif 3769 3770 } else if (packed_reduction_method == empty_reduce_block) { 3771 3772 OMPT_REDUCTION_END; 3773 3774 // usage: if team size==1, no synchronization is required (Intel platforms only) 3775 3776 // TODO: implicit barrier: should be exposed 3777 #if OMPT_SUPPORT 3778 ompt_frame_t *ompt_frame; 3779 if (ompt_enabled.enabled) { 3780 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); 3781 if (ompt_frame->enter_frame.ptr == NULL) 3782 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 3783 OMPT_STORE_RETURN_ADDRESS(global_tid); 3784 } 3785 #endif 3786 #if USE_ITT_NOTIFY 3787 __kmp_threads[global_tid]->th.th_ident = loc; 3788 #endif 3789 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL); 
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = ompt_data_none;
    }
#endif

  } else if (packed_reduction_method == atomic_reduce_block) {

#if OMPT_SUPPORT
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame.ptr == NULL)
        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
    // TODO: implicit barrier: should be exposed
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = ompt_data_none;
    }
#endif

  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                   tree_reduce_block)) {

    // only the master executes here (the master releases all other workers)
    __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
                            global_tid);

  } else {

    // should never reach this block
    KMP_ASSERT(0); // "unexpected method"
  }
  if (teams_swapped) {
    __kmp_restore_swapped_teams(th, team, task_state);
  }

  if (__kmp_env_consistency_check)
    __kmp_pop_sync(global_tid, ct_reduce, loc);

  KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
                global_tid, packed_reduction_method));

  return;
}

#undef __KMP_GET_REDUCTION_METHOD
#undef __KMP_SET_REDUCTION_METHOD

/* end of interface to fast scalable reduce routines */

kmp_uint64 __kmpc_get_taskid() {

  kmp_int32 gtid;
  kmp_info_t *thread;

  gtid = __kmp_get_gtid();
  if (gtid < 0) {
    return 0;
  }
  thread = __kmp_thread_from_gtid(gtid);
  return thread->th.th_current_task->td_task_id;

} // __kmpc_get_taskid

kmp_uint64 __kmpc_get_parent_taskid() {

  kmp_int32 gtid;
  kmp_info_t *thread;
  kmp_taskdata_t *parent_task;

  gtid = __kmp_get_gtid();
  if (gtid < 0) {
    return 0;
  }
  thread = __kmp_thread_from_gtid(gtid);
  parent_task = thread->th.th_current_task->td_parent;
  return (parent_task == NULL ? 0 : parent_task->td_task_id);

} // __kmpc_get_parent_taskid

/*!
@ingroup WORK_SHARING
@param loc source location information.
@param gtid global thread number.
@param num_dims number of associated doacross loops.
@param dims info on the loop bounds.

Initialize doacross loop information.
The compiler is expected to send inclusive bounds,
e.g. for(i=2;i<9;i+=2) gives lo=2, up=8, st=2.
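For illustration, a hedged sketch of the full call sequence for a
one-dimensional doacross loop with a sink dependence on the previous
iteration (the lowering shown is an assumption about typical code
generation, not a requirement of this interface):

@code
// Conceptual source:
//   #pragma omp for ordered(1)
//   for (i = 2; i < 9; i += 2) { ... depend(sink: i-2) ... depend(source) ... }
struct kmp_dim dim;
dim.lo = 2; // inclusive lower bound
dim.up = 8; // inclusive upper bound
dim.st = 2; // stride
__kmpc_doacross_init(loc, gtid, 1, &dim);
for (kmp_int64 i = 2; i <= 8; i += 2) { // each thread runs only its own chunk
  kmp_int64 sink[1] = {i - 2};
  __kmpc_doacross_wait(loc, gtid, sink); // wait until iteration i-2 is posted
  // ... loop body ...
  kmp_int64 src[1] = {i};
  __kmpc_doacross_post(loc, gtid, src);  // mark iteration i as completed
}
__kmpc_doacross_fini(loc, gtid);
@endcode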
3888 */ 3889 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims, 3890 const struct kmp_dim *dims) { 3891 __kmp_assert_valid_gtid(gtid); 3892 int j, idx; 3893 kmp_int64 last, trace_count; 3894 kmp_info_t *th = __kmp_threads[gtid]; 3895 kmp_team_t *team = th->th.th_team; 3896 kmp_uint32 *flags; 3897 kmp_disp_t *pr_buf = th->th.th_dispatch; 3898 dispatch_shared_info_t *sh_buf; 3899 3900 KA_TRACE( 3901 20, 3902 ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n", 3903 gtid, num_dims, !team->t.t_serialized)); 3904 KMP_DEBUG_ASSERT(dims != NULL); 3905 KMP_DEBUG_ASSERT(num_dims > 0); 3906 3907 if (team->t.t_serialized) { 3908 KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n")); 3909 return; // no dependencies if team is serialized 3910 } 3911 KMP_DEBUG_ASSERT(team->t.t_nproc > 1); 3912 idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for 3913 // the next loop 3914 sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers]; 3915 3916 // Save bounds info into allocated private buffer 3917 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL); 3918 pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc( 3919 th, sizeof(kmp_int64) * (4 * num_dims + 1)); 3920 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL); 3921 pr_buf->th_doacross_info[0] = 3922 (kmp_int64)num_dims; // first element is number of dimensions 3923 // Save also address of num_done in order to access it later without knowing 3924 // the buffer index 3925 pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done; 3926 pr_buf->th_doacross_info[2] = dims[0].lo; 3927 pr_buf->th_doacross_info[3] = dims[0].up; 3928 pr_buf->th_doacross_info[4] = dims[0].st; 3929 last = 5; 3930 for (j = 1; j < num_dims; ++j) { 3931 kmp_int64 3932 range_length; // To keep ranges of all dimensions but the first dims[0] 3933 if (dims[j].st == 1) { // most common case 3934 // AC: should we care of ranges bigger than LLONG_MAX? (not for now) 3935 range_length = dims[j].up - dims[j].lo + 1; 3936 } else { 3937 if (dims[j].st > 0) { 3938 KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo); 3939 range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1; 3940 } else { // negative increment 3941 KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up); 3942 range_length = 3943 (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1; 3944 } 3945 } 3946 pr_buf->th_doacross_info[last++] = range_length; 3947 pr_buf->th_doacross_info[last++] = dims[j].lo; 3948 pr_buf->th_doacross_info[last++] = dims[j].up; 3949 pr_buf->th_doacross_info[last++] = dims[j].st; 3950 } 3951 3952 // Compute total trip count. 3953 // Start with range of dims[0] which we don't need to keep in the buffer. 
3954 if (dims[0].st == 1) { // most common case 3955 trace_count = dims[0].up - dims[0].lo + 1; 3956 } else if (dims[0].st > 0) { 3957 KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo); 3958 trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1; 3959 } else { // negative increment 3960 KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up); 3961 trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1; 3962 } 3963 for (j = 1; j < num_dims; ++j) { 3964 trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges 3965 } 3966 KMP_DEBUG_ASSERT(trace_count > 0); 3967 3968 // Check if shared buffer is not occupied by other loop (idx - 3969 // __kmp_dispatch_num_buffers) 3970 if (idx != sh_buf->doacross_buf_idx) { 3971 // Shared buffer is occupied, wait for it to be free 3972 __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx, 3973 __kmp_eq_4, NULL); 3974 } 3975 #if KMP_32_BIT_ARCH 3976 // Check if we are the first thread. After the CAS the first thread gets 0, 3977 // others get 1 if initialization is in progress, allocated pointer otherwise. 3978 // Treat pointer as volatile integer (value 0 or 1) until memory is allocated. 3979 flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32( 3980 (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1); 3981 #else 3982 flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64( 3983 (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL); 3984 #endif 3985 if (flags == NULL) { 3986 // we are the first thread, allocate the array of flags 3987 size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration 3988 flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1); 3989 KMP_MB(); 3990 sh_buf->doacross_flags = flags; 3991 } else if (flags == (kmp_uint32 *)1) { 3992 #if KMP_32_BIT_ARCH 3993 // initialization is still in progress, need to wait 3994 while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1) 3995 #else 3996 while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL) 3997 #endif 3998 KMP_YIELD(TRUE); 3999 KMP_MB(); 4000 } else { 4001 KMP_MB(); 4002 } 4003 KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value 4004 pr_buf->th_doacross_flags = 4005 sh_buf->doacross_flags; // save private copy in order to not 4006 // touch shared buffer on each iteration 4007 KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid)); 4008 } 4009 4010 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) { 4011 __kmp_assert_valid_gtid(gtid); 4012 kmp_int32 shft, num_dims, i; 4013 kmp_uint32 flag; 4014 kmp_int64 iter_number; // iteration number of "collapsed" loop nest 4015 kmp_info_t *th = __kmp_threads[gtid]; 4016 kmp_team_t *team = th->th.th_team; 4017 kmp_disp_t *pr_buf; 4018 kmp_int64 lo, up, st; 4019 4020 KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid)); 4021 if (team->t.t_serialized) { 4022 KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n")); 4023 return; // no dependencies if team is serialized 4024 } 4025 4026 // calculate sequential iteration number and check out-of-bounds condition 4027 pr_buf = th->th.th_dispatch; 4028 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL); 4029 num_dims = pr_buf->th_doacross_info[0]; 4030 lo = pr_buf->th_doacross_info[2]; 4031 up = pr_buf->th_doacross_info[3]; 4032 st = pr_buf->th_doacross_info[4]; 4033 #if OMPT_SUPPORT && OMPT_OPTIONAL 4034 ompt_dependence_t deps[num_dims]; 4035 #endif 4036 if (st == 1) { // most common case 4037 if (vec[0] < lo || vec[0] > up) { 4038 KA_TRACE(20, ("__kmpc_doacross_wait() 
exit: T#%d iter %lld is out of " 4039 "bounds [%lld,%lld]\n", 4040 gtid, vec[0], lo, up)); 4041 return; 4042 } 4043 iter_number = vec[0] - lo; 4044 } else if (st > 0) { 4045 if (vec[0] < lo || vec[0] > up) { 4046 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " 4047 "bounds [%lld,%lld]\n", 4048 gtid, vec[0], lo, up)); 4049 return; 4050 } 4051 iter_number = (kmp_uint64)(vec[0] - lo) / st; 4052 } else { // negative increment 4053 if (vec[0] > lo || vec[0] < up) { 4054 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " 4055 "bounds [%lld,%lld]\n", 4056 gtid, vec[0], lo, up)); 4057 return; 4058 } 4059 iter_number = (kmp_uint64)(lo - vec[0]) / (-st); 4060 } 4061 #if OMPT_SUPPORT && OMPT_OPTIONAL 4062 deps[0].variable.value = iter_number; 4063 deps[0].dependence_type = ompt_dependence_type_sink; 4064 #endif 4065 for (i = 1; i < num_dims; ++i) { 4066 kmp_int64 iter, ln; 4067 kmp_int32 j = i * 4; 4068 ln = pr_buf->th_doacross_info[j + 1]; 4069 lo = pr_buf->th_doacross_info[j + 2]; 4070 up = pr_buf->th_doacross_info[j + 3]; 4071 st = pr_buf->th_doacross_info[j + 4]; 4072 if (st == 1) { 4073 if (vec[i] < lo || vec[i] > up) { 4074 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " 4075 "bounds [%lld,%lld]\n", 4076 gtid, vec[i], lo, up)); 4077 return; 4078 } 4079 iter = vec[i] - lo; 4080 } else if (st > 0) { 4081 if (vec[i] < lo || vec[i] > up) { 4082 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " 4083 "bounds [%lld,%lld]\n", 4084 gtid, vec[i], lo, up)); 4085 return; 4086 } 4087 iter = (kmp_uint64)(vec[i] - lo) / st; 4088 } else { // st < 0 4089 if (vec[i] > lo || vec[i] < up) { 4090 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " 4091 "bounds [%lld,%lld]\n", 4092 gtid, vec[i], lo, up)); 4093 return; 4094 } 4095 iter = (kmp_uint64)(lo - vec[i]) / (-st); 4096 } 4097 iter_number = iter + ln * iter_number; 4098 #if OMPT_SUPPORT && OMPT_OPTIONAL 4099 deps[i].variable.value = iter; 4100 deps[i].dependence_type = ompt_dependence_type_sink; 4101 #endif 4102 } 4103 shft = iter_number % 32; // use 32-bit granularity 4104 iter_number >>= 5; // divided by 32 4105 flag = 1 << shft; 4106 while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) { 4107 KMP_YIELD(TRUE); 4108 } 4109 KMP_MB(); 4110 #if OMPT_SUPPORT && OMPT_OPTIONAL 4111 if (ompt_enabled.ompt_callback_dependences) { 4112 ompt_callbacks.ompt_callback(ompt_callback_dependences)( 4113 &(OMPT_CUR_TASK_INFO(th)->task_data), deps, num_dims); 4114 } 4115 #endif 4116 KA_TRACE(20, 4117 ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n", 4118 gtid, (iter_number << 5) + shft)); 4119 } 4120 4121 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) { 4122 __kmp_assert_valid_gtid(gtid); 4123 kmp_int32 shft, num_dims, i; 4124 kmp_uint32 flag; 4125 kmp_int64 iter_number; // iteration number of "collapsed" loop nest 4126 kmp_info_t *th = __kmp_threads[gtid]; 4127 kmp_team_t *team = th->th.th_team; 4128 kmp_disp_t *pr_buf; 4129 kmp_int64 lo, st; 4130 4131 KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid)); 4132 if (team->t.t_serialized) { 4133 KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n")); 4134 return; // no dependencies if team is serialized 4135 } 4136 4137 // calculate sequential iteration number (same as in "wait" but no 4138 // out-of-bounds checks) 4139 pr_buf = th->th.th_dispatch; 4140 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL); 4141 num_dims = pr_buf->th_doacross_info[0]; 
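  // Layout of th_doacross_info (filled in __kmpc_doacross_init()):
  //   [0] = num_dims, [1] = address of doacross_num_done in the shared buffer,
  //   [2..4] = lo/up/st of dimension 0, then four entries
  //   (range_length, lo, up, st) for every remaining dimension.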
4142 lo = pr_buf->th_doacross_info[2]; 4143 st = pr_buf->th_doacross_info[4]; 4144 #if OMPT_SUPPORT && OMPT_OPTIONAL 4145 ompt_dependence_t deps[num_dims]; 4146 #endif 4147 if (st == 1) { // most common case 4148 iter_number = vec[0] - lo; 4149 } else if (st > 0) { 4150 iter_number = (kmp_uint64)(vec[0] - lo) / st; 4151 } else { // negative increment 4152 iter_number = (kmp_uint64)(lo - vec[0]) / (-st); 4153 } 4154 #if OMPT_SUPPORT && OMPT_OPTIONAL 4155 deps[0].variable.value = iter_number; 4156 deps[0].dependence_type = ompt_dependence_type_source; 4157 #endif 4158 for (i = 1; i < num_dims; ++i) { 4159 kmp_int64 iter, ln; 4160 kmp_int32 j = i * 4; 4161 ln = pr_buf->th_doacross_info[j + 1]; 4162 lo = pr_buf->th_doacross_info[j + 2]; 4163 st = pr_buf->th_doacross_info[j + 4]; 4164 if (st == 1) { 4165 iter = vec[i] - lo; 4166 } else if (st > 0) { 4167 iter = (kmp_uint64)(vec[i] - lo) / st; 4168 } else { // st < 0 4169 iter = (kmp_uint64)(lo - vec[i]) / (-st); 4170 } 4171 iter_number = iter + ln * iter_number; 4172 #if OMPT_SUPPORT && OMPT_OPTIONAL 4173 deps[i].variable.value = iter; 4174 deps[i].dependence_type = ompt_dependence_type_source; 4175 #endif 4176 } 4177 #if OMPT_SUPPORT && OMPT_OPTIONAL 4178 if (ompt_enabled.ompt_callback_dependences) { 4179 ompt_callbacks.ompt_callback(ompt_callback_dependences)( 4180 &(OMPT_CUR_TASK_INFO(th)->task_data), deps, num_dims); 4181 } 4182 #endif 4183 shft = iter_number % 32; // use 32-bit granularity 4184 iter_number >>= 5; // divided by 32 4185 flag = 1 << shft; 4186 KMP_MB(); 4187 if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) 4188 KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag); 4189 KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid, 4190 (iter_number << 5) + shft)); 4191 } 4192 4193 void __kmpc_doacross_fini(ident_t *loc, int gtid) { 4194 __kmp_assert_valid_gtid(gtid); 4195 kmp_int32 num_done; 4196 kmp_info_t *th = __kmp_threads[gtid]; 4197 kmp_team_t *team = th->th.th_team; 4198 kmp_disp_t *pr_buf = th->th.th_dispatch; 4199 4200 KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid)); 4201 if (team->t.t_serialized) { 4202 KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team)); 4203 return; // nothing to do 4204 } 4205 num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1; 4206 if (num_done == th->th.th_team_nproc) { 4207 // we are the last thread, need to free shared resources 4208 int idx = pr_buf->th_doacross_buf_idx - 1; 4209 dispatch_shared_info_t *sh_buf = 4210 &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers]; 4211 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] == 4212 (kmp_int64)&sh_buf->doacross_num_done); 4213 KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done); 4214 KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx); 4215 __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags)); 4216 sh_buf->doacross_flags = NULL; 4217 sh_buf->doacross_num_done = 0; 4218 sh_buf->doacross_buf_idx += 4219 __kmp_dispatch_num_buffers; // free buffer for future re-use 4220 } 4221 // free private resources (need to keep buffer index forever) 4222 pr_buf->th_doacross_flags = NULL; 4223 __kmp_thread_free(th, (void *)pr_buf->th_doacross_info); 4224 pr_buf->th_doacross_info = NULL; 4225 KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid)); 4226 } 4227 4228 /* omp_alloc/omp_free only defined for C/C++, not for Fortran */ 4229 void *omp_alloc(size_t size, omp_allocator_handle_t allocator) { 4230 return 
__kmpc_alloc(__kmp_entry_gtid(), size, allocator); 4231 } 4232 4233 void omp_free(void *ptr, omp_allocator_handle_t allocator) { 4234 __kmpc_free(__kmp_entry_gtid(), ptr, allocator); 4235 } 4236 4237 int __kmpc_get_target_offload(void) { 4238 if (!__kmp_init_serial) { 4239 __kmp_serial_initialize(); 4240 } 4241 return __kmp_target_offload; 4242 } 4243 4244 int __kmpc_pause_resource(kmp_pause_status_t level) { 4245 if (!__kmp_init_serial) { 4246 return 1; // Can't pause if runtime is not initialized 4247 } 4248 return __kmp_pause_resource(level); 4249 } 4250
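/* A minimal usage sketch for the omp_alloc/omp_free entry points above
   (illustration only; omp_default_mem_alloc is one of the predefined
   allocator handles declared in omp.h):

     double *buf = (double *)omp_alloc(1024 * sizeof(double),
                                       omp_default_mem_alloc);
     if (buf != NULL) {
       // ... use buf ...
       omp_free(buf, omp_default_mem_alloc);
     }
*/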