/* kmp_csupport.cpp -- kfront linkage support for OpenMP. */

//===----------------------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//

#include "omp.h" /* extern "C" declarations of user-visible routines */
#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_lock.h"
#include "kmp_stats.h"

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

#define MAX_MESSAGE 512

// flags will be used in future, e.g. to implement openmp_strict library
// restrictions

/*!
 * @ingroup STARTUP_SHUTDOWN
 * @param loc   in   source location information
 * @param flags in   for future use (currently ignored)
 *
 * Initialize the runtime library. This call is optional; if it is not made then
 * it will be implicitly called by attempts to use other library functions.
 */
void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
  // By default __kmpc_begin() is no-op.
  char *env;
  if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
      __kmp_str_match_true(env)) {
    // KMP_INITIAL_THREAD_BIND is set to a "true" value: perform middle
    // initialization now so the initial thread is bound early.
    __kmp_middle_initialize();
    KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
  } else if (__kmp_ignore_mppbeg() == FALSE) {
    // By default __kmp_ignore_mppbeg() returns TRUE.
    __kmp_internal_begin();
    KC_TRACE(10, ("__kmpc_begin: called\n"));
  }
}

/*!
 * @ingroup STARTUP_SHUTDOWN
 * @param loc source location information
 *
 * Shutdown the runtime library. This is also optional, and even if called will
 * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
 * zero.
 */
void __kmpc_end(ident_t *loc) {
  // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
  // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
  // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
  // returns FALSE and __kmpc_end() will unregister this root (it can cause
  // library shut down).
  if (__kmp_ignore_mppend() == FALSE) {
    KC_TRACE(10, ("__kmpc_end: called\n"));
    KA_TRACE(30, ("__kmpc_end\n"));

    __kmp_internal_end_thread(-1);
  }
}

/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return The global thread index of the active thread.

This function can be called in any context.

If the runtime has only been entered at the outermost level from a
single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
that which would be returned by omp_get_thread_num() in the outermost
active parallel construct. (Or zero if there is no active parallel
construct, since the master thread is necessarily thread zero).

If multiple non-OpenMP threads all enter an OpenMP construct then this
will be a unique thread identifier among all the threads created by
the OpenMP runtime (but the value cannot be defined in terms of
OpenMP thread ids returned by omp_get_thread_num()).
*/
kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
  kmp_int32 gtid = __kmp_entry_gtid();

  KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));

  return gtid;
}

/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return The number of threads under control of the OpenMP<sup>*</sup> runtime

This function can be called in any context.
It returns the total number of threads under the control of the OpenMP runtime.
108 That is not a number that can be determined by any OpenMP standard calls, since 109 the library may be called from more than one non-OpenMP thread, and this 110 reflects the total over all such calls. Similarly the runtime maintains 111 underlying threads even when they are not active (since the cost of creating 112 and destroying OS threads is high), this call counts all such threads even if 113 they are not waiting for work. 114 */ 115 kmp_int32 __kmpc_global_num_threads(ident_t *loc) { 116 KC_TRACE(10, 117 ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth)); 118 119 return TCR_4(__kmp_all_nth); 120 } 121 122 /*! 123 @ingroup THREAD_STATES 124 @param loc Source location information. 125 @return The thread number of the calling thread in the innermost active parallel 126 construct. 127 */ 128 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) { 129 KC_TRACE(10, ("__kmpc_bound_thread_num: called\n")); 130 return __kmp_tid_from_gtid(__kmp_entry_gtid()); 131 } 132 133 /*! 134 @ingroup THREAD_STATES 135 @param loc Source location information. 136 @return The number of threads in the innermost active parallel construct. 137 */ 138 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) { 139 KC_TRACE(10, ("__kmpc_bound_num_threads: called\n")); 140 141 return __kmp_entry_thread()->th.th_team->t.t_nproc; 142 } 143 144 /*! 145 * @ingroup DEPRECATED 146 * @param loc location description 147 * 148 * This function need not be called. It always returns TRUE. 
149 */ 150 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) { 151 #ifndef KMP_DEBUG 152 153 return TRUE; 154 155 #else 156 157 const char *semi2; 158 const char *semi3; 159 int line_no; 160 161 if (__kmp_par_range == 0) { 162 return TRUE; 163 } 164 semi2 = loc->psource; 165 if (semi2 == NULL) { 166 return TRUE; 167 } 168 semi2 = strchr(semi2, ';'); 169 if (semi2 == NULL) { 170 return TRUE; 171 } 172 semi2 = strchr(semi2 + 1, ';'); 173 if (semi2 == NULL) { 174 return TRUE; 175 } 176 if (__kmp_par_range_filename[0]) { 177 const char *name = semi2 - 1; 178 while ((name > loc->psource) && (*name != '/') && (*name != ';')) { 179 name--; 180 } 181 if ((*name == '/') || (*name == ';')) { 182 name++; 183 } 184 if (strncmp(__kmp_par_range_filename, name, semi2 - name)) { 185 return __kmp_par_range < 0; 186 } 187 } 188 semi3 = strchr(semi2 + 1, ';'); 189 if (__kmp_par_range_routine[0]) { 190 if ((semi3 != NULL) && (semi3 > semi2) && 191 (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) { 192 return __kmp_par_range < 0; 193 } 194 } 195 if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) { 196 if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) { 197 return __kmp_par_range > 0; 198 } 199 return __kmp_par_range < 0; 200 } 201 return TRUE; 202 203 #endif /* KMP_DEBUG */ 204 } 205 206 /*! 207 @ingroup THREAD_STATES 208 @param loc Source location information. 209 @return 1 if this thread is executing inside an active parallel region, zero if 210 not. 211 */ 212 kmp_int32 __kmpc_in_parallel(ident_t *loc) { 213 return __kmp_entry_thread()->th.th_root->r.r_active; 214 } 215 216 /*! 217 @ingroup PARALLEL 218 @param loc source location information 219 @param global_tid global thread number 220 @param num_threads number of threads requested for this parallel construct 221 222 Set the number of threads to be used by the next fork spawned by this thread. 223 This call is only required if the parallel construct has a `num_threads` clause. 
*/
void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
                             kmp_int32 num_threads) {
  KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
                global_tid, num_threads));

  __kmp_push_num_threads(loc, global_tid, num_threads);
}

// No-op: the value pushed by __kmpc_push_num_threads is consumed by the next
// fork automatically, so there is nothing to pop here.
void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
  KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));

  /* the num_threads are automatically popped */
}

#if OMP_40_ENABLED

// Record the proc_bind policy requested by a `proc_bind` clause for the next
// parallel region forked by this thread.
void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
                           kmp_int32 proc_bind) {
  KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
                proc_bind));

  __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
}

#endif /* OMP_40_ENABLED */

/*!
@ingroup PARALLEL
@param loc source location information
@param argc total number of arguments in the ellipsis
@param microtask pointer to callback routine consisting of outlined parallel
construct
@param ... pointers to shared variables that aren't global

Do the actual fork and call the microtask in the relevant number of threads.
*/
void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
  int gtid = __kmp_entry_gtid();

#if (KMP_STATS_ENABLED)
  // If we were in a serial region, then stop the serial timer, record
  // the event, and start parallel region timer
  stats_state_e previous_state = KMP_GET_THREAD_STATE();
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
  } else {
    KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
  }
  int inParallel = __kmpc_in_parallel(loc);
  if (inParallel) {
    KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
  } else {
    KMP_COUNT_BLOCK(OMP_PARALLEL);
  }
#endif

  // maybe to save thr_state is enough here
  {
    va_list ap;
    va_start(ap, microtask);

#if OMPT_SUPPORT
    omp_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      // Publish the master's enter frame so a tool can unwind across the
      // runtime: use the serialized team's task frame if one exists,
      // otherwise the implicit task's frame in the parent team.
      kmp_info_t *master_th = __kmp_threads[gtid];
      kmp_team_t *parent_team = master_th->th.th_team;
      ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
      if (lwt)
        ompt_frame = &(lwt->ompt_task_info.frame);
      else {
        int tid = __kmp_tid_from_gtid(gtid);
        ompt_frame = &(
            parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
      }
      ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
      OMPT_STORE_RETURN_ADDRESS(gtid);
    }
#endif

#if INCLUDE_SSC_MARKS
    SSC_MARK_FORKING();
#endif
    __kmp_fork_call(loc, gtid, fork_context_intel, argc,
                    VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
                    VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
/* TODO: revert workaround for Intel(R) 64 tracker #96 */
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
                    &ap
#else
                    ap
#endif
                    );
#if INCLUDE_SSC_MARKS
    SSC_MARK_JOINING();
#endif
    __kmp_join_call(loc, gtid
#if OMPT_SUPPORT
                    ,
                    fork_context_intel
#endif
                    );

    va_end(ap);
  }

#if KMP_STATS_ENABLED
  // Restore the stats state that was in effect before the region.
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
  } else {
    KMP_POP_PARTITIONED_TIMER();
  }
#endif // KMP_STATS_ENABLED
}

#if OMP_40_ENABLED
/*!
@ingroup PARALLEL
@param loc source location information
@param global_tid global thread number
@param num_teams number of teams requested for the teams construct
@param num_threads number of threads per team requested for the teams construct

Set the number of teams to be used by the teams construct.
This call is only required if the teams construct has a `num_teams` clause
or a `thread_limit` clause (or both).
*/
void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
                           kmp_int32 num_teams, kmp_int32 num_threads) {
  KA_TRACE(20,
           ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
            global_tid, num_teams, num_threads));

  __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
}

/*!
@ingroup PARALLEL
@param loc source location information
@param argc total number of arguments in the ellipsis
@param microtask pointer to callback routine consisting of outlined teams
construct
@param ... pointers to shared variables that aren't global

Do the actual fork and call the microtask in the relevant number of threads.
*/
void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
                       ...)
{
  int gtid = __kmp_entry_gtid();
  kmp_info_t *this_thr = __kmp_threads[gtid];
  va_list ap;
  va_start(ap, microtask);

  KMP_COUNT_BLOCK(OMP_TEAMS);

  // remember teams entry point and nesting level
  this_thr->th.th_teams_microtask = microtask;
  this_thr->th.th_teams_level =
      this_thr->th.th_team->t.t_level; // AC: can be >0 on host

#if OMPT_SUPPORT
  kmp_team_t *parent_team = this_thr->th.th_team;
  int tid = __kmp_tid_from_gtid(gtid);
  if (ompt_enabled.enabled) {
    parent_team->t.t_implicit_task_taskdata[tid]
        .ompt_task_info.frame.enter_frame = OMPT_GET_FRAME_ADDRESS(1);
  }
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif

  // check if __kmpc_push_num_teams called, set default number of teams
  // otherwise
  if (this_thr->th.th_teams_size.nteams == 0) {
    __kmp_push_num_teams(loc, gtid, 0, 0);
  }
  KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);

  __kmp_fork_call(loc, gtid, fork_context_intel, argc,
                  VOLATILE_CAST(microtask_t)
                      __kmp_teams_master, // "wrapped" task
                  VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
                  &ap
#else
                  ap
#endif
                  );
  __kmp_join_call(loc, gtid
#if OMPT_SUPPORT
                  ,
                  fork_context_intel
#endif
                  );

  // Clear the teams context: the single 64-bit store zeroes both fields of
  // th_teams_size (nteams and nth) at once.
  this_thr->th.th_teams_microtask = NULL;
  this_thr->th.th_teams_level = 0;
  *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
  va_end(ap);
}
#endif /* OMP_40_ENABLED */

// I don't think this function should ever have been exported.
// The __kmpc_ prefix was misapplied. I'm fairly certain that no generated
// openmp code ever called it, but it's been exported from the RTL for so
// long that I'm afraid to remove the definition.
int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }

/*!
@ingroup PARALLEL
@param loc source location information
@param global_tid global thread number

Enter a serialized parallel construct. This interface is used to handle a
conditional parallel region, like this,
@code
#pragma omp parallel if (condition)
@endcode
when the condition is false.
*/
void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  // The implementation is now in kmp_runtime.cpp so that it can share static
  // functions with kmp_fork_call since the tasks to be done are similar in
  // each case.
#if OMPT_SUPPORT
  OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif
  __kmp_serialized_parallel(loc, global_tid);
}

/*!
@ingroup PARALLEL
@param loc source location information
@param global_tid global thread number

Leave a serialized parallel construct.
*/
void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  kmp_internal_control_t *top;
  kmp_info_t *this_thr;
  kmp_team_t *serial_team;

  KC_TRACE(10,
           ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));

  /* skip all this code for autopar serialized loops since it results in
     unacceptable overhead */
  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
    return;

  // Not autopar code
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  this_thr = __kmp_threads[global_tid];
  serial_team = this_thr->th.th_serial_team;

#if OMP_45_ENABLED
  kmp_task_team_t *task_team = this_thr->th.th_task_team;

  // we need to wait for the proxy tasks before finishing the thread
  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
    __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
#endif

  KMP_MB();
  KMP_DEBUG_ASSERT(serial_team);
  KMP_ASSERT(serial_team->t.t_serialized);
  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
  KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);

#if OMPT_SUPPORT
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != omp_state_overhead) {
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = NULL;
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
          OMPT_CUR_TASK_INFO(this_thr)->thread_num);
    }

    // reset clear the task id only after unlinking the task
    ompt_data_t *parent_task_data;
    __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);

    if (ompt_enabled.ompt_callback_parallel_end) {
      ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
          &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
          ompt_invoker_program, OMPT_LOAD_RETURN_ADDRESS(global_tid));
    }
    __ompt_lw_taskteam_unlink(this_thr);
    this_thr->th.ompt_thread_info.state = omp_state_overhead;
  }
#endif

  /* If necessary, pop the internal control stack values and replace the team
   * values */
  top = serial_team->t.t_control_stack_top;
  if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
    copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
    serial_team->t.t_control_stack_top = top->next;
    __kmp_free(top);
  }

  // if( serial_team -> t.t_serialized > 1 )
  serial_team->t.t_level--;

  /* pop dispatch buffers stack */
  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
  {
    dispatch_private_info_t *disp_buffer =
        serial_team->t.t_dispatch->th_disp_buffer;
    serial_team->t.t_dispatch->th_disp_buffer =
        serial_team->t.t_dispatch->th_disp_buffer->next;
    __kmp_free(disp_buffer);
  }

  --serial_team->t.t_serialized;
  if (serial_team->t.t_serialized == 0) {

    /* return to the parallel section */

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
    // Restore the FP control state saved when the region was serialized.
    if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
      __kmp_clear_x87_fpu_status_word();
      __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
      __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
    }
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

    this_thr->th.th_team = serial_team->t.t_parent;
    this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;

    /* restore values cached in the thread */
    this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /* JPH */
    this_thr->th.th_team_master =
        serial_team->t.t_parent->t.t_threads[0]; /* JPH */
    this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;

    /* TODO the below shouldn't need to be adjusted for serialized teams */
    this_thr->th.th_dispatch =
        &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];

    __kmp_pop_current_task_from_thread(this_thr);

    KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
    this_thr->th.th_current_task->td_flags.executing = 1;

    if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Copy the task team from the new child / old parent team to the thread.
      this_thr->th.th_task_team =
          this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
      KA_TRACE(20,
               ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
                "team %p\n",
                global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
    }
  } else {
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
                    "depth of serial team %p to %d\n",
                    global_tid, serial_team, serial_team->t.t_serialized));
    }
  }

  if (__kmp_env_consistency_check)
    __kmp_pop_parallel(global_tid, NULL);
#if OMPT_SUPPORT
  if (ompt_enabled.enabled)
    this_thr->th.ompt_thread_info.state =
        ((this_thr->th.th_team_serialized) ? omp_state_work_serial
                                           : omp_state_work_parallel);
#endif
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information.

Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
depending on the memory ordering convention obeyed by the compiler
even that may not be necessary).
*/
void __kmpc_flush(ident_t *loc) {
  KC_TRACE(10, ("__kmpc_flush: called\n"));

  /* need explicit __mf() here since use volatile instead in library */
  KMP_MB(); /* Flush all pending memory write invalidates. */

#if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
#if KMP_MIC
// fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
// We shouldn't need it, though, since the ABI rules require that
// * If the compiler generates NGO stores it also generates the fence
// * If users hand-code NGO stores they should insert the fence
// therefore no incomplete unordered stores should be visible.
#else
  // C74404
  // This is to address non-temporal store instructions (sfence needed).
  // The clflush instruction is addressed either (mfence needed).
  // Probably the non-temporal load movntdqa instruction should also be
  // addressed.
  // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
  if (!__kmp_cpuinfo.initialized) {
    __kmp_query_cpuid(&__kmp_cpuinfo);
  }
  if (!__kmp_cpuinfo.sse2) {
    // CPU cannot execute SSE2 instructions.
  } else {
#if KMP_COMPILER_ICC
    _mm_mfence();
#elif KMP_COMPILER_MSVC
    MemoryBarrier();
#else
    __sync_synchronize();
#endif // KMP_COMPILER_ICC
  }
#endif // KMP_MIC
#elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64)
// Nothing to see here move along
#elif KMP_ARCH_PPC64
// Nothing needed here (we have a real MB above).
#if KMP_OS_CNK
  // The flushing thread needs to yield here; this prevents a
  // busy-waiting thread from saturating the pipeline. flush is
  // often used in loops like this:
  // while (!flag) {
  //   #pragma omp flush(flag)
  // }
  // and adding the yield here is good for at least a 10x speedup
  // when running >2 threads per core (on the NAS LU benchmark).
  __kmp_yield(TRUE);
#endif
#else
#error Unknown or unsupported architecture
#endif

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_flush) {
    ompt_callbacks.ompt_callback(ompt_callback_flush)(
        __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
}

/* -------------------------------------------------------------------------- */
/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.

Execute a barrier.
*/
void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
  KMP_COUNT_BLOCK(OMP_BARRIER);
  KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
    }

    __kmp_check_barrier(global_tid, ct_barrier, loc);
  }

#if OMPT_SUPPORT
  omp_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame == NULL)
      ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
    OMPT_STORE_RETURN_ADDRESS(global_tid);
  }
#endif
  __kmp_threads[global_tid]->th.th_ident = loc;
  // TODO: explicit barrier_wait_id:
  // this function is called when 'barrier' directive is present or
  // implicit barrier at the end of a worksharing construct.
  // 1) better to add a per-thread barrier counter to a thread data structure
  // 2) set to 0 when a new team is created
  // 4) no sync is required

  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = NULL;
  }
#endif
}

/* The BARRIER for a MASTER section is always explicit */
/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number .
@return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
*/
kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
  int status = 0;

  KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  if (KMP_MASTER_GTID(global_tid)) {
    KMP_COUNT_BLOCK(OMP_MASTER);
    KMP_PUSH_PARTITIONED_TIMER(OMP_master);
    status = 1;
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (status) {
    if (ompt_enabled.ompt_callback_master) {
      kmp_info_t *this_thr = __kmp_threads[global_tid];
      kmp_team_t *team = this_thr->th.th_team;

      int tid = __kmp_tid_from_gtid(global_tid);
      ompt_callbacks.ompt_callback(ompt_callback_master)(
          ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
          &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
          OMPT_GET_RETURN_ADDRESS(0));
    }
  }
#endif

  if (__kmp_env_consistency_check) {
#if KMP_USE_DYNAMIC_LOCK
    if (status)
      __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
    else
      __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
#else
    if (status)
      __kmp_push_sync(global_tid, ct_master, loc, NULL);
    else
      __kmp_check_sync(global_tid, ct_master, loc, NULL);
#endif
  }

  return status;
}

/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number .

Mark the end of a <tt>master</tt> region. This should only be called by the
thread that executes the <tt>master</tt> region.
*/
void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
  KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));

  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
  KMP_POP_PARTITIONED_TIMER();

#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_info_t *this_thr = __kmp_threads[global_tid];
  kmp_team_t *team = this_thr->th.th_team;
  if (ompt_enabled.ompt_callback_master) {
    int tid = __kmp_tid_from_gtid(global_tid);
    ompt_callbacks.ompt_callback(ompt_callback_master)(
        ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
        &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
        OMPT_GET_RETURN_ADDRESS(0));
  }
#endif

  if (__kmp_env_consistency_check) {
    if (global_tid < 0)
      KMP_WARNING(ThreadIdentInvalid);

    if (KMP_MASTER_GTID(global_tid))
      __kmp_pop_sync(global_tid, ct_master, loc);
  }
}

/*!
@ingroup WORK_SHARING
@param loc source location information.
@param gtid global thread number.

Start execution of an <tt>ordered</tt> construct.
*/
void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
  int cid = 0;
  kmp_info_t *th;
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

#if USE_ITT_BUILD
  __kmp_itt_ordered_prep(gtid);
// TODO: ordered_wait_id
#endif /* USE_ITT_BUILD */

  th = __kmp_threads[gtid];

#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_team_t *team;
  omp_wait_id_t lck;
  void *codeptr_ra;
  if (ompt_enabled.enabled) {
    OMPT_STORE_RETURN_ADDRESS(gtid);
    team = __kmp_team_from_gtid(gtid);
    lck = (omp_wait_id_t)&team->t.t_ordered.dt.t_value;
    /* OMPT state update */
    th->th.ompt_thread_info.wait_id = lck;
    th->th.ompt_thread_info.state = omp_state_wait_ordered;

    /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
    if (ompt_enabled.ompt_callback_mutex_acquire) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
          ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin,
          (omp_wait_id_t)lck, codeptr_ra);
    }
  }
#endif

  // Enter the ordered section: use the dispatch-specific enter function if
  // one is installed, otherwise the default parallel deo.
  if (th->th.th_dispatch->th_deo_fcn != 0)
    (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
  else
    __kmp_parallel_deo(&gtid, &cid, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    /* OMPT state update */
    th->th.ompt_thread_info.state = omp_state_work_parallel;
    th->th.ompt_thread_info.wait_id = 0;

    /* OMPT event callback */
    if (ompt_enabled.ompt_callback_mutex_acquired) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
          ompt_mutex_ordered, (omp_wait_id_t)lck, codeptr_ra);
    }
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_ordered_start(gtid);
#endif /* USE_ITT_BUILD */
}

/*!
@ingroup WORK_SHARING
@param loc source location information.
@param gtid global thread number.

End execution of an <tt>ordered</tt> construct.
*/
void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
  int cid = 0;
  kmp_info_t *th;

  KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));

#if USE_ITT_BUILD
  __kmp_itt_ordered_end(gtid);
// TODO: ordered_wait_id
#endif /* USE_ITT_BUILD */

  th = __kmp_threads[gtid];

  // Leave the ordered section: use the dispatch-specific exit function if
  // one is installed, otherwise the default parallel dxo.
  if (th->th.th_dispatch->th_dxo_fcn != 0)
    (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
  else
    __kmp_parallel_dxo(&gtid, &cid, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
  if (ompt_enabled.ompt_callback_mutex_released) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
        ompt_mutex_ordered,
        (omp_wait_id_t)&__kmp_team_from_gtid(gtid)->t.t_ordered.dt.t_value,
        OMPT_LOAD_RETURN_ADDRESS(gtid));
  }
#endif
}

#if KMP_USE_DYNAMIC_LOCK

// Lazily initialize a critical-section pointer with an indirect lock.
// Racing initializers are resolved with a compare-and-store below; the
// loser's allocated lock is left for program-exit cleanup.
static __forceinline void
__kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
                          kmp_int32 gtid, kmp_indirect_locktag_t tag) {
  // Pointer to the allocated indirect lock is written to crit, while indexing
  // is ignored.
  void *idx;
  kmp_indirect_lock_t **lck;
  lck = (kmp_indirect_lock_t **)crit;
  kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
  KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
  KMP_SET_I_LOCK_LOCATION(ilk, loc);
  KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
  KA_TRACE(20,
           ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
#if USE_ITT_BUILD
  __kmp_itt_critical_creating(ilk->lock, loc);
#endif
  int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
  if (status == 0) {
#if USE_ITT_BUILD
    __kmp_itt_critical_destroyed(ilk->lock);
#endif
    // We don't really need to destroy the unclaimed lock here since it will be
    // cleaned up at program exit.
940 // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx); 941 } 942 KMP_DEBUG_ASSERT(*lck != NULL); 943 } 944 945 // Fast-path acquire tas lock 946 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid) \ 947 { \ 948 kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \ 949 kmp_int32 tas_free = KMP_LOCK_FREE(tas); \ 950 kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas); \ 951 if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free || \ 952 !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) { \ 953 kmp_uint32 spins; \ 954 KMP_FSYNC_PREPARE(l); \ 955 KMP_INIT_YIELD(spins); \ 956 if (TCR_4(__kmp_nth) > \ 957 (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \ 958 KMP_YIELD(TRUE); \ 959 } else { \ 960 KMP_YIELD_SPIN(spins); \ 961 } \ 962 kmp_backoff_t backoff = __kmp_spin_backoff_params; \ 963 while ( \ 964 KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free || \ 965 !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) { \ 966 __kmp_spin_backoff(&backoff); \ 967 if (TCR_4(__kmp_nth) > \ 968 (__kmp_avail_proc ? 
__kmp_avail_proc : __kmp_xproc)) { \ 969 KMP_YIELD(TRUE); \ 970 } else { \ 971 KMP_YIELD_SPIN(spins); \ 972 } \ 973 } \ 974 } \ 975 KMP_FSYNC_ACQUIRED(l); \ 976 } 977 978 // Fast-path test tas lock 979 #define KMP_TEST_TAS_LOCK(lock, gtid, rc) \ 980 { \ 981 kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \ 982 kmp_int32 tas_free = KMP_LOCK_FREE(tas); \ 983 kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas); \ 984 rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free && \ 985 __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy); \ 986 } 987 988 // Fast-path release tas lock 989 #define KMP_RELEASE_TAS_LOCK(lock, gtid) \ 990 { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); } 991 992 #if KMP_USE_FUTEX 993 994 #include <sys/syscall.h> 995 #include <unistd.h> 996 #ifndef FUTEX_WAIT 997 #define FUTEX_WAIT 0 998 #endif 999 #ifndef FUTEX_WAKE 1000 #define FUTEX_WAKE 1 1001 #endif 1002 1003 // Fast-path acquire futex lock 1004 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid) \ 1005 { \ 1006 kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \ 1007 kmp_int32 gtid_code = (gtid + 1) << 1; \ 1008 KMP_MB(); \ 1009 KMP_FSYNC_PREPARE(ftx); \ 1010 kmp_int32 poll_val; \ 1011 while ((poll_val = KMP_COMPARE_AND_STORE_RET32( \ 1012 &(ftx->lk.poll), KMP_LOCK_FREE(futex), \ 1013 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) { \ 1014 kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1; \ 1015 if (!cond) { \ 1016 if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val, \ 1017 poll_val | \ 1018 KMP_LOCK_BUSY(1, futex))) { \ 1019 continue; \ 1020 } \ 1021 poll_val |= KMP_LOCK_BUSY(1, futex); \ 1022 } \ 1023 kmp_int32 rc; \ 1024 if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val, \ 1025 NULL, NULL, 0)) != 0) { \ 1026 continue; \ 1027 } \ 1028 gtid_code |= 1; \ 1029 } \ 1030 KMP_FSYNC_ACQUIRED(ftx); \ 1031 } 1032 1033 // Fast-path test futex lock 1034 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc) \ 1035 { \ 1036 kmp_futex_lock_t *ftx = 
(kmp_futex_lock_t *)lock; \ 1037 if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex), \ 1038 KMP_LOCK_BUSY(gtid + 1 << 1, futex))) { \ 1039 KMP_FSYNC_ACQUIRED(ftx); \ 1040 rc = TRUE; \ 1041 } else { \ 1042 rc = FALSE; \ 1043 } \ 1044 } 1045 1046 // Fast-path release futex lock 1047 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid) \ 1048 { \ 1049 kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \ 1050 KMP_MB(); \ 1051 KMP_FSYNC_RELEASING(ftx); \ 1052 kmp_int32 poll_val = \ 1053 KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex)); \ 1054 if (KMP_LOCK_STRIP(poll_val) & 1) { \ 1055 syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE, \ 1056 KMP_LOCK_BUSY(1, futex), NULL, NULL, 0); \ 1057 } \ 1058 KMP_MB(); \ 1059 KMP_YIELD(TCR_4(__kmp_nth) > \ 1060 (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)); \ 1061 } 1062 1063 #endif // KMP_USE_FUTEX 1064 1065 #else // KMP_USE_DYNAMIC_LOCK 1066 1067 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit, 1068 ident_t const *loc, 1069 kmp_int32 gtid) { 1070 kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit; 1071 1072 // Because of the double-check, the following load doesn't need to be volatile 1073 kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp); 1074 1075 if (lck == NULL) { 1076 void *idx; 1077 1078 // Allocate & initialize the lock. 1079 // Remember alloc'ed locks in table in order to free them in __kmp_cleanup() 1080 lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section); 1081 __kmp_init_user_lock_with_checks(lck); 1082 __kmp_set_user_lock_location(lck, loc); 1083 #if USE_ITT_BUILD 1084 __kmp_itt_critical_creating(lck); 1085 // __kmp_itt_critical_creating() should be called *before* the first usage 1086 // of underlying lock. It is the only place where we can guarantee it. There 1087 // are chances the lock will destroyed with no usage, but it is not a 1088 // problem, because this is not real event seen by user but rather setting 1089 // name for object (lock). 
    // See more details in kmp_itt.h.
#endif /* USE_ITT_BUILD */

    // Use a cmpxchg instruction to slam the start of the critical section with
    // the lock pointer. If another thread beat us to it, deallocate the lock,
    // and use the lock that the other thread allocated.
    int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);

    if (status == 0) {
      // Deallocate the lock and reload the value.
#if USE_ITT_BUILD
      __kmp_itt_critical_destroyed(lck);
      // Let ITT know the lock is destroyed and the same memory location may be
      // reused for another purpose.
#endif /* USE_ITT_BUILD */
      __kmp_destroy_user_lock_with_checks(lck);
      __kmp_user_lock_free(&idx, gtid, lck);
      // Reload the pointer published by the winning thread.
      lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
      KMP_DEBUG_ASSERT(lck != NULL);
    }
  }
  return lck;
}

#endif // KMP_USE_DYNAMIC_LOCK

/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number .
@param crit identity of the critical section. This could be a pointer to a lock
associated with the critical section, or some other suitably unique value.

Enter code protected by a `critical` construct.
This function blocks until the executing thread can enter the critical section.
1124 */ 1125 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid, 1126 kmp_critical_name *crit) { 1127 #if KMP_USE_DYNAMIC_LOCK 1128 #if OMPT_SUPPORT && OMPT_OPTIONAL 1129 OMPT_STORE_RETURN_ADDRESS(global_tid); 1130 #endif // OMPT_SUPPORT 1131 __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none); 1132 #else 1133 KMP_COUNT_BLOCK(OMP_CRITICAL); 1134 #if OMPT_SUPPORT && OMPT_OPTIONAL 1135 omp_state_t prev_state = omp_state_undefined; 1136 ompt_thread_info_t ti; 1137 #endif 1138 kmp_user_lock_p lck; 1139 1140 KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid)); 1141 1142 // TODO: add THR_OVHD_STATE 1143 1144 KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait); 1145 KMP_CHECK_USER_LOCK_INIT(); 1146 1147 if ((__kmp_user_lock_kind == lk_tas) && 1148 (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) { 1149 lck = (kmp_user_lock_p)crit; 1150 } 1151 #if KMP_USE_FUTEX 1152 else if ((__kmp_user_lock_kind == lk_futex) && 1153 (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) { 1154 lck = (kmp_user_lock_p)crit; 1155 } 1156 #endif 1157 else { // ticket, queuing or drdpa 1158 lck = __kmp_get_critical_section_ptr(crit, loc, global_tid); 1159 } 1160 1161 if (__kmp_env_consistency_check) 1162 __kmp_push_sync(global_tid, ct_critical, loc, lck); 1163 1164 // since the critical directive binds to all threads, not just the current 1165 // team we have to check this even if we are in a serialized team. 1166 // also, even if we are the uber thread, we still have to conduct the lock, 1167 // as we have to contend with sibling threads. 
1168 1169 #if USE_ITT_BUILD 1170 __kmp_itt_critical_acquiring(lck); 1171 #endif /* USE_ITT_BUILD */ 1172 #if OMPT_SUPPORT && OMPT_OPTIONAL 1173 OMPT_STORE_RETURN_ADDRESS(gtid); 1174 void *codeptr_ra = NULL; 1175 if (ompt_enabled.enabled) { 1176 ti = __kmp_threads[global_tid]->th.ompt_thread_info; 1177 /* OMPT state update */ 1178 prev_state = ti.state; 1179 ti.wait_id = (omp_wait_id_t)lck; 1180 ti.state = omp_state_wait_critical; 1181 1182 /* OMPT event callback */ 1183 codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid); 1184 if (ompt_enabled.ompt_callback_mutex_acquire) { 1185 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 1186 ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(), 1187 (omp_wait_id_t)crit, codeptr_ra); 1188 } 1189 } 1190 #endif 1191 // Value of 'crit' should be good for using as a critical_id of the critical 1192 // section directive. 1193 __kmp_acquire_user_lock_with_checks(lck, global_tid); 1194 1195 #if USE_ITT_BUILD 1196 __kmp_itt_critical_acquired(lck); 1197 #endif /* USE_ITT_BUILD */ 1198 #if OMPT_SUPPORT && OMPT_OPTIONAL 1199 if (ompt_enabled.enabled) { 1200 /* OMPT state update */ 1201 ti.state = prev_state; 1202 ti.wait_id = 0; 1203 1204 /* OMPT event callback */ 1205 if (ompt_enabled.ompt_callback_mutex_acquired) { 1206 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 1207 ompt_mutex_critical, (omp_wait_id_t)crit, codeptr_ra); 1208 } 1209 } 1210 #endif 1211 KMP_POP_PARTITIONED_TIMER(); 1212 1213 KMP_PUSH_PARTITIONED_TIMER(OMP_critical); 1214 KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid)); 1215 #endif // KMP_USE_DYNAMIC_LOCK 1216 } 1217 1218 #if KMP_USE_DYNAMIC_LOCK 1219 1220 // Converts the given hint to an internal lock implementation 1221 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) { 1222 #if KMP_USE_TSX 1223 #define KMP_TSX_LOCK(seq) lockseq_##seq 1224 #else 1225 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq 1226 #endif 1227 1228 #if KMP_ARCH_X86 || 
KMP_ARCH_X86_64 1229 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm) 1230 #else 1231 #define KMP_CPUINFO_RTM 0 1232 #endif 1233 1234 // Hints that do not require further logic 1235 if (hint & kmp_lock_hint_hle) 1236 return KMP_TSX_LOCK(hle); 1237 if (hint & kmp_lock_hint_rtm) 1238 return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq; 1239 if (hint & kmp_lock_hint_adaptive) 1240 return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq; 1241 1242 // Rule out conflicting hints first by returning the default lock 1243 if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended)) 1244 return __kmp_user_lock_seq; 1245 if ((hint & omp_lock_hint_speculative) && 1246 (hint & omp_lock_hint_nonspeculative)) 1247 return __kmp_user_lock_seq; 1248 1249 // Do not even consider speculation when it appears to be contended 1250 if (hint & omp_lock_hint_contended) 1251 return lockseq_queuing; 1252 1253 // Uncontended lock without speculation 1254 if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative)) 1255 return lockseq_tas; 1256 1257 // HLE lock for speculation 1258 if (hint & omp_lock_hint_speculative) 1259 return KMP_TSX_LOCK(hle); 1260 1261 return __kmp_user_lock_seq; 1262 } 1263 1264 #if OMPT_SUPPORT && OMPT_OPTIONAL 1265 #if KMP_USE_DYNAMIC_LOCK 1266 static kmp_mutex_impl_t 1267 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) { 1268 if (user_lock) { 1269 switch (KMP_EXTRACT_D_TAG(user_lock)) { 1270 case 0: 1271 break; 1272 #if KMP_USE_FUTEX 1273 case locktag_futex: 1274 return kmp_mutex_impl_queuing; 1275 #endif 1276 case locktag_tas: 1277 return kmp_mutex_impl_spin; 1278 #if KMP_USE_TSX 1279 case locktag_hle: 1280 return kmp_mutex_impl_speculative; 1281 #endif 1282 default: 1283 return ompt_mutex_impl_unknown; 1284 } 1285 ilock = KMP_LOOKUP_I_LOCK(user_lock); 1286 } 1287 KMP_ASSERT(ilock); 1288 switch (ilock->type) { 1289 #if KMP_USE_TSX 1290 case locktag_adaptive: 1291 case locktag_rtm: 
1292 return kmp_mutex_impl_speculative; 1293 #endif 1294 case locktag_nested_tas: 1295 return kmp_mutex_impl_spin; 1296 #if KMP_USE_FUTEX 1297 case locktag_nested_futex: 1298 #endif 1299 case locktag_ticket: 1300 case locktag_queuing: 1301 case locktag_drdpa: 1302 case locktag_nested_ticket: 1303 case locktag_nested_queuing: 1304 case locktag_nested_drdpa: 1305 return kmp_mutex_impl_queuing; 1306 default: 1307 return ompt_mutex_impl_unknown; 1308 } 1309 } 1310 #else 1311 // For locks without dynamic binding 1312 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() { 1313 switch (__kmp_user_lock_kind) { 1314 case lk_tas: 1315 return kmp_mutex_impl_spin; 1316 #if KMP_USE_FUTEX 1317 case lk_futex: 1318 #endif 1319 case lk_ticket: 1320 case lk_queuing: 1321 case lk_drdpa: 1322 return kmp_mutex_impl_queuing; 1323 #if KMP_USE_TSX 1324 case lk_hle: 1325 case lk_rtm: 1326 case lk_adaptive: 1327 return kmp_mutex_impl_speculative; 1328 #endif 1329 default: 1330 return ompt_mutex_impl_unknown; 1331 } 1332 } 1333 #endif // KMP_USE_DYNAMIC_LOCK 1334 #endif // OMPT_SUPPORT && OMPT_OPTIONAL 1335 1336 /*! 1337 @ingroup WORK_SHARING 1338 @param loc source location information. 1339 @param global_tid global thread number. 1340 @param crit identity of the critical section. This could be a pointer to a lock 1341 associated with the critical section, or some other suitably unique value. 1342 @param hint the lock hint. 1343 1344 Enter code protected by a `critical` construct with a hint. The hint value is 1345 used to suggest a lock implementation. This function blocks until the executing 1346 thread can enter the critical section unless the hint suggests use of 1347 speculative execution and the hardware supports it. 
*/
void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
                               kmp_critical_name *crit, uintptr_t hint) {
  KMP_COUNT_BLOCK(OMP_CRITICAL);
  kmp_user_lock_p lck;
#if OMPT_SUPPORT && OMPT_OPTIONAL
  omp_state_t prev_state = omp_state_undefined;
  ompt_thread_info_t ti;
  // This is the case, if called from __kmpc_critical:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
#endif

  KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));

  kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
  // Check if it is initialized. A zero word means no thread has claimed or
  // initialized this critical section yet; the CAS below makes init race-safe.
  KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
  if (*lk == 0) {
    kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
    if (KMP_IS_D_LOCK(lckseq)) {
      // Direct lock: store its tag straight into the critical word.
      KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
                                  KMP_GET_D_TAG(lckseq));
    } else {
      // Indirect lock: allocate and publish a lock object.
      __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
    }
  }
  // Branch for accessing the actual lock object and set operation. This
  // branching is inevitable since this lock initialization does not follow the
  // normal dispatch path (lock table is not used).
  if (KMP_EXTRACT_D_TAG(lk) != 0) {
    // Direct (tagged) lock: the critical word itself is the lock.
    lck = (kmp_user_lock_p)lk;
    if (__kmp_env_consistency_check) {
      __kmp_push_sync(global_tid, ct_critical, loc, lck,
                      __kmp_map_hint_to_lock(hint));
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_acquiring(lck);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ti = __kmp_threads[global_tid]->th.ompt_thread_info;
      /* OMPT state update */
      prev_state = ti.state;
      ti.wait_id = (omp_wait_id_t)lck;
      ti.state = omp_state_wait_critical;

      /* OMPT event callback */
      if (ompt_enabled.ompt_callback_mutex_acquire) {
        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
            ompt_mutex_critical, (unsigned int)hint,
            __ompt_get_mutex_impl_type(crit), (omp_wait_id_t)crit, codeptr);
      }
    }
#endif
    // Inlined fast paths bypass the indirect call when the default lock kind
    // matches and consistency checking is off.
#if KMP_USE_INLINED_TAS
    if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
      KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
    } else
#elif KMP_USE_INLINED_FUTEX
    if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
      KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
    } else
#endif
    {
      KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
    }
  } else {
    // Indirect lock: the critical word holds a pointer to the lock object.
    kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
    lck = ilk->lock;
    if (__kmp_env_consistency_check) {
      __kmp_push_sync(global_tid, ct_critical, loc, lck,
                      __kmp_map_hint_to_lock(hint));
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_acquiring(lck);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ti = __kmp_threads[global_tid]->th.ompt_thread_info;
      /* OMPT state update */
      prev_state = ti.state;
      ti.wait_id = (omp_wait_id_t)lck;
      ti.state = omp_state_wait_critical;

      /* OMPT event callback */
      if (ompt_enabled.ompt_callback_mutex_acquire) {
        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
            ompt_mutex_critical, (unsigned int)hint,
            __ompt_get_mutex_impl_type(0, ilk), (omp_wait_id_t)crit, codeptr);
      }
    }
#endif
    KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
  }
  KMP_POP_PARTITIONED_TIMER();

#if USE_ITT_BUILD
  __kmp_itt_critical_acquired(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    /* OMPT state update */
    ti.state = prev_state;
    ti.wait_id = 0;

    /* OMPT event callback */
    if (ompt_enabled.ompt_callback_mutex_acquired) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
          ompt_mutex_critical, (omp_wait_id_t)crit, codeptr);
    }
  }
#endif

  KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
  KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
} // __kmpc_critical_with_hint

#endif // KMP_USE_DYNAMIC_LOCK

/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number .
@param crit identity of the critical section. This could be a pointer to a lock
associated with the critical section, or some other suitably unique value.

Leave a critical section, releasing any lock that was held during its execution.
*/
void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
                         kmp_critical_name *crit) {
  kmp_user_lock_p lck;

  KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));

#if KMP_USE_DYNAMIC_LOCK
  // Mirror of the acquire-side dispatch: direct locks live in the critical
  // word itself, indirect locks are reached through the stored pointer.
  if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
    lck = (kmp_user_lock_p)crit;
    KMP_ASSERT(lck != NULL);
    if (__kmp_env_consistency_check) {
      __kmp_pop_sync(global_tid, ct_critical, loc);
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_releasing(lck);
#endif
#if KMP_USE_INLINED_TAS
    if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
      KMP_RELEASE_TAS_LOCK(lck, global_tid);
    } else
#elif KMP_USE_INLINED_FUTEX
    if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
      KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
    } else
#endif
    {
      KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
    }
  } else {
    kmp_indirect_lock_t *ilk =
        (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
    KMP_ASSERT(ilk != NULL);
    lck = ilk->lock;
    if (__kmp_env_consistency_check) {
      __kmp_pop_sync(global_tid, ct_critical, loc);
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_releasing(lck);
#endif
    KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
  }

#else // KMP_USE_DYNAMIC_LOCK

  // Same lock-location selection as in __kmpc_critical: small TAS/futex locks
  // are embedded in 'crit'; otherwise the stored pointer is used.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#endif
  else { // ticket, queuing or drdpa
    lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
  }

  KMP_ASSERT(lck != NULL);

  if (__kmp_env_consistency_check)
    __kmp_pop_sync(global_tid, ct_critical, loc);

#if USE_ITT_BUILD
  __kmp_itt_critical_releasing(lck);
#endif /* USE_ITT_BUILD */
  // Value of 'crit' should be good for using as a critical_id of the critical
  // section directive.
  __kmp_release_user_lock_with_checks(lck, global_tid);

#endif // KMP_USE_DYNAMIC_LOCK

#if OMPT_SUPPORT && OMPT_OPTIONAL
  /* OMPT release event triggers after lock is released; place here to trigger
   * for all #if branches */
  OMPT_STORE_RETURN_ADDRESS(global_tid);
  if (ompt_enabled.ompt_callback_mutex_released) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
        ompt_mutex_critical, (omp_wait_id_t)crit, OMPT_LOAD_RETURN_ADDRESS(0));
  }
#endif

  KMP_POP_PARTITIONED_TIMER();
  KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.
@return one if the thread should execute the master block, zero otherwise

Start execution of a combined barrier and master. The barrier is executed inside
this function.
*/
kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
  int status;

  KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  if (__kmp_env_consistency_check)
    __kmp_check_barrier(global_tid, ct_barrier, loc);

#if OMPT_SUPPORT
  // NOTE(review): ompt_frame is only set when ompt_enabled.enabled is true
  // here; the read after the barrier is guarded by the same flag — confirm
  // the flag cannot flip between the two checks.
  omp_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame == NULL)
      ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
    OMPT_STORE_RETURN_ADDRESS(global_tid);
  }
#endif
#if USE_ITT_NOTIFY
  __kmp_threads[global_tid]->th.th_ident = loc;
#endif
  // TRUE: this is a split (barrier+master) barrier; the master is chosen here.
  status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = NULL;
  }
#endif

  return (status != 0) ? 0 : 1;
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.

Complete the execution of a combined barrier and master. This function should
only be called at the completion of the <tt>master</tt> code. Other threads will
still be waiting at the barrier and this call releases them.
*/
void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
  KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));

  __kmp_end_split_barrier(bs_plain_barrier, global_tid);
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.
@return one if the thread should execute the master block, zero otherwise

Start execution of a combined barrier and master(nowait) construct.
The barrier is executed inside this function.
There is no equivalent "end" function, since the
*/
kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
  kmp_int32 ret;

  KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
    }
    __kmp_check_barrier(global_tid, ct_barrier, loc);
  }

#if OMPT_SUPPORT
  omp_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame == NULL)
      ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
    OMPT_STORE_RETURN_ADDRESS(global_tid);
  }
#endif
#if USE_ITT_NOTIFY
  __kmp_threads[global_tid]->th.th_ident = loc;
#endif
  // FALSE: plain (non-split) barrier; master selection happens separately.
  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = NULL;
  }
#endif

  ret = __kmpc_master(loc, global_tid);

  if (__kmp_env_consistency_check) {
    /* there's no __kmpc_end_master called; so the (stats) */
    /* actions of __kmpc_end_master are done here */

    if (global_tid < 0) {
      KMP_WARNING(ThreadIdentInvalid);
    }
    if (ret) {
      /* only one thread should do the pop since only */
      /* one did the push (see __kmpc_master()) */

      __kmp_pop_sync(global_tid, ct_master, loc);
    }
  }

  return (ret);
}

/* The BARRIER for a SINGLE process section is always explicit */
/*!
@ingroup WORK_SHARING
@param loc source location information
@param global_tid global thread number
@return One if this thread should execute the single construct, zero otherwise.
Test whether to execute a <tt>single</tt> construct.
There are no implicit barriers in the two "single" calls, rather the compiler
should introduce an explicit barrier if it is required.
*/

kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
  // Non-zero iff this thread won the race to execute the single region.
  kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);

  if (rc) {
    // We are going to execute the single statement, so we should count it.
    KMP_COUNT_BLOCK(OMP_SINGLE);
    KMP_PUSH_PARTITIONED_TIMER(OMP_single);
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_info_t *this_thr = __kmp_threads[global_tid];
  kmp_team_t *team = this_thr->th.th_team;
  int tid = __kmp_tid_from_gtid(global_tid);

  if (ompt_enabled.enabled) {
    if (rc) {
      // Executor: scope_begin here; scope_end is reported by
      // __kmpc_end_single.
      if (ompt_enabled.ompt_callback_work) {
        ompt_callbacks.ompt_callback(ompt_callback_work)(
            ompt_work_single_executor, ompt_scope_begin,
            &(team->t.ompt_team_info.parallel_data),
            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
            1, OMPT_GET_RETURN_ADDRESS(0));
      }
    } else {
      // Non-executors never call __kmpc_end_single, so both scope events are
      // emitted here back-to-back.
      if (ompt_enabled.ompt_callback_work) {
        ompt_callbacks.ompt_callback(ompt_callback_work)(
            ompt_work_single_other, ompt_scope_begin,
            &(team->t.ompt_team_info.parallel_data),
            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
            1, OMPT_GET_RETURN_ADDRESS(0));
        ompt_callbacks.ompt_callback(ompt_callback_work)(
            ompt_work_single_other, ompt_scope_end,
            &(team->t.ompt_team_info.parallel_data),
            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
            1, OMPT_GET_RETURN_ADDRESS(0));
      }
    }
  }
#endif

  return rc;
}

/*!
@ingroup WORK_SHARING
@param loc source location information
@param global_tid global thread number

Mark the end of a <tt>single</tt> construct. This function should
only be called by the thread that executed the block of code protected
by the `single` construct.
*/
void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
  __kmp_exit_single(global_tid);
  KMP_POP_PARTITIONED_TIMER();

#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_info_t *this_thr = __kmp_threads[global_tid];
  kmp_team_t *team = this_thr->th.th_team;
  int tid = __kmp_tid_from_gtid(global_tid);

  if (ompt_enabled.ompt_callback_work) {
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_single_executor, ompt_scope_end,
        &(team->t.ompt_team_info.parallel_data),
        &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
        OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
}

/*!
@ingroup WORK_SHARING
@param loc Source location
@param global_tid Global thread id

Mark the end of a statically scheduled loop.
*/
void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
  KMP_POP_PARTITIONED_TIMER();
  KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_work_type_t ompt_work_type = ompt_work_loop;
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    // Determine workshare type from the ident flags set by the compiler.
    if (loc != NULL) {
      if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
        ompt_work_type = ompt_work_loop;
      } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
        ompt_work_type = ompt_work_sections;
      } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
        ompt_work_type = ompt_work_distribute;
      } else {
        // use default set above.
        // a warning about this case is provided in __kmpc_for_static_init
      }
      KMP_DEBUG_ASSERT(ompt_work_type);
    }
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
        &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
  if (__kmp_env_consistency_check)
    __kmp_pop_workshare(global_tid, ct_pdo, loc);
}

// User routines which take C-style arguments (call by value)
// different from the Fortran equivalent routines

// omp_set_num_threads() C entry: forwards to the internal setter.
void ompc_set_num_threads(int arg) {
  // !!!!! TODO: check the per-task binding
  __kmp_set_num_threads(arg, __kmp_entry_gtid());
}

// omp_set_dynamic() C entry: updates the calling thread's internal controls.
void ompc_set_dynamic(int flag) {
  kmp_info_t *thread;

  /* For the thread-private implementation of the internal controls */
  thread = __kmp_entry_thread();

  __kmp_save_internal_controls(thread);

  set__dynamic(thread, flag ? TRUE : FALSE);
}

// omp_set_nested() C entry: updates the calling thread's internal controls.
void ompc_set_nested(int flag) {
  kmp_info_t *thread;

  /* For the thread-private internal controls implementation */
  thread = __kmp_entry_thread();

  __kmp_save_internal_controls(thread);

  set__nested(thread, flag ? TRUE : FALSE);
}

void ompc_set_max_active_levels(int max_active_levels) {
  /* TO DO */
  /* we want per-task implementation of this internal control */

  /* For the per-thread internal controls implementation */
  __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
}

void ompc_set_schedule(omp_sched_t kind, int modifier) {
  // !!!!! TODO: check the per-task binding
  __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
}

int ompc_get_ancestor_thread_num(int level) {
  return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
}

int ompc_get_team_size(int level) {
  return __kmp_get_team_size(__kmp_entry_gtid(), level);
}

void kmpc_set_stacksize(int arg) {
  // __kmp_aux_set_stacksize initializes the library if needed
  __kmp_aux_set_stacksize(arg);
}

void kmpc_set_stacksize_s(size_t arg) {
  // __kmp_aux_set_stacksize initializes the library if needed
  __kmp_aux_set_stacksize(arg);
}

void kmpc_set_blocktime(int arg) {
  int gtid, tid;
  kmp_info_t *thread;

  gtid = __kmp_entry_gtid();
  tid = __kmp_tid_from_gtid(gtid);
  thread = __kmp_thread_from_gtid(gtid);

  __kmp_aux_set_blocktime(arg, thread, tid);
}

void kmpc_set_library(int arg) {
  // __kmp_user_set_library initializes the library if needed
  __kmp_user_set_library((enum library_type)arg);
}

void kmpc_set_defaults(char const *str) {
  // __kmp_aux_set_defaults initializes the library if needed
  __kmp_aux_set_defaults(str, KMP_STRLEN(str));
}

void kmpc_set_disp_num_buffers(int arg) {
  // ignore after initialization because some teams have already
  // allocated dispatch buffers
  if (__kmp_init_serial == 0 && arg > 0)
    __kmp_dispatch_num_buffers = arg;
}

// Returns -1 when affinity is unsupported (stub build or no affinity).
int kmpc_set_affinity_mask_proc(int proc, void **mask) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
  return -1;
#else
  if (!TCR_4(__kmp_init_middle)) {
    __kmp_middle_initialize();
  }
  return __kmp_aux_set_affinity_mask_proc(proc, mask);
#endif
}

int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
  return -1;
#else
  if (!TCR_4(__kmp_init_middle)) {
    __kmp_middle_initialize();
  }
  return __kmp_aux_unset_affinity_mask_proc(proc, mask);
#endif
}

int kmpc_get_affinity_mask_proc(int proc, void **mask) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
  return -1;
#else
  if (!TCR_4(__kmp_init_middle)) {
    __kmp_middle_initialize();
  }
  return __kmp_aux_get_affinity_mask_proc(proc, mask);
#endif
}

/* -------------------------------------------------------------------------- */
/*!
@ingroup THREADPRIVATE
@param loc source location information
@param gtid global thread number
@param cpy_size size of the cpy_data buffer
@param cpy_data pointer to data to be copied
@param cpy_func helper function to call for copying data
@param didit flag variable: 1=single thread; 0=not single thread

__kmpc_copyprivate implements the interface for the private data broadcast
needed for the copyprivate clause associated with a single region in an
OpenMP<sup>*</sup> program (both C and Fortran).
All threads participating in the parallel region call this routine.
One of the threads (called the single thread) should have the <tt>didit</tt>
variable set to 1 and all other threads should have that variable set to 0.
All threads pass a pointer to a data buffer (cpy_data) that they have built.

The OpenMP specification forbids the use of nowait on the single region when a
copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
barrier internally to avoid race conditions, so the code generation for the
single region should avoid generating a barrier after the call to @ref
__kmpc_copyprivate.

The <tt>gtid</tt> parameter is the global thread id for the current thread.
The <tt>loc</tt> parameter is a pointer to source location information.

Internal implementation: The single thread will first copy its descriptor
address (cpy_data) to a team-private location, then the other threads will each
call the function pointed to by the parameter cpy_func, which carries out the
copy by copying the data using the cpy_data buffer.

The cpy_func routine used for the copy and the contents of the data area defined
by cpy_data and cpy_size may be built in any fashion that will allow the copy
to be done. For instance, the cpy_data buffer can hold the actual data to be
copied or it may hold a list of pointers to the data. The cpy_func routine must
interpret the cpy_data buffer appropriately.

The interface to cpy_func is as follows:
@code
void cpy_func( void *destination, void *source )
@endcode
where void *destination is the cpy_data pointer for the thread being copied to
and void *source is the cpy_data pointer for the thread being copied from.
*/
void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
                        void *cpy_data, void (*cpy_func)(void *, void *),
                        kmp_int32 didit) {
  // Team-shared slot through which the single thread publishes its buffer.
  void **data_ptr;

  KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));

  KMP_MB();

  data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid);
    }
  }

  // ToDo: Optimize the following two barriers into some kind of split barrier

  // Only the thread that executed the single region publishes its buffer.
  if (didit)
    *data_ptr = cpy_data;

#if OMPT_SUPPORT
  omp_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame == NULL)
      ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
    OMPT_STORE_RETURN_ADDRESS(gtid);
  }
#endif
/* This barrier is not a barrier region boundary */
#if USE_ITT_NOTIFY
  __kmp_threads[gtid]->th.th_ident = loc;
#endif
  // First barrier: wait until the single thread's pointer is visible to all.
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);

  // Every non-single thread copies out of the published buffer.
  if (!didit)
    (*cpy_func)(cpy_data, *data_ptr);

  // Consider next barrier a user-visible barrier for barrier region boundaries
  // Nesting checks are already handled by the single construct checks

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
    OMPT_STORE_RETURN_ADDRESS(gtid);
  }
#endif
#if USE_ITT_NOTIFY
  __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
// tasks can overwrite the location)
#endif
  // Second barrier: keep the published buffer alive until all copies finish.
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = NULL;
  }
#endif
}

/* -------------------------------------------------------------------------- */

// Shorthand aliases for the checked user-lock API.
#define INIT_LOCK __kmp_init_user_lock_with_checks
#define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
#define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
#define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
#define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
#define ACQUIRE_NESTED_LOCK_TIMED                                              \
  __kmp_acquire_nested_user_lock_with_checks_timed
#define RELEASE_LOCK __kmp_release_user_lock_with_checks
#define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
#define TEST_LOCK __kmp_test_user_lock_with_checks
#define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
#define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
#define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks

// TODO: Make check abort messages use location info & pass it into
// with_checks routines

#if KMP_USE_DYNAMIC_LOCK

// internal lock initializer
static __forceinline void
__kmp_init_lock_with_hint(ident_t *loc, void **lock,
                          kmp_dyna_lockseq_t seq) {
  // Direct locks are initialized in place; indirect locks go through the
  // lookup table and get an ITT location annotation.
  if (KMP_IS_D_LOCK(seq)) {
    KMP_INIT_D_LOCK(lock, seq);
#if USE_ITT_BUILD
    __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
#endif
  } else {
    KMP_INIT_I_LOCK(lock, seq);
#if USE_ITT_BUILD
    kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
    __kmp_itt_lock_creating(ilk->lock, loc);
#endif
  }
}

// internal nest lock initializer
static __forceinline void
__kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
                               kmp_dyna_lockseq_t seq) {
#if KMP_USE_TSX
  // Don't have nested lock implementation for speculative locks
  if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
    seq = __kmp_user_lock_seq;
#endif
  // Map each flat lock sequence to its nested counterpart; anything without a
  // nested variant falls back to nested queuing.
  switch (seq) {
  case lockseq_tas:
    seq = lockseq_nested_tas;
    break;
#if KMP_USE_FUTEX
  case lockseq_futex:
    seq = lockseq_nested_futex;
    break;
#endif
  case lockseq_ticket:
    seq = lockseq_nested_ticket;
    break;
  case lockseq_queuing:
    seq = lockseq_nested_queuing;
    break;
  case lockseq_drdpa:
    seq = lockseq_nested_drdpa;
    break;
  default:
    seq = lockseq_nested_queuing;
  }
  KMP_INIT_I_LOCK(lock, seq);
#if USE_ITT_BUILD
  kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
  __kmp_itt_lock_creating(ilk->lock, loc);
#endif
}

/* initialize the lock with a hint */
void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
                                uintptr_t hint) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  if (__kmp_env_consistency_check && user_lock == NULL) {
    KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
  }

  __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_init) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
        ompt_mutex_lock, (omp_lock_hint_t)hint,
        __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
        codeptr);
  }
#endif
}

/* initialize the lock with a hint */
void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
                                     void **user_lock, uintptr_t hint) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  if (__kmp_env_consistency_check && user_lock == NULL) {
    KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
  }

  __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_init) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
        ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
        __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
        codeptr);
  }
#endif
}

#endif // KMP_USE_DYNAMIC_LOCK

/* initialize the lock */
void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#if KMP_USE_DYNAMIC_LOCK

  KMP_DEBUG_ASSERT(__kmp_init_serial);
  if (__kmp_env_consistency_check && user_lock == NULL) {
    KMP_FATAL(LockIsUninitialized, "omp_init_lock");
  }
  __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_init) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
        ompt_mutex_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
        codeptr);
  }
#endif

#else // KMP_USE_DYNAMIC_LOCK

  static char const *const func = "omp_init_lock";
  kmp_user_lock_p lck;
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  if (__kmp_env_consistency_check) {
    if (user_lock == NULL) {
      KMP_FATAL(LockIsUninitialized, func);
    }
  }

  KMP_CHECK_USER_LOCK_INIT();

  // If the lock object fits in the user's omp_lock_t storage, use it in
  // place; otherwise allocate a runtime-managed lock.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
  }
  INIT_LOCK(lck);
  __kmp_set_user_lock_location(lck, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_init) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
        ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
        (omp_wait_id_t)user_lock, codeptr);
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_lock_creating(lck);
#endif /* USE_ITT_BUILD */

#endif // KMP_USE_DYNAMIC_LOCK
} // __kmpc_init_lock

/* initialize the lock */
void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#if KMP_USE_DYNAMIC_LOCK

  KMP_DEBUG_ASSERT(__kmp_init_serial);
  if (__kmp_env_consistency_check && user_lock == NULL) {
    KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
  }
  __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_init) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
        ompt_mutex_nest_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
        codeptr);
  }
#endif

#else // KMP_USE_DYNAMIC_LOCK

  static char const *const func = "omp_init_nest_lock";
  kmp_user_lock_p lck;
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  if (__kmp_env_consistency_check) {
    if (user_lock == NULL) {
      KMP_FATAL(LockIsUninitialized, func);
    }
  }

  KMP_CHECK_USER_LOCK_INIT();

  // Nested variant additionally needs room for the nesting-depth counter in
  // the user's omp_nest_lock_t storage to use it in place.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
       OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
            OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
  }

  INIT_NESTED_LOCK(lck);
  __kmp_set_user_lock_location(lck, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_init) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
        ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
        (omp_wait_id_t)user_lock, codeptr);
  }
#endif

#if USE_ITT_BUILD
__kmp_itt_lock_creating(lck); 2301 #endif /* USE_ITT_BUILD */ 2302 2303 #endif // KMP_USE_DYNAMIC_LOCK 2304 } // __kmpc_init_nest_lock 2305 2306 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2307 #if KMP_USE_DYNAMIC_LOCK 2308 2309 #if USE_ITT_BUILD 2310 kmp_user_lock_p lck; 2311 if (KMP_EXTRACT_D_TAG(user_lock) == 0) { 2312 lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock; 2313 } else { 2314 lck = (kmp_user_lock_p)user_lock; 2315 } 2316 __kmp_itt_lock_destroyed(lck); 2317 #endif 2318 #if OMPT_SUPPORT && OMPT_OPTIONAL 2319 // This is the case, if called from omp_init_lock_with_hint: 2320 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2321 if (!codeptr) 2322 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2323 if (ompt_enabled.ompt_callback_lock_destroy) { 2324 kmp_user_lock_p lck; 2325 if (KMP_EXTRACT_D_TAG(user_lock) == 0) { 2326 lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock; 2327 } else { 2328 lck = (kmp_user_lock_p)user_lock; 2329 } 2330 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( 2331 ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr); 2332 } 2333 #endif 2334 KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock); 2335 #else 2336 kmp_user_lock_p lck; 2337 2338 if ((__kmp_user_lock_kind == lk_tas) && 2339 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2340 lck = (kmp_user_lock_p)user_lock; 2341 } 2342 #if KMP_USE_FUTEX 2343 else if ((__kmp_user_lock_kind == lk_futex) && 2344 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2345 lck = (kmp_user_lock_p)user_lock; 2346 } 2347 #endif 2348 else { 2349 lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock"); 2350 } 2351 2352 #if OMPT_SUPPORT && OMPT_OPTIONAL 2353 // This is the case, if called from omp_init_lock_with_hint: 2354 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2355 if (!codeptr) 2356 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2357 if (ompt_enabled.ompt_callback_lock_destroy) { 2358 
    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
        ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr);
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_lock_destroyed(lck);
#endif /* USE_ITT_BUILD */
  DESTROY_LOCK(lck);

  // Only runtime-allocated locks need freeing; in-place locks live in the
  // user's omp_lock_t storage.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
    ;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
    ;
  }
#endif
  else {
    __kmp_user_lock_free(user_lock, gtid, lck);
  }
#endif // KMP_USE_DYNAMIC_LOCK
} // __kmpc_destroy_lock

/* destroy the lock */
void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#if KMP_USE_DYNAMIC_LOCK

#if USE_ITT_BUILD
  kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
  __kmp_itt_lock_destroyed(ilk->lock);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_destroy) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
        ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr);
  }
#endif
  KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);

#else // KMP_USE_DYNAMIC_LOCK

  kmp_user_lock_p lck;

  // In-place if the nested lock (poll word + depth counter) fits in the
  // user's omp_nest_lock_t storage; otherwise look up the allocated lock.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
       OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
            OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_destroy) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
        ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr);
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_lock_destroyed(lck);
#endif /* USE_ITT_BUILD */

  DESTROY_NESTED_LOCK(lck);

  // Free only runtime-allocated locks, mirroring __kmpc_init_nest_lock.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
       OMP_NEST_LOCK_T_SIZE)) {
    ;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
            OMP_NEST_LOCK_T_SIZE)) {
    ;
  }
#endif
  else {
    __kmp_user_lock_free(user_lock, gtid, lck);
  }
#endif // KMP_USE_DYNAMIC_LOCK
} // __kmpc_destroy_nest_lock

void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
  KMP_COUNT_BLOCK(OMP_set_lock);
#if KMP_USE_DYNAMIC_LOCK
  int tag = KMP_EXTRACT_D_TAG(user_lock);
#if USE_ITT_BUILD
  __kmp_itt_lock_acquiring(
      (kmp_user_lock_p)
          user_lock); // itt function will get to the right lock object.
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
        codeptr);
  }
#endif
  // Inlined TAS/futex fast path is only valid without consistency checking;
  // otherwise dispatch through the direct-lock function table.
#if KMP_USE_INLINED_TAS
  if (tag == locktag_tas && !__kmp_env_consistency_check) {
    KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
  } else
#elif KMP_USE_INLINED_FUTEX
  if (tag == locktag_futex && !__kmp_env_consistency_check) {
    KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
  } else
#endif
  {
    __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
  }
#if USE_ITT_BUILD
  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_mutex_acquired) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
        ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr);
  }
#endif

#else // KMP_USE_DYNAMIC_LOCK

  kmp_user_lock_p lck;

  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
  }

#if USE_ITT_BUILD
  __kmp_itt_lock_acquiring(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
        (omp_wait_id_t)lck, codeptr);
  }
#endif

  ACQUIRE_LOCK(lck, gtid);

#if USE_ITT_BUILD
  __kmp_itt_lock_acquired(lck);
#endif /* USE_ITT_BUILD */

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_mutex_acquired) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
        ompt_mutex_lock, (omp_wait_id_t)lck, codeptr);
  }
#endif

#endif // KMP_USE_DYNAMIC_LOCK
}

void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#if KMP_USE_DYNAMIC_LOCK

#if USE_ITT_BUILD
  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.enabled) {
    if (ompt_enabled.ompt_callback_mutex_acquire) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
          ompt_mutex_nest_lock, omp_lock_hint_none,
          __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
          codeptr);
    }
  }
#endif
  int acquire_status =
      KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
#if USE_ITT_BUILD
  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
#endif

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
      if (ompt_enabled.ompt_callback_mutex_acquired) {
        // lock_first
        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
            ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr);
      }
    } else {
      if
      (ompt_enabled.ompt_callback_nest_lock) {
        // lock_next
        ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_begin, (omp_wait_id_t)user_lock, codeptr);
      }
    }
  }
#endif

#else // KMP_USE_DYNAMIC_LOCK
  int acquire_status;
  kmp_user_lock_p lck;

  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
       OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
            OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
  }

#if USE_ITT_BUILD
  __kmp_itt_lock_acquiring(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.enabled) {
    if (ompt_enabled.ompt_callback_mutex_acquire) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
          ompt_mutex_nest_lock, omp_lock_hint_none,
          __ompt_get_mutex_impl_type(), (omp_wait_id_t)lck, codeptr);
    }
  }
#endif

  ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);

#if USE_ITT_BUILD
  __kmp_itt_lock_acquired(lck);
#endif /* USE_ITT_BUILD */

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // First acquisition and re-acquisition report different OMPT events.
  if (ompt_enabled.enabled) {
    if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
      if (ompt_enabled.ompt_callback_mutex_acquired) {
        // lock_first
        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
            ompt_mutex_nest_lock, (omp_wait_id_t)lck, codeptr);
      }
    } else {
      if (ompt_enabled.ompt_callback_nest_lock) {
        // lock_next
        ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_begin, (omp_wait_id_t)lck, codeptr);
      }
    }
  }
#endif

#endif // KMP_USE_DYNAMIC_LOCK
}

void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#if KMP_USE_DYNAMIC_LOCK

  int tag = KMP_EXTRACT_D_TAG(user_lock);
#if USE_ITT_BUILD
  __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
#endif
#if KMP_USE_INLINED_TAS
  if (tag == locktag_tas && !__kmp_env_consistency_check) {
    KMP_RELEASE_TAS_LOCK(user_lock, gtid);
  } else
#elif KMP_USE_INLINED_FUTEX
  if (tag == locktag_futex && !__kmp_env_consistency_check) {
    KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
  } else
#endif
  {
    __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_mutex_released) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
        ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr);
  }
#endif

#else // KMP_USE_DYNAMIC_LOCK

  kmp_user_lock_p lck;

  /* Can't use serial interval since not block structured */
  /* release the lock */

  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
#if KMP_OS_LINUX &&                                                            \
    (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
// "fast" path implemented to fix customer performance issue
#if USE_ITT_BUILD
    __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
#endif /* USE_ITT_BUILD */
    TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
    KMP_MB();

#if OMPT_SUPPORT && OMPT_OPTIONAL
    // This is the case, if called from omp_init_lock_with_hint:
2710 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2711 if (!codeptr) 2712 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2713 if (ompt_enabled.ompt_callback_mutex_released) { 2714 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2715 ompt_mutex_lock, (omp_wait_id_t)lck, codeptr); 2716 } 2717 #endif 2718 2719 return; 2720 #else 2721 lck = (kmp_user_lock_p)user_lock; 2722 #endif 2723 } 2724 #if KMP_USE_FUTEX 2725 else if ((__kmp_user_lock_kind == lk_futex) && 2726 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2727 lck = (kmp_user_lock_p)user_lock; 2728 } 2729 #endif 2730 else { 2731 lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock"); 2732 } 2733 2734 #if USE_ITT_BUILD 2735 __kmp_itt_lock_releasing(lck); 2736 #endif /* USE_ITT_BUILD */ 2737 2738 RELEASE_LOCK(lck, gtid); 2739 2740 #if OMPT_SUPPORT && OMPT_OPTIONAL 2741 // This is the case, if called from omp_init_lock_with_hint: 2742 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2743 if (!codeptr) 2744 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2745 if (ompt_enabled.ompt_callback_mutex_released) { 2746 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2747 ompt_mutex_lock, (omp_wait_id_t)lck, codeptr); 2748 } 2749 #endif 2750 2751 #endif // KMP_USE_DYNAMIC_LOCK 2752 } 2753 2754 /* release the lock */ 2755 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2756 #if KMP_USE_DYNAMIC_LOCK 2757 2758 #if USE_ITT_BUILD 2759 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2760 #endif 2761 int release_status = 2762 KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid); 2763 2764 #if OMPT_SUPPORT && OMPT_OPTIONAL 2765 // This is the case, if called from omp_init_lock_with_hint: 2766 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2767 if (!codeptr) 2768 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2769 if (ompt_enabled.enabled) { 2770 if (release_status == KMP_LOCK_RELEASED) { 2771 if (ompt_enabled.ompt_callback_mutex_released) { 2772 // release_lock_last 2773 
ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2774 ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr); 2775 } 2776 } else if (ompt_enabled.ompt_callback_nest_lock) { 2777 // release_lock_prev 2778 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 2779 ompt_scope_end, (omp_wait_id_t)user_lock, codeptr); 2780 } 2781 } 2782 #endif 2783 2784 #else // KMP_USE_DYNAMIC_LOCK 2785 2786 kmp_user_lock_p lck; 2787 2788 /* Can't use serial interval since not block structured */ 2789 2790 if ((__kmp_user_lock_kind == lk_tas) && 2791 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 2792 OMP_NEST_LOCK_T_SIZE)) { 2793 #if KMP_OS_LINUX && \ 2794 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 2795 // "fast" path implemented to fix customer performance issue 2796 kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock; 2797 #if USE_ITT_BUILD 2798 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); 2799 #endif /* USE_ITT_BUILD */ 2800 2801 #if OMPT_SUPPORT && OMPT_OPTIONAL 2802 int release_status = KMP_LOCK_STILL_HELD; 2803 #endif 2804 2805 if (--(tl->lk.depth_locked) == 0) { 2806 TCW_4(tl->lk.poll, 0); 2807 #if OMPT_SUPPORT && OMPT_OPTIONAL 2808 release_status = KMP_LOCK_RELEASED; 2809 #endif 2810 } 2811 KMP_MB(); 2812 2813 #if OMPT_SUPPORT && OMPT_OPTIONAL 2814 // This is the case, if called from omp_init_lock_with_hint: 2815 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2816 if (!codeptr) 2817 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2818 if (ompt_enabled.enabled) { 2819 if (release_status == KMP_LOCK_RELEASED) { 2820 if (ompt_enabled.ompt_callback_mutex_released) { 2821 // release_lock_last 2822 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2823 ompt_mutex_nest_lock, (omp_wait_id_t)lck, codeptr); 2824 } 2825 } else if (ompt_enabled.ompt_callback_nest_lock) { 2826 // release_lock_previous 2827 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 2828 ompt_mutex_scope_end, (omp_wait_id_t)lck, codeptr); 2829 } 
2830 } 2831 #endif 2832 2833 return; 2834 #else 2835 lck = (kmp_user_lock_p)user_lock; 2836 #endif 2837 } 2838 #if KMP_USE_FUTEX 2839 else if ((__kmp_user_lock_kind == lk_futex) && 2840 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 2841 OMP_NEST_LOCK_T_SIZE)) { 2842 lck = (kmp_user_lock_p)user_lock; 2843 } 2844 #endif 2845 else { 2846 lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock"); 2847 } 2848 2849 #if USE_ITT_BUILD 2850 __kmp_itt_lock_releasing(lck); 2851 #endif /* USE_ITT_BUILD */ 2852 2853 int release_status; 2854 release_status = RELEASE_NESTED_LOCK(lck, gtid); 2855 #if OMPT_SUPPORT && OMPT_OPTIONAL 2856 // This is the case, if called from omp_init_lock_with_hint: 2857 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2858 if (!codeptr) 2859 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2860 if (ompt_enabled.enabled) { 2861 if (release_status == KMP_LOCK_RELEASED) { 2862 if (ompt_enabled.ompt_callback_mutex_released) { 2863 // release_lock_last 2864 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( 2865 ompt_mutex_nest_lock, (omp_wait_id_t)lck, codeptr); 2866 } 2867 } else if (ompt_enabled.ompt_callback_nest_lock) { 2868 // release_lock_previous 2869 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 2870 ompt_mutex_scope_end, (omp_wait_id_t)lck, codeptr); 2871 } 2872 } 2873 #endif 2874 2875 #endif // KMP_USE_DYNAMIC_LOCK 2876 } 2877 2878 /* try to acquire the lock */ 2879 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2880 KMP_COUNT_BLOCK(OMP_test_lock); 2881 2882 #if KMP_USE_DYNAMIC_LOCK 2883 int rc; 2884 int tag = KMP_EXTRACT_D_TAG(user_lock); 2885 #if USE_ITT_BUILD 2886 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); 2887 #endif 2888 #if OMPT_SUPPORT && OMPT_OPTIONAL 2889 // This is the case, if called from omp_init_lock_with_hint: 2890 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2891 if (!codeptr) 2892 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2893 if 
(ompt_enabled.ompt_callback_mutex_acquire) { 2894 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2895 ompt_mutex_lock, omp_lock_hint_none, 2896 __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock, 2897 codeptr); 2898 } 2899 #endif 2900 #if KMP_USE_INLINED_TAS 2901 if (tag == locktag_tas && !__kmp_env_consistency_check) { 2902 KMP_TEST_TAS_LOCK(user_lock, gtid, rc); 2903 } else 2904 #elif KMP_USE_INLINED_FUTEX 2905 if (tag == locktag_futex && !__kmp_env_consistency_check) { 2906 KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc); 2907 } else 2908 #endif 2909 { 2910 rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid); 2911 } 2912 if (rc) { 2913 #if USE_ITT_BUILD 2914 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 2915 #endif 2916 #if OMPT_SUPPORT && OMPT_OPTIONAL 2917 if (ompt_enabled.ompt_callback_mutex_acquired) { 2918 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2919 ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr); 2920 } 2921 #endif 2922 return FTN_TRUE; 2923 } else { 2924 #if USE_ITT_BUILD 2925 __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock); 2926 #endif 2927 return FTN_FALSE; 2928 } 2929 2930 #else // KMP_USE_DYNAMIC_LOCK 2931 2932 kmp_user_lock_p lck; 2933 int rc; 2934 2935 if ((__kmp_user_lock_kind == lk_tas) && 2936 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { 2937 lck = (kmp_user_lock_p)user_lock; 2938 } 2939 #if KMP_USE_FUTEX 2940 else if ((__kmp_user_lock_kind == lk_futex) && 2941 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { 2942 lck = (kmp_user_lock_p)user_lock; 2943 } 2944 #endif 2945 else { 2946 lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock"); 2947 } 2948 2949 #if USE_ITT_BUILD 2950 __kmp_itt_lock_acquiring(lck); 2951 #endif /* USE_ITT_BUILD */ 2952 #if OMPT_SUPPORT && OMPT_OPTIONAL 2953 // This is the case, if called from omp_init_lock_with_hint: 2954 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2955 if (!codeptr) 2956 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2957 if 
(ompt_enabled.ompt_callback_mutex_acquire) { 2958 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 2959 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), 2960 (omp_wait_id_t)lck, codeptr); 2961 } 2962 #endif 2963 2964 rc = TEST_LOCK(lck, gtid); 2965 #if USE_ITT_BUILD 2966 if (rc) { 2967 __kmp_itt_lock_acquired(lck); 2968 } else { 2969 __kmp_itt_lock_cancelled(lck); 2970 } 2971 #endif /* USE_ITT_BUILD */ 2972 #if OMPT_SUPPORT && OMPT_OPTIONAL 2973 if (rc && ompt_enabled.ompt_callback_mutex_acquired) { 2974 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 2975 ompt_mutex_lock, (omp_wait_id_t)lck, codeptr); 2976 } 2977 #endif 2978 2979 return (rc ? FTN_TRUE : FTN_FALSE); 2980 2981 /* Can't use serial interval since not block structured */ 2982 2983 #endif // KMP_USE_DYNAMIC_LOCK 2984 } 2985 2986 /* try to acquire the lock */ 2987 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { 2988 #if KMP_USE_DYNAMIC_LOCK 2989 int rc; 2990 #if USE_ITT_BUILD 2991 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); 2992 #endif 2993 #if OMPT_SUPPORT && OMPT_OPTIONAL 2994 // This is the case, if called from omp_init_lock_with_hint: 2995 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2996 if (!codeptr) 2997 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2998 if (ompt_enabled.ompt_callback_mutex_acquire) { 2999 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 3000 ompt_mutex_nest_lock, omp_lock_hint_none, 3001 __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock, 3002 codeptr); 3003 } 3004 #endif 3005 rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid); 3006 #if USE_ITT_BUILD 3007 if (rc) { 3008 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); 3009 } else { 3010 __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock); 3011 } 3012 #endif 3013 #if OMPT_SUPPORT && OMPT_OPTIONAL 3014 if (ompt_enabled.enabled && rc) { 3015 if (rc == 1) { 3016 if (ompt_enabled.ompt_callback_mutex_acquired) { 
3017 // lock_first 3018 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 3019 ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr); 3020 } 3021 } else { 3022 if (ompt_enabled.ompt_callback_nest_lock) { 3023 // lock_next 3024 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 3025 ompt_scope_begin, (omp_wait_id_t)user_lock, codeptr); 3026 } 3027 } 3028 } 3029 #endif 3030 return rc; 3031 3032 #else // KMP_USE_DYNAMIC_LOCK 3033 3034 kmp_user_lock_p lck; 3035 int rc; 3036 3037 if ((__kmp_user_lock_kind == lk_tas) && 3038 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= 3039 OMP_NEST_LOCK_T_SIZE)) { 3040 lck = (kmp_user_lock_p)user_lock; 3041 } 3042 #if KMP_USE_FUTEX 3043 else if ((__kmp_user_lock_kind == lk_futex) && 3044 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= 3045 OMP_NEST_LOCK_T_SIZE)) { 3046 lck = (kmp_user_lock_p)user_lock; 3047 } 3048 #endif 3049 else { 3050 lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock"); 3051 } 3052 3053 #if USE_ITT_BUILD 3054 __kmp_itt_lock_acquiring(lck); 3055 #endif /* USE_ITT_BUILD */ 3056 3057 #if OMPT_SUPPORT && OMPT_OPTIONAL 3058 // This is the case, if called from omp_init_lock_with_hint: 3059 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 3060 if (!codeptr) 3061 codeptr = OMPT_GET_RETURN_ADDRESS(0); 3062 if (ompt_enabled.enabled) && 3063 ompt_enabled.ompt_callback_mutex_acquire) { 3064 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( 3065 ompt_mutex_nest_lock, omp_lock_hint_none, 3066 __ompt_get_mutex_impl_type(), (omp_wait_id_t)lck, codeptr); 3067 } 3068 #endif 3069 3070 rc = TEST_NESTED_LOCK(lck, gtid); 3071 #if USE_ITT_BUILD 3072 if (rc) { 3073 __kmp_itt_lock_acquired(lck); 3074 } else { 3075 __kmp_itt_lock_cancelled(lck); 3076 } 3077 #endif /* USE_ITT_BUILD */ 3078 #if OMPT_SUPPORT && OMPT_OPTIONAL 3079 if (ompt_enabled.enabled && rc) { 3080 if (rc == 1) { 3081 if (ompt_enabled.ompt_callback_mutex_acquired) { 3082 // lock_first 3083 
ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( 3084 ompt_mutex_nest_lock, (omp_wait_id_t)lck, codeptr); 3085 } 3086 } else { 3087 if (ompt_enabled.ompt_callback_nest_lock) { 3088 // lock_next 3089 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( 3090 ompt_mutex_scope_begin, (omp_wait_id_t)lck, codeptr); 3091 } 3092 } 3093 } 3094 #endif 3095 return rc; 3096 3097 /* Can't use serial interval since not block structured */ 3098 3099 #endif // KMP_USE_DYNAMIC_LOCK 3100 } 3101 3102 // Interface to fast scalable reduce methods routines 3103 3104 // keep the selected method in a thread local structure for cross-function 3105 // usage: will be used in __kmpc_end_reduce* functions; 3106 // another solution: to re-determine the method one more time in 3107 // __kmpc_end_reduce* functions (new prototype required then) 3108 // AT: which solution is better? 3109 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod) \ 3110 ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod)) 3111 3112 #define __KMP_GET_REDUCTION_METHOD(gtid) \ 3113 (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) 3114 3115 // description of the packed_reduction_method variable: look at the macros in 3116 // kmp.h 3117 3118 // used in a critical section reduce block 3119 static __forceinline void 3120 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid, 3121 kmp_critical_name *crit) { 3122 3123 // this lock was visible to a customer and to the threading profile tool as a 3124 // serial overhead span (although it's used for an internal purpose only) 3125 // why was it visible in previous implementation? 3126 // should we keep it visible in new reduce block? 3127 kmp_user_lock_p lck; 3128 3129 #if KMP_USE_DYNAMIC_LOCK 3130 3131 kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit; 3132 // Check if it is initialized. 
  // Lazily initialize the lock embedded in `crit` (first thread to get here
  // wins the CAS; losers see a non-zero tag below).
  if (*lk == 0) {
    if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
      KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
                                  KMP_GET_D_TAG(__kmp_user_lock_seq));
    } else {
      __kmp_init_indirect_csptr(crit, loc, global_tid,
                                KMP_GET_I_TAG(__kmp_user_lock_seq));
    }
  }
  // Branch for accessing the actual lock object and set operation. This
  // branching is inevitable since this lock initialization does not follow the
  // normal dispatch path (lock table is not used).
  if (KMP_EXTRACT_D_TAG(lk) != 0) {
    // Direct lock: the lock object lives in `crit` itself.
    lck = (kmp_user_lock_p)lk;
    KMP_DEBUG_ASSERT(lck != NULL);
    if (__kmp_env_consistency_check) {
      __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
    }
    KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
  } else {
    // Indirect lock: `crit` holds a pointer to the lock descriptor.
    kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
    lck = ilk->lock;
    KMP_DEBUG_ASSERT(lck != NULL);
    if (__kmp_env_consistency_check) {
      __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
    }
    KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
  }

#else // KMP_USE_DYNAMIC_LOCK

  // We know that the fast reduction code is only emitted by Intel compilers
  // with 32 byte critical sections. If there isn't enough space, then we
  // have to use a pointer.
  if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
    lck = (kmp_user_lock_p)crit;
  } else {
    lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
  }
  KMP_DEBUG_ASSERT(lck != NULL);

  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_critical, loc, lck);

  __kmp_acquire_user_lock_with_checks(lck, global_tid);

#endif // KMP_USE_DYNAMIC_LOCK
}

// used in a critical section reduce block
// Releases the critical-section lock taken by
// __kmp_enter_critical_section_reduce_block.
static __forceinline void
__kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
                                        kmp_critical_name *crit) {

  kmp_user_lock_p lck;

#if KMP_USE_DYNAMIC_LOCK

  if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
    // Direct lock stored in `crit` itself.
    lck = (kmp_user_lock_p)crit;
    if (__kmp_env_consistency_check)
      __kmp_pop_sync(global_tid, ct_critical, loc);
    KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
  } else {
    // Indirect lock: read the descriptor pointer stored in `crit`.
    kmp_indirect_lock_t *ilk =
        (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
    if (__kmp_env_consistency_check)
      __kmp_pop_sync(global_tid, ct_critical, loc);
    KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
  }

#else // KMP_USE_DYNAMIC_LOCK

  // We know that the fast reduction code is only emitted by Intel compilers
  // with 32 byte critical sections. If there isn't enough space, then we have
  // to use a pointer.
3209 if (__kmp_base_user_lock_size > 32) { 3210 lck = *((kmp_user_lock_p *)crit); 3211 KMP_ASSERT(lck != NULL); 3212 } else { 3213 lck = (kmp_user_lock_p)crit; 3214 } 3215 3216 if (__kmp_env_consistency_check) 3217 __kmp_pop_sync(global_tid, ct_critical, loc); 3218 3219 __kmp_release_user_lock_with_checks(lck, global_tid); 3220 3221 #endif // KMP_USE_DYNAMIC_LOCK 3222 } // __kmp_end_critical_section_reduce_block 3223 3224 #if OMP_40_ENABLED 3225 static __forceinline int 3226 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p, 3227 int *task_state) { 3228 kmp_team_t *team; 3229 3230 // Check if we are inside the teams construct? 3231 if (th->th.th_teams_microtask) { 3232 *team_p = team = th->th.th_team; 3233 if (team->t.t_level == th->th.th_teams_level) { 3234 // This is reduction at teams construct. 3235 KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0 3236 // Let's swap teams temporarily for the reduction. 3237 th->th.th_info.ds.ds_tid = team->t.t_master_tid; 3238 th->th.th_team = team->t.t_parent; 3239 th->th.th_team_nproc = th->th.th_team->t.t_nproc; 3240 th->th.th_task_team = th->th.th_team->t.t_task_team[0]; 3241 *task_state = th->th.th_task_state; 3242 th->th.th_task_state = 0; 3243 3244 return 1; 3245 } 3246 } 3247 return 0; 3248 } 3249 3250 static __forceinline void 3251 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) { 3252 // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction. 3253 th->th.th_info.ds.ds_tid = 0; 3254 th->th.th_team = team; 3255 th->th.th_team_nproc = team->t.t_nproc; 3256 th->th.th_task_team = team->t.t_task_team[task_state]; 3257 th->th.th_task_state = task_state; 3258 } 3259 #endif 3260 3261 /* 2.a.i. Reduce Block without a terminating barrier */ 3262 /*! 
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid global thread number
@param num_vars number of items (variables) to be reduced
@param reduce_size size of data in bytes to be reduced
@param reduce_data pointer to data to be reduced
@param reduce_func callback function providing reduction operation on two
operands and returning result of reduction in lhs_data
@param lck pointer to the unique lock data structure
@result 1 for the master thread, 0 for all other team threads, 2 for all team
threads if atomic reduction needed

The nowait version is used for a reduce clause with the nowait argument.
*/
kmp_int32
__kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
                     size_t reduce_size, void *reduce_data,
                     void (*reduce_func)(void *lhs_data, void *rhs_data),
                     kmp_critical_name *lck) {

  KMP_COUNT_BLOCK(REDUCE_nowait);
  int retval = 0;
  PACKED_REDUCTION_METHOD_T packed_reduction_method;
#if OMP_40_ENABLED
  kmp_info_t *th;
  kmp_team_t *team;
  int teams_swapped = 0, task_state;
#endif
  KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));

  // why do we need this initialization here at all?
  // Reduction clause can not be used as a stand-alone directive.

  // do not call __kmp_serial_initialize(), it will be called by
  // __kmp_parallel_initialize() if needed
  // possible detection of false-positive race by the threadchecker ???
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

// check correctness of reduce block nesting
#if KMP_USE_DYNAMIC_LOCK
  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
#else
  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
#endif

#if OMP_40_ENABLED
  th = __kmp_thread_from_gtid(global_tid);
  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
#endif // OMP_40_ENABLED

  // packed_reduction_method value will be reused by __kmp_end_reduce* function,
  // the value should be kept in a variable
  // the variable should be either a construct-specific or thread-specific
  // property, not a team specific property
  // (a thread can reach the next reduce block on the next construct, reduce
  // method may differ on the next construct)
  // an ident_t "loc" parameter could be used as a construct-specific property
  // (what if loc == 0?)
  // (if both construct-specific and team-specific variables were shared,
  // then unnecessary extra syncs should be needed)
  // a thread-specific variable is better regarding two issues above (next
  // construct and extra syncs)
  // a thread-specific "th_local.reduction_method" variable is used currently
  // each thread executes 'determine' and 'set' lines (no need to execute by one
  // thread, to avoid unnecessary extra syncs)

  packed_reduction_method = __kmp_determine_reduction_method(
      loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
  __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);

  if (packed_reduction_method == critical_reduce_block) {

    __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
    retval = 1;

  } else if (packed_reduction_method == empty_reduce_block) {

    // usage: if team size == 1, no synchronization is required ( Intel
    // platforms only )
    retval = 1;

  } else if (packed_reduction_method == atomic_reduce_block) {

    retval = 2;

    // all threads should do this pop here (because __kmpc_end_reduce_nowait()
    // won't be called by the code gen)
    // (it's not quite good, because the checking block has been closed by
    // this 'pop',
    // but atomic operation has not been executed yet, will be executed
    // slightly later, literally on next instruction)
    if (__kmp_env_consistency_check)
      __kmp_pop_sync(global_tid, ct_reduce, loc);

  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                   tree_reduce_block)) {

    // AT: performance issue: a real barrier here
    // AT: (if master goes slow, other threads are blocked here waiting for the
    // master to come and release them)
    // AT: (it's not what a customer might expect specifying NOWAIT clause)
    // AT: (specifying NOWAIT won't result in improvement of performance, it'll
    // be confusing to a customer)
    // AT: another implementation of *barrier_gather*nowait() (or some other
    // design) might go faster and be more in line with sense of NOWAIT
    // AT: TO DO: do epcc test and compare times

    // this barrier should be invisible to a customer and to the threading
    // profile tool (it's neither a terminating barrier nor customer's code,
    // it's used for an internal purpose)
#if OMPT_SUPPORT
    // JP: can this barrier potentially lead to task scheduling?
    // JP: as long as there is a barrier in the implementation, OMPT should and
    // will provide the barrier events
    // so we set-up the necessary frame/return addresses.
    omp_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame == NULL)
        ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    // __kmp_barrier returns non-zero only for the thread that should finish
    // the reduction, hence the inversion below (1 = master, 0 = workers).
    retval =
        __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
                      global_tid, FALSE, reduce_size, reduce_data, reduce_func);
    retval = (retval != 0) ? (0) : (1);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = NULL;
    }
#endif

    // all other workers except master should do this pop here
    // ( none of other workers will get to __kmpc_end_reduce_nowait() )
    if (__kmp_env_consistency_check) {
      if (retval == 0) {
        __kmp_pop_sync(global_tid, ct_reduce, loc);
      }
    }

  } else {

    // should never reach this block
    KMP_ASSERT(0); // "unexpected method"
  }
#if OMP_40_ENABLED
  if (teams_swapped) {
    __kmp_restore_swapped_teams(th, team, task_state);
  }
#endif
  KA_TRACE(
      10,
      ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
       global_tid, packed_reduction_method, retval));

  return retval;
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid global thread id.
@param lck pointer to the unique lock data structure

Finish the execution of a reduce nowait.
3435 */ 3436 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid, 3437 kmp_critical_name *lck) { 3438 3439 PACKED_REDUCTION_METHOD_T packed_reduction_method; 3440 3441 KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid)); 3442 3443 packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid); 3444 3445 if (packed_reduction_method == critical_reduce_block) { 3446 3447 __kmp_end_critical_section_reduce_block(loc, global_tid, lck); 3448 3449 } else if (packed_reduction_method == empty_reduce_block) { 3450 3451 // usage: if team size == 1, no synchronization is required ( on Intel 3452 // platforms only ) 3453 3454 } else if (packed_reduction_method == atomic_reduce_block) { 3455 3456 // neither master nor other workers should get here 3457 // (code gen does not generate this call in case 2: atomic reduce block) 3458 // actually it's better to remove this elseif at all; 3459 // after removal this value will checked by the 'else' and will assert 3460 3461 } else if (TEST_REDUCTION_METHOD(packed_reduction_method, 3462 tree_reduce_block)) { 3463 3464 // only master gets here 3465 3466 } else { 3467 3468 // should never reach this block 3469 KMP_ASSERT(0); // "unexpected method" 3470 } 3471 3472 if (__kmp_env_consistency_check) 3473 __kmp_pop_sync(global_tid, ct_reduce, loc); 3474 3475 KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n", 3476 global_tid, packed_reduction_method)); 3477 3478 return; 3479 } 3480 3481 /* 2.a.ii. Reduce Block with a terminating barrier */ 3482 3483 /*! 
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid global thread number
@param num_vars number of items (variables) to be reduced
@param reduce_size size of data in bytes to be reduced
@param reduce_data pointer to data to be reduced
@param reduce_func callback function providing reduction operation on two
operands and returning result of reduction in lhs_data
@param lck pointer to the unique lock data structure
@result 1 for the master thread, 0 for all other team threads, 2 for all team
threads if atomic reduction needed

A blocking reduce that includes an implicit barrier.
*/
kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
                        size_t reduce_size, void *reduce_data,
                        void (*reduce_func)(void *lhs_data, void *rhs_data),
                        kmp_critical_name *lck) {
  KMP_COUNT_BLOCK(REDUCE_wait);
  int retval = 0;
  PACKED_REDUCTION_METHOD_T packed_reduction_method;
#if OMP_40_ENABLED
  kmp_info_t *th;
  kmp_team_t *team;
  int teams_swapped = 0, task_state;
#endif

  KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));

  // why do we need this initialization here at all?
  // Reduction clause can not be a stand-alone directive.

  // do not call __kmp_serial_initialize(), it will be called by
  // __kmp_parallel_initialize() if needed
  // possible detection of false-positive race by the threadchecker ???
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

// check correctness of reduce block nesting
#if KMP_USE_DYNAMIC_LOCK
  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
#else
  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
#endif

#if OMP_40_ENABLED
  th = __kmp_thread_from_gtid(global_tid);
  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
#endif // OMP_40_ENABLED

  // Selected method is stored thread-locally so __kmpc_end_reduce() can
  // retrieve it (see the discussion above __kmpc_reduce_nowait()).
  packed_reduction_method = __kmp_determine_reduction_method(
      loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
  __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);

  if (packed_reduction_method == critical_reduce_block) {

    __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
    retval = 1;

  } else if (packed_reduction_method == empty_reduce_block) {

    // usage: if team size == 1, no synchronization is required ( Intel
    // platforms only )
    retval = 1;

  } else if (packed_reduction_method == atomic_reduce_block) {

    retval = 2;

  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                   tree_reduce_block)) {

    // case tree_reduce_block:
    // this barrier should be visible to a customer and to the threading profile
    // tool (it's a terminating barrier on constructs if NOWAIT not specified)
#if OMPT_SUPPORT
    omp_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame == NULL)
        ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident =
        loc; // needed for correct notification of frames
#endif
    // Non-zero from __kmp_barrier marks the thread that completes the
    // reduction; map it to the documented 1(master)/0(workers) result.
    retval =
        __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
                      global_tid, TRUE, reduce_size, reduce_data, reduce_func);
    retval = (retval != 0) ? (0) : (1);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = NULL;
    }
#endif

    // all other workers except master should do this pop here
    // ( none of other workers except master will enter __kmpc_end_reduce() )
    if (__kmp_env_consistency_check) {
      if (retval == 0) { // 0: all other workers; 1: master
        __kmp_pop_sync(global_tid, ct_reduce, loc);
      }
    }

  } else {

    // should never reach this block
    KMP_ASSERT(0); // "unexpected method"
  }
#if OMP_40_ENABLED
  if (teams_swapped) {
    __kmp_restore_swapped_teams(th, team, task_state);
  }
#endif

  KA_TRACE(10,
           ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
            global_tid, packed_reduction_method, retval));

  return retval;
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid global thread id.
@param lck pointer to the unique lock data structure

Finish the execution of a blocking reduce.
The <tt>lck</tt> pointer must be the same as that used in the corresponding
start function.
*/
void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
                       kmp_critical_name *lck) {

  PACKED_REDUCTION_METHOD_T packed_reduction_method;
#if OMP_40_ENABLED
  kmp_info_t *th;
  kmp_team_t *team;
  int teams_swapped = 0, task_state;
#endif

  KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));

#if OMP_40_ENABLED
  th = __kmp_thread_from_gtid(global_tid);
  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
#endif // OMP_40_ENABLED

  // Retrieve the method chosen by the matching __kmpc_reduce() call.
  packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);

  // this barrier should be visible to a customer and to the threading profile
  // tool (it's a terminating barrier on constructs if NOWAIT not specified)

  if (packed_reduction_method == critical_reduce_block) {

    __kmp_end_critical_section_reduce_block(loc, global_tid, lck);

// TODO: implicit barrier: should be exposed
#if OMPT_SUPPORT
    omp_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame == NULL)
        ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = NULL;
    }
#endif

  } else if (packed_reduction_method == empty_reduce_block) {

    // usage: if team size==1, no synchronization is required (Intel platforms only)

// TODO: implicit barrier: should be exposed
#if OMPT_SUPPORT
    omp_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame == NULL)
        ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = NULL;
    }
#endif

  } else if (packed_reduction_method == atomic_reduce_block) {

#if OMPT_SUPPORT
    omp_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame == NULL)
        ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
// TODO: implicit barrier: should be exposed
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = NULL;
    }
#endif

  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                   tree_reduce_block)) {

    // only master executes here (master releases all other workers)
    __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
                            global_tid);

  } else {

    // should never reach this block
    KMP_ASSERT(0); // "unexpected method"
  }
#if OMP_40_ENABLED
  if (teams_swapped) {
    __kmp_restore_swapped_teams(th, team, task_state);
  }
#endif

  if (__kmp_env_consistency_check)
    __kmp_pop_sync(global_tid, ct_reduce, loc);

  KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
                global_tid, packed_reduction_method));

  return;
}

#undef __KMP_GET_REDUCTION_METHOD
#undef
__KMP_SET_REDUCTION_METHOD 3741 3742 /* end of interface to fast scalable reduce routines */ 3743 3744 kmp_uint64 __kmpc_get_taskid() { 3745 3746 kmp_int32 gtid; 3747 kmp_info_t *thread; 3748 3749 gtid = __kmp_get_gtid(); 3750 if (gtid < 0) { 3751 return 0; 3752 } 3753 thread = __kmp_thread_from_gtid(gtid); 3754 return thread->th.th_current_task->td_task_id; 3755 3756 } // __kmpc_get_taskid 3757 3758 kmp_uint64 __kmpc_get_parent_taskid() { 3759 3760 kmp_int32 gtid; 3761 kmp_info_t *thread; 3762 kmp_taskdata_t *parent_task; 3763 3764 gtid = __kmp_get_gtid(); 3765 if (gtid < 0) { 3766 return 0; 3767 } 3768 thread = __kmp_thread_from_gtid(gtid); 3769 parent_task = thread->th.th_current_task->td_parent; 3770 return (parent_task == NULL ? 0 : parent_task->td_task_id); 3771 3772 } // __kmpc_get_parent_taskid 3773 3774 #if OMP_45_ENABLED 3775 /*! 3776 @ingroup WORK_SHARING 3777 @param loc source location information. 3778 @param gtid global thread number. 3779 @param num_dims number of associated doacross loops. 3780 @param dims info on loops bounds. 3781 3782 Initialize doacross loop information. 3783 Expect compiler send us inclusive bounds, 3784 e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2. 
*/
void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
                          const struct kmp_dim *dims) {
  int j, idx;
  kmp_int64 last, trace_count;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_uint32 *flags;
  kmp_disp_t *pr_buf = th->th.th_dispatch;
  dispatch_shared_info_t *sh_buf;

  KA_TRACE(
      20,
      ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
       gtid, num_dims, !team->t.t_serialized));
  KMP_DEBUG_ASSERT(dims != NULL);
  KMP_DEBUG_ASSERT(num_dims > 0);

  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
    return; // no dependencies if team is serialized
  }
  KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
  idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
  // the next loop
  sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];

  // Save bounds info into allocated private buffer
  // Layout: [0]=num_dims, [1]=&num_done, then per dimension
  // (range/lo/up/st), except dim 0 which stores lo/up/st only.
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
  pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
      th, sizeof(kmp_int64) * (4 * num_dims + 1));
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
  pr_buf->th_doacross_info[0] =
      (kmp_int64)num_dims; // first element is number of dimensions
  // Save also address of num_done in order to access it later without knowing
  // the buffer index
  pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
  pr_buf->th_doacross_info[2] = dims[0].lo;
  pr_buf->th_doacross_info[3] = dims[0].up;
  pr_buf->th_doacross_info[4] = dims[0].st;
  last = 5;
  for (j = 1; j < num_dims; ++j) {
    kmp_int64
        range_length; // To keep ranges of all dimensions but the first dims[0]
    if (dims[j].st == 1) { // most common case
      // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
      range_length = dims[j].up - dims[j].lo + 1;
    } else {
      if (dims[j].st > 0) {
        KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
        range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
      } else { // negative increment
        KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
        range_length =
            (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
      }
    }
    pr_buf->th_doacross_info[last++] = range_length;
    pr_buf->th_doacross_info[last++] = dims[j].lo;
    pr_buf->th_doacross_info[last++] = dims[j].up;
    pr_buf->th_doacross_info[last++] = dims[j].st;
  }

  // Compute total trip count.
  // Start with range of dims[0] which we don't need to keep in the buffer.
  if (dims[0].st == 1) { // most common case
    trace_count = dims[0].up - dims[0].lo + 1;
  } else if (dims[0].st > 0) {
    KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
    trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
  } else { // negative increment
    KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
    trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
  }
  for (j = 1; j < num_dims; ++j) {
    trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
  }
  KMP_DEBUG_ASSERT(trace_count > 0);

  // Check if shared buffer is not occupied by other loop (idx -
  // __kmp_dispatch_num_buffers)
  if (idx != sh_buf->doacross_buf_idx) {
    // Shared buffer is occupied, wait for it to be free
    __kmp_wait_yield_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
                       __kmp_eq_4, NULL);
  }
#if KMP_32_BIT_ARCH
  // Check if we are the first thread. After the CAS the first thread gets 0,
  // others get 1 if initialization is in progress, allocated pointer otherwise.
  // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
      (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
#else
  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
      (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
#endif
  if (flags == NULL) {
    // we are the first thread, allocate the array of flags
    size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
    flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
    KMP_MB();
    sh_buf->doacross_flags = flags;
  } else if (flags == (kmp_uint32 *)1) {
#if KMP_32_BIT_ARCH
    // initialization is still in progress, need to wait
    while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
#else
    while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
#endif
      KMP_YIELD(TRUE);
    KMP_MB();
  } else {
    KMP_MB();
  }
  KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
  pr_buf->th_doacross_flags =
      sh_buf->doacross_flags; // save private copy in order to not
  // touch shared buffer on each iteration
  KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
}

// Block until the doacross iteration identified by `vec` (one value per
// dimension) has been flagged finished by __kmpc_doacross_post.
void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
  kmp_int32 shft, num_dims, i;
  kmp_uint32 flag;
  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_disp_t *pr_buf;
  kmp_int64 lo, up, st;

  KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
    return; // no dependencies if team is serialized
  }

  // calculate sequential iteration number and check out-of-bounds condition
  pr_buf = th->th.th_dispatch;
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
  num_dims =
pr_buf->th_doacross_info[0]; 3925 lo = pr_buf->th_doacross_info[2]; 3926 up = pr_buf->th_doacross_info[3]; 3927 st = pr_buf->th_doacross_info[4]; 3928 if (st == 1) { // most common case 3929 if (vec[0] < lo || vec[0] > up) { 3930 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " 3931 "bounds [%lld,%lld]\n", 3932 gtid, vec[0], lo, up)); 3933 return; 3934 } 3935 iter_number = vec[0] - lo; 3936 } else if (st > 0) { 3937 if (vec[0] < lo || vec[0] > up) { 3938 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " 3939 "bounds [%lld,%lld]\n", 3940 gtid, vec[0], lo, up)); 3941 return; 3942 } 3943 iter_number = (kmp_uint64)(vec[0] - lo) / st; 3944 } else { // negative increment 3945 if (vec[0] > lo || vec[0] < up) { 3946 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " 3947 "bounds [%lld,%lld]\n", 3948 gtid, vec[0], lo, up)); 3949 return; 3950 } 3951 iter_number = (kmp_uint64)(lo - vec[0]) / (-st); 3952 } 3953 for (i = 1; i < num_dims; ++i) { 3954 kmp_int64 iter, ln; 3955 kmp_int32 j = i * 4; 3956 ln = pr_buf->th_doacross_info[j + 1]; 3957 lo = pr_buf->th_doacross_info[j + 2]; 3958 up = pr_buf->th_doacross_info[j + 3]; 3959 st = pr_buf->th_doacross_info[j + 4]; 3960 if (st == 1) { 3961 if (vec[i] < lo || vec[i] > up) { 3962 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " 3963 "bounds [%lld,%lld]\n", 3964 gtid, vec[i], lo, up)); 3965 return; 3966 } 3967 iter = vec[i] - lo; 3968 } else if (st > 0) { 3969 if (vec[i] < lo || vec[i] > up) { 3970 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " 3971 "bounds [%lld,%lld]\n", 3972 gtid, vec[i], lo, up)); 3973 return; 3974 } 3975 iter = (kmp_uint64)(vec[i] - lo) / st; 3976 } else { // st < 0 3977 if (vec[i] > lo || vec[i] < up) { 3978 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " 3979 "bounds [%lld,%lld]\n", 3980 gtid, vec[i], lo, up)); 3981 return; 3982 } 3983 iter = (kmp_uint64)(lo - vec[i]) / (-st); 
3984 } 3985 iter_number = iter + ln * iter_number; 3986 } 3987 shft = iter_number % 32; // use 32-bit granularity 3988 iter_number >>= 5; // divided by 32 3989 flag = 1 << shft; 3990 while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) { 3991 KMP_YIELD(TRUE); 3992 } 3993 KMP_MB(); 3994 KA_TRACE(20, 3995 ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n", 3996 gtid, (iter_number << 5) + shft)); 3997 } 3998 3999 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) { 4000 kmp_int32 shft, num_dims, i; 4001 kmp_uint32 flag; 4002 kmp_int64 iter_number; // iteration number of "collapsed" loop nest 4003 kmp_info_t *th = __kmp_threads[gtid]; 4004 kmp_team_t *team = th->th.th_team; 4005 kmp_disp_t *pr_buf; 4006 kmp_int64 lo, st; 4007 4008 KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid)); 4009 if (team->t.t_serialized) { 4010 KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n")); 4011 return; // no dependencies if team is serialized 4012 } 4013 4014 // calculate sequential iteration number (same as in "wait" but no 4015 // out-of-bounds checks) 4016 pr_buf = th->th.th_dispatch; 4017 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL); 4018 num_dims = pr_buf->th_doacross_info[0]; 4019 lo = pr_buf->th_doacross_info[2]; 4020 st = pr_buf->th_doacross_info[4]; 4021 if (st == 1) { // most common case 4022 iter_number = vec[0] - lo; 4023 } else if (st > 0) { 4024 iter_number = (kmp_uint64)(vec[0] - lo) / st; 4025 } else { // negative increment 4026 iter_number = (kmp_uint64)(lo - vec[0]) / (-st); 4027 } 4028 for (i = 1; i < num_dims; ++i) { 4029 kmp_int64 iter, ln; 4030 kmp_int32 j = i * 4; 4031 ln = pr_buf->th_doacross_info[j + 1]; 4032 lo = pr_buf->th_doacross_info[j + 2]; 4033 st = pr_buf->th_doacross_info[j + 4]; 4034 if (st == 1) { 4035 iter = vec[i] - lo; 4036 } else if (st > 0) { 4037 iter = (kmp_uint64)(vec[i] - lo) / st; 4038 } else { // st < 0 4039 iter = (kmp_uint64)(lo - vec[i]) / (-st); 4040 } 4041 
iter_number = iter + ln * iter_number; 4042 } 4043 shft = iter_number % 32; // use 32-bit granularity 4044 iter_number >>= 5; // divided by 32 4045 flag = 1 << shft; 4046 KMP_MB(); 4047 if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) 4048 KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag); 4049 KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid, 4050 (iter_number << 5) + shft)); 4051 } 4052 4053 void __kmpc_doacross_fini(ident_t *loc, int gtid) { 4054 kmp_int32 num_done; 4055 kmp_info_t *th = __kmp_threads[gtid]; 4056 kmp_team_t *team = th->th.th_team; 4057 kmp_disp_t *pr_buf = th->th.th_dispatch; 4058 4059 KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid)); 4060 if (team->t.t_serialized) { 4061 KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team)); 4062 return; // nothing to do 4063 } 4064 num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1; 4065 if (num_done == th->th.th_team_nproc) { 4066 // we are the last thread, need to free shared resources 4067 int idx = pr_buf->th_doacross_buf_idx - 1; 4068 dispatch_shared_info_t *sh_buf = 4069 &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers]; 4070 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] == 4071 (kmp_int64)&sh_buf->doacross_num_done); 4072 KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done); 4073 KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx); 4074 __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags)); 4075 sh_buf->doacross_flags = NULL; 4076 sh_buf->doacross_num_done = 0; 4077 sh_buf->doacross_buf_idx += 4078 __kmp_dispatch_num_buffers; // free buffer for future re-use 4079 } 4080 // free private resources (need to keep buffer index forever) 4081 pr_buf->th_doacross_flags = NULL; 4082 __kmp_thread_free(th, (void *)pr_buf->th_doacross_info); 4083 pr_buf->th_doacross_info = NULL; 4084 KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid)); 4085 } 4086 #endif 4087 4088 #if 
OMP_50_ENABLED 4089 int __kmpc_get_target_offload(void) { 4090 if (!__kmp_init_serial) { 4091 __kmp_serial_initialize(); 4092 } 4093 return __kmp_target_offload; 4094 } 4095 #endif // OMP_50_ENABLED 4096 4097 // end of file // 4098